Merge pull request #3 from CCBR/fix-combine-homer

fix: ignore "#" comment lines in peaks files
CCBR · Jun 11, 2024 · 48431e6 · 48431e6
2 parents 0ad396b + 338b23c
commit 48431e6
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 34 deletions.
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -160,7 +160,7 @@ def get_enrich(wildcards):
 def get_combined(wildcards):
     files = []
     if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE):    
-        combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M)
+        combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M)
         files.extend(combined_m)
     return files
 

diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk
@@ -51,7 +51,7 @@ rule findMotif:
         else
             annotatePeaks.pl {input.peak_file} {params.genome} -annStats {output.annotation_summary} > {output.annotation}
         fi
-        
+
         # hs1 is not part of HOMER's config genome db. Must add it as a separate param
         if [[ {params.genome} == "hs1" ]]; then
             findMotifsGenome.pl {input.peak_file} {params.fa} {params.outDir} -size {params.motif_size} -p {threads} -preparsedDir {params.preparsedDir}
@@ -67,13 +67,13 @@ def get_annotation_files(wildcards):
     return expand(join(RESULTSDIR,"peaks", wildcards.qthresholds, wildcards.peak_caller, "annotation","homer",
            "{treatment_control_list}" + "." + wildcards.dupstatus + "." + wildcards.peak_caller_type + ".annotation.summary"),
            treatment_control_list = TREATMENT_LIST_M if wildcards.peak_caller.startswith('macs2') else TREATMENT_LIST_SG)
-    
+
 
 rule homer_enrich:
     """
     Plot enrichment over genic features
     """
-    input: 
+    input:
         annotation_summary=get_annotation_files
     output:
         enrich_png=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png")
@@ -94,23 +94,24 @@ rule combine_homer:
     """
     input:
         annotation=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation.txt"),
-        xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls")
+        peaks_file=join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls")
     output:
-        combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx")
+        combined_tsv=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv"),
+        combined_xlsx=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx")
     container: config['containers']['carlisle_r']
     params:
         rscript=join(SCRIPTSDIR,"_combine_macs2_homer.R")
     shell:
         """
-        Rscript {params.rscript} {input.xls_file} {input.annotation} {output.combined}
+        Rscript {params.rscript} {input.peaks_file} {input.annotation} {output.combined}
         """
 
 rule rose:
     """
-    Developed from code: 
+    Developed from code:
     https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/rules/pipeline.chipseq.smk
-    outdated verison of rose: https://github.com/younglab/ROSE
-            
+    outdated version of rose: https://github.com/younglab/ROSE
+
     # SEACR bed file output format
     <1>     <2>     <3>     <4>             <5>             <6>
     <chr>   <start> <end>   <total signal>  <max signal>	<max signal region>
@@ -124,8 +125,8 @@ rule rose:
     https://github.com/macs3-project/MACS/blob/master/docs/callpeak.md
     <1>     <2>     <3>     <4>     <5>                             <6>     <7>             <8>             <9>             <10>
     <chr>   <start> <end>   <name>  <integer score for display>     <empty> <fold-change> <-log10pvalue>    <-log10qvalue>  <relative summit position to peak start>
-    ## integer score for display: It's calculated as int(-10*log10pvalue) or int(-10*log10qvalue) depending on whether -p (pvalue) or -q (qvalue) is used as score cutoff. 
-    ### Please note that currently this value might be out of the [0-1000] range defined in UCSC ENCODE narrowPeak format. You can let the value saturated at 1000 (i.e. p/q-value = 10^-100) 
+    ## integer score for display: It's calculated as int(-10*log10pvalue) or int(-10*log10qvalue) depending on whether -p (pvalue) or -q (qvalue) is used as score cutoff.
+    ### Please note that currently this value might be out of the [0-1000] range defined in UCSC ENCODE narrowPeak format. You can let the value saturated at 1000 (i.e. p/q-value = 10^-100)
     ### by using the following 1-liner awk: awk -v OFS="\t" '{$5=$5>1000?1000:$5} {print}' NAME_peaks.narrowPeak
     ## broadPeak does not have 10th column
     ### Since in the broad peak calling mode, the peak summit won't be called, the values in the 5th, and 7-9th columns are the mean value across all positions in the peak region
@@ -177,20 +178,20 @@ rule rose:
         """
         # set tmp
         set -exo pipefail
-        if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then 
+        if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then
             TMPDIR="/lscratch/$SLURM_JOB_ID"
         else
             dirname=$(basename $(mktemp))
             TMPDIR="/dev/shm/$dirname"
             mkdir -p $TMPDIR
         fi
-                
+
         # set ROSE specific paths
         PATHTO=/usr/local/apps/ROSE/1.3.1
         PYTHONPATH=/usr/local/apps/ROSE/1.3.1/src/lib
         export PYTHONPATH
         export PATH=$PATH:$PATHTO/bin
-                
+
         # pull treatment and control ids
         treatment=`echo {params.tc_file} | awk -F"_vs_" '{{print $1}}'`
         control=`echo {params.tc_file} | awk -F"_vs_" '{{print $2}}'`
@@ -204,11 +205,11 @@ rule rose:
         samtools view -b ${{treat_bam}} {params.regions} > $TMPDIR/subset.bam
         samtools index $TMPDIR/subset.bam
         grep -v "NC_" {input.peak_file} > $TMPDIR/subset.bed
-                      
+
         # prep for ROSE
         # macs2 output is prepared for ROSE formatting
         # seacr and gopeaks must be edited to correct for formatting
-        
+
         ## correct GOPEAKS
         ### original: <chr>   <start> <end>
         ### output: <chr>   <start> <end> <$sampleid_uniquenumber> <0> <.>
@@ -219,7 +220,7 @@ rule rose:
             nl --number-format=rz --number-width=3 $TMPDIR/subset.bed | awk -v sample_id="${{treatment}}_" \'{{print sample_id$1"\\t0\\t."}}\' > $TMPDIR/col.txt
             paste -d "\t" $TMPDIR/save.bed $TMPDIR/col.txt > $TMPDIR/subset.bed
         fi
-        
+
         ## correct SEACR
         ### original: <chr>   <start> <end>   <total signal>  <max signal>	<max signal region>
         ### output: <chr>   <start> <end> <$sampleid_uniquenumber> <total signal> <.>
@@ -241,11 +242,11 @@ rule rose:
         if [[ ${{num_of_peaks}} -gt 5 ]]; then
             echo "## More than 5 usable peaks detected ${{num_of_peaks}} - Running rose"
             cd {params.workdir}
-            
+
             # if macs2 control is off, there will be no macs2 control to annotate
             if [[ {params.control_flag} == "N" ]] & [[ {params.peak_caller_type} == "macs2_narrow" ]] || [[ {params.peak_caller_type} == "macs2_broad" ]]; then
                 echo "#### No control was used"
-                rose_files="$TMPDIR/subset.bam"  
+                rose_files="$TMPDIR/subset.bam"
             else
                 echo "#### A control used ${{cntrl_bam}}"
                 rose_files="$TMPDIR/subset.bam ${{cntrl_bam}}"
@@ -258,7 +259,7 @@ rule rose:
                     -r ${{rose_files}} \
                     -t {params.tss_distance} \
                     -s {params.stitch_distance} \
-                    -o {params.file_base}                        
+                    -o {params.file_base}
             else
                 ROSE_main.py \
                     -i {output.no_tss_bed} \
@@ -268,16 +269,16 @@ rule rose:
                     -s {params.stitch_distance} \
                     -o {params.file_base}
             fi
-                        
+
             # rose to bed file
             echo "## Convert bed"
             # developed from https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/scripts/roseTable2Bed.sh
             grep -v "^[#|REGION]" {output.all} | awk -v OFS="\\t" -F"\\t" \'$NF==0 {{for(i=2; i<=NF; i++){{printf $i; printf (i<NF?"\\t":"\\n")}}}}\' > $TMPDIR/regular
             bedtools sort -i $TMPDIR/regular > {output.regular}
-                        
+
             grep -v "^[#|REGION]" {output.all} | awk -v OFS="\\t" -F"\\t" \'$NF==1 {{for(i=2; i<=NF; i++){{printf $i; printf (i<NF?"\\t":"\\n")}}}}\' > $TMPDIR/super
             bedtools sort -i $TMPDIR/super > {output.super}
-                        
+
             # cut rose output files, create summits
             echo "## Cut and summit"
             cut -f1-3 {output.regular} > {output.regular_great}
@@ -348,7 +349,7 @@ if config["run_contrasts"]:
             sample_list=`awk '{{print $3}}' {input.contrast_file}`
             clean_sample_list=`echo $sample_list | sed "s/\s/xxx/g"`
 
-            # rum script       
+            # rum script
             Rscript {params.rscript_wrapper} \\
                 --rmd {params.rmd} \\
                 --carlisle_functions {params.carlisle_functions} \\

diff --git a/workflow/scripts/_combine_macs2_homer.R b/workflow/scripts/_combine_macs2_homer.R
@@ -1,19 +1,27 @@
 # Add peak statistics to HOMER annotations file
 
-library(openxlsx)
+library(readr)
+library(dplyr)
+library(tidyr)
 
-args = commandArgs(trailingOnly=TRUE)
+args <- commandArgs(trailingOnly = TRUE)
 
-peaks.file = args[1]
-homer.file = args[2]
-combined.file = args[3]
+peaks_file <- args[1]
+homer_file <- args[2]
+combined_tsv <- args[3]
+combined_xlsx <- args[4]
 
 # Load peak file
-peaks = read.table(peaks.file,skip = 23,header=TRUE,sep='\t',check.names = FALSE)
+# the file extension is '.xls' but it's actually just a tsv file
+peaks <- read_tsv(peaks_file, comment = "#") %>%
+  select(c("name", "fold_enrichment", "-log10(qvalue)"))
 
 # Load HOMER annotations
-homer = read.table(homer.file,header=TRUE,sep='\t',check.names=FALSE,comment.char="",quote="")
+homer <- read_tsv(homer_file) %>%
+  # rename first column to match peaks file
+  rename(name = 1)
 
 # Combine tables and write out
-combined = merge(homer,peaks[,c("name","fold_enrichment","-log10(qvalue)")],by.x=1,by.y="name")
-write.xlsx(combined,file=combined.file)
+combined <- full_join(homer, peaks, by = "name")
+write_tsv(combined, file = combined_tsv)
+openxlsx::write.xlsx(combined, file = combined_xlsx)