Skip to content

Commit

Permalink
Merge pull request #127 from CCRGeneticsBranch/no_dedup
Browse files Browse the repository at this point in the history
Fixed bugs related to library normalization with no_dedup, replicate formatting, and DESeq
  • Loading branch information
kelly-sovacool authored Apr 18, 2024
2 parents 4f89fed + 464bbed commit bb5c9ec
Show file tree
Hide file tree
Showing 10 changed files with 94 additions and 99 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
## CARLISLE development version
- Bug fixes (#127, @epehrsson)
- Removes single-sample group check for DESeq.
- Increases memory for DESeq.
- Ensures control replicate number is an integer.
- Fixes FDR cutoff misassigned to log2FC cutoff.
- Fixes `no_dedup` variable names in library normalization scripts.

## CARLISLE v2.5.0
- Refactors R packages to a common source location (#118, @slsevilla)
Expand Down
2 changes: 2 additions & 0 deletions docs/user-guide/preparing-files.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,5 @@ An example contrast file:
| condition1 | condition2 |
| --- | --- |
| MOC1_siSmyd3_2m_25_HCHO | MOC1_siNC_2m_25_HCHO |

**Note**: you must have more than one sample per condition in order to perform differential analysis with DESeq2.
5 changes: 5 additions & 0 deletions resources/cluster_biowulf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,8 @@ deeptools_heatmap:
mem: 30g
time: 01-00:00:00
###################################################################
DESeq:
mem: 80g
time: 00-02:00:00
threads: 2
###################################################################
151 changes: 66 additions & 85 deletions workflow/rules/diff.smk
Original file line number Diff line number Diff line change
Expand Up @@ -194,95 +194,76 @@ rule DESeq:
mkdir -p ${{TMPDIR_AUC}}/knit_root_dir
cd $TMPDIR_AUC
# if there is only one sample per group, DESEQ2 is not the tool to use
    # check the sampleinfo file to determine how many samples are in each group. If any sample group has a count of 1,
# create empty files with message that the file cannot be created due to conditions provided
search_list=`awk '{{print $2}}' {input.si} | grep -v "group" | sort | uniq -c | sort -nr | grep -o 1`
check="1"
_contains () {{ # Check if space-separated list $1 contains line $2
echo "$1" | tr ' ' '\\n' | grep -F -x -q "$2"
}}
if _contains "${{check}}" "${{search_list}}"; then
echo "There is only one sample per group, which is not allowed in DESEQ2"
touch {output.results_auc}
touch {output.html_auc}
touch {output.elbowlimits_auc}
touch {output.results_frag}
touch {output.html_frag}
touch {output.elbowlimits_frag}
touch {output.pdf}
else
Rscript {params.rscript_diff} \\
--rmd {params.rmd} \\
--carlisle_functions {params.carlisle_functions} \\
--Rlib_dir {params.Rlib_dir} \\
--Rpkg_config {params.Rpkg_config} \\
--countsmatrix {input.cm_auc} \\
--sampleinfo {input.si} \\
--dupstatus {params.dupstatus} \\
--condition1 ${{condition1}} \\
--condition2 ${{condition2}} \\
--fdr_cutoff {params.fdr_cutoff} \\
--log2fc_cutoff {params.log2fc_cutoff} \\
--results {output.results_auc} \\
--report {output.html_auc} \\
--elbowlimits {output.elbowlimits_auc} \\
--spiked {params.spiked} \\
--rawcountsprescaled \\
--scalesfbymean \\
--contrast_data {input.contrast_data} \\
--tmpdir $TMPDIR_AUC \\
--species {params.species} \\
--gtf {params.gtf}
Rscript {params.rscript_diff} \\
--rmd {params.rmd} \\
--carlisle_functions {params.carlisle_functions} \\
--Rlib_dir {params.Rlib_dir} \\
--Rpkg_config {params.Rpkg_config} \\
--countsmatrix {input.cm_auc} \\
--sampleinfo {input.si} \\
--dupstatus {params.dupstatus} \\
--condition1 ${{condition1}} \\
--condition2 ${{condition2}} \\
--fdr_cutoff {params.fdr_cutoff} \\
--log2fc_cutoff {params.log2fc_cutoff} \\
--results {output.results_auc} \\
--report {output.html_auc} \\
--elbowlimits {output.elbowlimits_auc} \\
--spiked {params.spiked} \\
--rawcountsprescaled \\
--scalesfbymean \\
--contrast_data {input.contrast_data} \\
--tmpdir $TMPDIR_AUC \\
--species {params.species} \\
--gtf {params.gtf}
# change elbow limits to provided log2fc if limit is set to .na.real
sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_auc}
sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_auc}
# change elbow limits to provided log2fc if limit is set to .na.real
sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_auc}
sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_auc}
## Run FRAG method
mkdir -p ${{TMPDIR_FRAG}}
mkdir -p ${{TMPDIR_FRAG}}/intermediates_dir
mkdir -p ${{TMPDIR_FRAG}}/knit_root_dir
cd $TMPDIR_FRAG
## Run FRAG method
mkdir -p ${{TMPDIR_FRAG}}
mkdir -p ${{TMPDIR_FRAG}}/intermediates_dir
mkdir -p ${{TMPDIR_FRAG}}/knit_root_dir
cd $TMPDIR_FRAG
# Do not use --rawcountsprescaled as these counts are not prescaled!
Rscript {params.rscript_diff} \\
--rmd {params.rmd} \\
--carlisle_functions {params.carlisle_functions} \\
--Rlib_dir {params.Rlib_dir} \\
--Rpkg_config {params.Rpkg_config} \\
--countsmatrix {input.cm_frag} \\
--sampleinfo {input.si} \\
--dupstatus {params.dupstatus} \\
--condition1 ${{condition1}} \\
--condition2 ${{condition2}} \\
--fdr_cutoff {params.fdr_cutoff} \\
--log2fc_cutoff {params.log2fc_cutoff} \\
--results {output.results_frag} \\
--report {output.html_frag} \\
--elbowlimits {output.elbowlimits_frag} \\
--spiked {params.spiked} \\
--scalesfbymean \\
--contrast_data {input.contrast_data} \\
--tmpdir $TMPDIR_FRAG \\
--species {params.species} \\
--gtf {params.gtf}
# Do not use --rawcountsprescaled as these counts are not prescaled!
Rscript {params.rscript_diff} \\
--rmd {params.rmd} \\
--carlisle_functions {params.carlisle_functions} \\
--Rlib_dir {params.Rlib_dir} \\
--Rpkg_config {params.Rpkg_config} \\
--countsmatrix {input.cm_frag} \\
--sampleinfo {input.si} \\
--dupstatus {params.dupstatus} \\
--condition1 ${{condition1}} \\
--condition2 ${{condition2}} \\
--fdr_cutoff {params.fdr_cutoff} \\
--log2fc_cutoff {params.log2fc_cutoff} \\
--results {output.results_frag} \\
--report {output.html_frag} \\
--elbowlimits {output.elbowlimits_frag} \\
--spiked {params.spiked} \\
--scalesfbymean \\
--contrast_data {input.contrast_data} \\
--tmpdir $TMPDIR_FRAG \\
--species {params.species} \\
--gtf {params.gtf}
# change elbow limits to provided log2fc if limit is set to .na.real
sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_frag}
sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_frag}
# change elbow limits to provided log2fc if limit is set to .na.real
sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_frag}
sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_frag}
## Run venns
mkdir -p ${{TMPDIR_VENN}}
mkdir -p ${{TMPDIR_VENN}}/intermediates_dir
mkdir -p ${{TMPDIR_VENN}}/knit_root_dir
cd $TMPDIR_VENN
Rscript {params.rscript_venn} \\
--aucresults {output.results_auc} \\
--fragmentsresults {output.results_frag} \\
--pdf {output.pdf} \\
--title "${{condition1}}_vs_${{condition2}}__{params.dupstatus}__{params.peak_caller_type}"
fi
## Run venns
mkdir -p ${{TMPDIR_VENN}}
mkdir -p ${{TMPDIR_VENN}}/intermediates_dir
mkdir -p ${{TMPDIR_VENN}}/knit_root_dir
cd $TMPDIR_VENN
Rscript {params.rscript_venn} \\
--aucresults {output.results_auc} \\
--fragmentsresults {output.results_frag} \\
--pdf {output.pdf} \\
--title "${{condition1}}_vs_${{condition2}}__{params.dupstatus}__{params.peak_caller_type}"
"""

rule diffbb:
Expand Down Expand Up @@ -344,4 +325,4 @@ rule diffbb:
bedToBigBed -type=bed9 {output.fbed} {input.genome_len} {output.fbigbed}
fi
"""
"""
6 changes: 3 additions & 3 deletions workflow/rules/init.smk
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,10 @@ TREATMENT_WITHOUTCONTROL_LIST=[]
TREAT_to_CONTRL_DICT=dict()
for i,t in enumerate(list(df[df['isControl']=="N"]['replicateName'].unique())):
crow=df[df['replicateName']==t].iloc[0]
c=crow.controlName+"_"+str(crow.controlReplicateNumber)
c=crow.controlName+"_"+str(int(crow.controlReplicateNumber))
if not c in REPLICATES:
print("# Control NOT found for sampleName_replicateNumber:"+t)
print("# "+config["samplemanifest"]+" has no entry for sample:"+crow.controlName+" replicateNumber:"+crow.controlReplicateNumber)
print("# "+config["samplemanifest"]+" has no entry for sample:"+crow.controlName+" replicateNumber:"+str(crow.controlReplicateNumber))
exit()
print("## "+str(i+1)+") "+t+" "+c)
process_replicates.extend([t,c])
Expand Down Expand Up @@ -453,4 +453,4 @@ rule create_reference:
mv $(dirname {output.bt2})/tmp/ref.yaml {output.refjson}
fi
"""
"""
2 changes: 1 addition & 1 deletion workflow/scripts/_diff_markdown.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -394,4 +394,4 @@ EnhancedVolcano(results_df,

```{r resultstable,echo=FALSE,include=TRUE}
DT::datatable(results_df)
```
```
4 changes: 2 additions & 2 deletions workflow/scripts/_diff_markdown_wrapper.R
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ if (debug){
condition1=args$condition1
condition2=args$condition2
fdr_cutoff=args$fdr_cutoff
log2fc_cutoff=args$fdr_cutoff
log2fc_cutoff=args$log2fc_cutoff
indexcols="peakID"
results=args$results
report=args$report
Expand Down Expand Up @@ -128,4 +128,4 @@ rmarkdown::render(args$rmd,
params=parameters,
output_file = report,
intermediates_dir = paste(tmpdir,"intermediates_dir",sep="/"),
knit_root_dir = paste(tmpdir,"knit_root_dir",sep="/"))
knit_root_dir = paste(tmpdir,"knit_root_dir",sep="/"))
6 changes: 3 additions & 3 deletions workflow/scripts/_get_nreads_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
stats["raw_nreads_genome"] = sum(list(raw_idxstats[raw_idxstats[0].isin(list(genomelen[0]))][2]))
stats["raw_nreads_spikein"] = sum(list(raw_idxstats[raw_idxstats[0].isin(list(spikeinlen[0]))][2]))

stats["nodedup_nreads_genome"] = sum(list(nodedup_idxstats[nodedup_idxstats[0].isin(list(genomelen[0]))][2]))
stats["nodedup_nreads_spikein"] = sum(list(nodedup_idxstats[nodedup_idxstats[0].isin(list(spikeinlen[0]))][2]))
stats["no_dedup_nreads_genome"] = sum(list(nodedup_idxstats[nodedup_idxstats[0].isin(list(genomelen[0]))][2]))
stats["no_dedup_nreads_spikein"] = sum(list(nodedup_idxstats[nodedup_idxstats[0].isin(list(spikeinlen[0]))][2]))

stats["dedup_nreads_genome"] = sum(list(dedup_idxstats[dedup_idxstats[0].isin(list(genomelen[0]))][2]))
stats["dedup_nreads_spikein"] = sum(list(dedup_idxstats[dedup_idxstats[0].isin(list(spikeinlen[0]))][2]))
Expand All @@ -41,4 +41,4 @@
stats["nreads"] = int(npairs*2)

with open(sys.argv[7], 'w') as file:
dumped = yaml.dump(stats, file)
dumped = yaml.dump(stats, file)
4 changes: 2 additions & 2 deletions workflow/scripts/_make_alignment_stats_table.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ column_order = c("sample_name",
"scaling_factor",
"duplication_rate_genome",
"duplication_rate_spikein",
"nodedup_nreads_genome",
"nodedup_nreads_spikein",
"no_dedup_nreads_genome",
"no_dedup_nreads_spikein",
"dedup_nreads_genome",
"dedup_nreads_spikein")

Expand Down
7 changes: 4 additions & 3 deletions workflow/scripts/_make_library_norm_table.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ for ( f in files ){

# run once for dedup, once for nondedup
final_df=data.frame()
for (dedup_type in c("dedup_nreads_genome","nodedup_nreads_genome")){
for (dedup_type in c("dedup_nreads_genome","no_dedup_nreads_genome")){
# determine scaling factor dependent on library size
col_median=median(df[,dedup_type])
if (col_median>1000000000){
Expand All @@ -70,10 +70,11 @@ for (dedup_type in c("dedup_nreads_genome","nodedup_nreads_genome")){
df$library_size=df[,dedup_type]/lib_factor
df$sampleid=rownames(df)
df$dedup_type=strsplit(dedup_type,"_")[[1]][1]

df$dedup_type=gsub("no","no_dedup",df$dedup_type)

# create final df
select_cols=c("sampleid","library_size","dedup_type")
final_df=rbind(final_df,df[,select_cols])
}

write.table(final_df,out_table,row.names = FALSE)
write.table(final_df,out_table,row.names = FALSE)

0 comments on commit bb5c9ec

Please sign in to comment.