From 6bd0305c5fc12e92a562ef1e686111d5e979bbd2 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Sat, 20 Jan 2024 10:45:41 -0800 Subject: [PATCH 1/5] Move MarkDuplicates to the shared GATK.nf MarkDuplicates was shared between the RNAseq and LongRead pipelines. Subsequent changes will add the MarkDuplicates step to the DNA pipeline. --- modules/local/GATK.nf | 30 ++++++++++++++++++ modules/local/RNAseq.nf | 31 ------------------- .../local/long_read_variant_calling/main.nf | 7 ++--- .../local/rna_variant_calling/main.nf | 2 +- 4 files changed, 33 insertions(+), 37 deletions(-) diff --git a/modules/local/GATK.nf b/modules/local/GATK.nf index 0bb03b8..fdc980b 100644 --- a/modules/local/GATK.nf +++ b/modules/local/GATK.nf @@ -91,6 +91,36 @@ process CreateSequenceDictionary { """ } +process MarkDuplicates { + tag "$i_readname" + label 'gatk' + publishDir "${params.outdir}/03_PrepGATK" + + input: // [readgroup, unmapped reads, mapped reads] + tuple val(i_readname), path(merge_bam), path(merge_bai) + + output: // merged bam and bai files + tuple val("${i_readname}"), path("${i_readname}_markduplicates.bam"), path("${i_readname}_markduplicates.bai") + + script: + """ + ${gatk_app} \ + MarkDuplicates \ + --INPUT ${merge_bam} \ + --OUTPUT ${i_readname}_markduplicates.bam \ + --CREATE_INDEX true \ + --VALIDATION_STRINGENCY SILENT \ + --METRICS_FILE ${i_readname}_markduplicates.metrics + """ + + stub: + """ + #! /usr/bin/env bash + touch ${i_readname}_markduplicates.bam + touch ${i_readname}_markduplicates.bai + """ +} + process samtools_faidx { tag "${genome_fasta.simpleName}" label 'samtools' diff --git a/modules/local/RNAseq.nf b/modules/local/RNAseq.nf index ee74c75..8b6ff53 100644 --- a/modules/local/RNAseq.nf +++ b/modules/local/RNAseq.nf @@ -131,37 +131,6 @@ process MergeBamAlignment { """ } - -process MarkDuplicates { - tag "$i_readname" - label 'gatk' - publishDir "${params.outdir}/03_PrepGATK" - - input: // [readgroup, unmapped reads, mapped reads] - tuple val(i_readname), path(merge_bam), path(merge_bai) - - output: // merged bam and bai files - tuple val("${i_readname}"), path("${i_readname}_markduplicates.bam"), path("${i_readname}_markduplicates.bai") - - script: - """ - ${gatk_app} \ - MarkDuplicates \ - --INPUT ${merge_bam} \ - --OUTPUT ${i_readname}_markduplicates.bam \ - --CREATE_INDEX true \ - --VALIDATION_STRINGENCY SILENT \ - --METRICS_FILE ${i_readname}_markduplicates.metrics - """ - - stub: - """ - #! /usr/bin/env bash - touch ${i_readname}_markduplicates.bam - touch ${i_readname}_markduplicates.bai - """ -} - process SplitNCigarReads { tag "$i_readname" label 'gatk' diff --git a/subworkflows/local/long_read_variant_calling/main.nf b/subworkflows/local/long_read_variant_calling/main.nf index 6aa6d95..773b96d 100644 --- a/subworkflows/local/long_read_variant_calling/main.nf +++ b/subworkflows/local/long_read_variant_calling/main.nf @@ -3,6 +3,7 @@ nextflow.enable.dsl=2 include { CreateSequenceDictionary; + MarkDuplicates; samtools_faidx; keep_only_pass; bedtools_makewindows; } from '../../../modules/local/GATK.nf' @@ -15,10 +16,6 @@ include { pbmm2_index; calc_DPvalue as calc_DPvalue_LongRead; VariantFiltration as VariantFiltration_LongRead } from '../../../modules/local/LongReadseq.nf' -include { MarkDuplicates as MarkDuplicates_LongRead; } from '../../../modules/local/RNAseq.nf' - - - workflow LONGREAD_VARIANT_CALLING { take: genome_ch @@ -44,7 +41,7 @@ workflow LONGREAD_VARIANT_CALLING { | pbmm2_index | combine(cleanreads_ch) | pbmm2_align - | MarkDuplicates_LongRead + | MarkDuplicates | combine(windows_ch) | view | combine(genome_ch) diff --git a/subworkflows/local/rna_variant_calling/main.nf b/subworkflows/local/rna_variant_calling/main.nf index 64189c7..63bb3c9 100644 --- a/subworkflows/local/rna_variant_calling/main.nf +++ b/subworkflows/local/rna_variant_calling/main.nf @@ -5,6 +5,7 @@ nextflow.enable.dsl=2 include { FastqToSam; MarkIlluminaAdapters; CreateSequenceDictionary; + MarkDuplicates; samtools_faidx; bedtools_makewindows; CombineGVCFs; @@ -20,7 +21,6 @@ include { SamToFastq as SamToFastq_RNA; STAR_index; STAR_align; MergeBamAlignment as MergeBamAlignment_RNA; - MarkDuplicates; SplitNCigarReads; gatk_HaplotypeCaller as gatk_HaplotypeCaller_RNA; } from '../../../modules/local/RNAseq.nf' From 54618b1f55b7a30428c0c411b3b7ab32e3accc65 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 23 Jan 2024 09:57:03 -0800 Subject: [PATCH 2/5] Add MarkDuplicates to DNAseq pipeline Following the code from gatk4-data-processing https://github.com/gatk-workflows/gatk4-data-processing/blob/c44603c464fe3cb7d9b82da2a95f844fdeb20e3c/processing-for-variant-discovery-gatk4.wdl#L458-L498 --- modules/local/DNAseq.nf | 21 +++++++++++++++++++ .../local/dna_variant_calling/main.nf | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/modules/local/DNAseq.nf b/modules/local/DNAseq.nf index 61697da..a3adde7 100644 --- a/modules/local/DNAseq.nf +++ b/modules/local/DNAseq.nf @@ -127,6 +127,27 @@ process MergeBamAlignment { """ } +process MarkDuplicates { + label 'gatk' + publishDir "${params.outdir}/03_PrepGATK" + input: + tuple path(bam), path(bai), path(genome_fasta), path(genome_dict) + output: + path("${bam.simpleName}_marked.bam") + script: + """ + #! /usr/bin/env bash + $gatk_app --java-options "${java_options}" MarkDuplicates \ + --INPUT $bam \ + --OUTPUT ${bam.simpleName}_marked.bam \ + --METRICS_FILE ${bam.simpleName}_marked.metrics \ + --VALIDATION_STRINGENCY SILENT \ + --OPTICAL_DUPLICATE_PIXEL_DISTANCE 2500 \ + --ASSUME_SORT_ORDER "queryname" \ + --CREATE_MD5_FILE true + """ +} + process gatk_HaplotypeCaller { tag "$window" label 'gatk' diff --git a/subworkflows/local/dna_variant_calling/main.nf b/subworkflows/local/dna_variant_calling/main.nf index d7799f6..5e1cd60 100644 --- a/subworkflows/local/dna_variant_calling/main.nf +++ b/subworkflows/local/dna_variant_calling/main.nf @@ -20,6 +20,7 @@ include { SamToFastq as SamToFastq_DNA bwamem2_index; bwamem2_mem; MergeBamAlignment as MergeBamAlignment_DNA; + MarkDuplicates as MarkDuplicates_DNA; gatk_HaplotypeCaller as gatk_HaplotypeCaller_DNA; gatk_HaplotypeCaller_invariant; } from '../../../modules/local/DNAseq.nf' @@ -57,6 +58,9 @@ workflow DNA_VARIANT_CALLING { | combine(genome_ch) | combine(CreateSequenceDictionary.out) | MergeBamAlignment_DNA + | combine(genome_ch) + | combine(CreateSequenceDictionary.out) + | MarkDuplicates_DNA if(params.invariant) { allbambai_ch = MergeBamAlignment.out // do these need to be merged by read? From 9ad3c5ddb447771686ec84e44d36f3a3eb5ad882 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 23 Jan 2024 09:48:06 -0800 Subject: [PATCH 3/5] Add SortAndFixTags to DNAseq pipeline Following the code from gatk4-data-processing https://github.com/gatk-workflows/gatk4-data-processing/blob/c44603c464fe3cb7d9b82da2a95f844fdeb20e3c/processing-for-variant-discovery-gatk4.wdl#L406-L456 --- modules/local/DNAseq.nf | 25 +++++++++++++++++++ .../local/dna_variant_calling/main.nf | 4 +++ 2 files changed, 29 insertions(+) diff --git a/modules/local/DNAseq.nf b/modules/local/DNAseq.nf index a3adde7..44830ec 100644 --- a/modules/local/DNAseq.nf +++ b/modules/local/DNAseq.nf @@ -148,6 +148,31 @@ process MarkDuplicates { """ } +process SortAndFixTags { + label 'gatk' + publishDir "${params.outdir}/03_PrepGATK" + input: + tuple path(bam), path(genome_fasta), path(genome_dict) + output: + tuple path("${bam.simpleName}_sorted.bam"), path("${bam.simpleName}_sorted.bai") + script: + """ + #! /usr/bin/env bash + $gatk_app --java-options "${java_options}" SortSam \ + --INPUT $bam \ + --OUTPUT /dev/stdout \ + --SORT_ORDER "coordinate" \ + --CREATE_INDEX false \ + --CREATE_MD5_FILE false \ + | $gatk_app --java-options "${java_options}" SetNmMdAndUqTags \ + --INPUT /dev/stdin \ + --OUTPUT ${bam.simpleName}_sorted.bam \ + --CREATE_INDEX true \ + --CREATE_MD5_FILE true \ + --REFERENCE_SEQUENCE $genome_fasta + """ +} + process gatk_HaplotypeCaller { tag "$window" label 'gatk' diff --git a/subworkflows/local/dna_variant_calling/main.nf b/subworkflows/local/dna_variant_calling/main.nf index 5e1cd60..54697ea 100644 --- a/subworkflows/local/dna_variant_calling/main.nf +++ b/subworkflows/local/dna_variant_calling/main.nf @@ -21,6 +21,7 @@ include { SamToFastq as SamToFastq_DNA bwamem2_mem; MergeBamAlignment as MergeBamAlignment_DNA; MarkDuplicates as MarkDuplicates_DNA; + SortAndFixTags as SortAndFixTags_DNA; gatk_HaplotypeCaller as gatk_HaplotypeCaller_DNA; gatk_HaplotypeCaller_invariant; } from '../../../modules/local/DNAseq.nf' @@ -61,6 +62,9 @@ workflow DNA_VARIANT_CALLING { | combine(genome_ch) | combine(CreateSequenceDictionary.out) | MarkDuplicates_DNA + | combine(genome_ch) + | combine(CreateSequenceDictionary.out) + | SortAndFixTags_DNA if(params.invariant) { allbambai_ch = MergeBamAlignment.out // do these need to be merged by read? From 467d78f87be7fe1e9eba207459125bf6aa27737f Mon Sep 17 00:00:00 2001 From: vsatheesh Date: Wed, 24 Jan 2024 18:18:50 -0600 Subject: [PATCH 4/5] fix cammelcase parameter being converted to cammel-case --- main.nf | 4 ++-- nextflow.config | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index b63bdfe..2de0c4f 100644 --- a/main.nf +++ b/main.nf @@ -46,7 +46,7 @@ def helpMsg() { --gatk_app Link to gatk executable [default: '$gatk_app'] --java_options Java options for gatk [default:'${java_options}'] --gatk_cluster_options GATK cluster options [default:'${params.gatk_cluster_options}'] - --gatk_HaplotypeCaller_params Additional parameters to pass to GATK HaplotypeCaller [default:'${params.gatk_HaplotypeCaller_params}'] + --gatk_haplotype_caller_params Additional parameters to pass to GATK HaplotypeCaller [default:'${params.gatk_haplotype_caller_params}'] Aligners: --bwamem2_app Link to bwamem2 executable [default: '$bwamem2_app'] @@ -80,7 +80,7 @@ if(params.help){ def parameters_valid = ['help','outdir', 'genome','gtf','reads','reads_file','long_reads','invariant','seq', 'singularity_img','docker_img','container_img', - 'gatk_app','gatk_HaplotypeCaller_params', + 'gatk_app','gatk_haplotype_caller_params', 'star_app','star_index_params','star_index_file','bwamem2_app','samtools_app','bedtools_app','datamash_app','vcftools_app', 'pbmm2_app', 'java_options','window','queueSize','queue-size','account', 'threads', 'gatk_cluster_options'] as Set diff --git a/nextflow.config b/nextflow.config index b825db6..dddfa85 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,7 +28,7 @@ params { docker_img = 'j23414/gatk4' container_img = 'docker://ghcr.io/aseetharam/gatk:master' gatk_app = 'gatk' - gatk_HaplotypeCaller_params = '' + gatk_haplotype_caller_params = '' star_app = 'STAR' bwamem2_app = 'bwa-mem2' samtools_app = 'samtools' @@ -58,7 +58,7 @@ env { star_index_file = params.star_index_file bedtools_app = "$params.bedtools_app" gatk_app = "$params.gatk_app" - gatk_HaplotypeCaller_params = "$params.gatk_HaplotypeCaller_params" + gatk_HaplotypeCaller_params = "$params.gatk_haplotype_caller_params" datamash_app = "$params.datamash_app" vcftools_app = "$params.vcftools_app" pbmm2_app = "$params.pbmm2_app" From 5d9a73d5451f0a4dcd94e791e80b862f0ca5acbe Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Mon, 5 Feb 2024 11:11:09 -0800 Subject: [PATCH 5/5] fixup: gatk_haplotypecaller_params Use an elvis operator to assign default values, if a parameter is not set by the user. > WARN: Environment variable `gatk_HaplotypeCaller_params` evaluates to an empty value --- nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index dddfa85..e9a216f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,7 +28,7 @@ params { docker_img = 'j23414/gatk4' container_img = 'docker://ghcr.io/aseetharam/gatk:master' gatk_app = 'gatk' - gatk_haplotype_caller_params = '' + gatk_haplotype_caller_params = "" star_app = 'STAR' bwamem2_app = 'bwa-mem2' samtools_app = 'samtools' @@ -58,7 +58,7 @@ env { star_index_file = params.star_index_file bedtools_app = "$params.bedtools_app" gatk_app = "$params.gatk_app" - gatk_HaplotypeCaller_params = "$params.gatk_haplotype_caller_params" + gatk_HaplotypeCaller_params = params.gatk_haplotype_caller_params ? "$params.gatk_haplotype_caller_params" : " " datamash_app = "$params.datamash_app" vcftools_app = "$params.vcftools_app" pbmm2_app = "$params.pbmm2_app"