diff --git a/CITATIONS.md b/CITATIONS.md index d84e03ce..fedf7b37 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -36,7 +36,7 @@ > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. -- [Tabix]() +- [Tabix](https://pubmed.ncbi.nlm.nih.gov/21208982/) > Heng Li, Tabix: fast retrieval of sequence features from generic TAB-delimited files, Bioinformatics, Volume 27, Issue 5, 1 March 2011, Pages 718–719. doi: 10.1093/bioinformatics/btq671. PubMed PMID: 21208982; PubMed Central PMCID: PMC3042176. diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index f3b0ac1d..e1cb0b2b 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -9,3 +9,55 @@ report_section_order: order: -1001 export_plots: true + +# Run only these modules +run_modules: + - custom_content + - fastqc + - star + - samtools + - picard + - gatk + - snpeff + - vep + +# Order of modules +module_order: + - fastqc: + name: "FastQC (raw)" + path_filters: + - "*_val_*.zip" + - star: + name: "Read Alignment (STAR)" + - samtools: + name: "Samtools Flagstat" + - picard: + name: "GATK4 MarkDuplicates" + info: "Metrics generated either by GATK4 MarkDuplicates" + - qualimap: + name: "Qualimap" + - gatk: + name: "GATK4 BQSR" + - snpeff: + name: "SNPeff" + - vep: + name: "VEP" + +extra_fn_clean_exts: + - "_val" + +# Don't show % Dups in the General Stats table (we have this from Picard) +table_columns_visible: + fastqc: + percent_duplicates: False + +sp: + samtools/stats: + fn: "*.aligned.bam.stats" + samtools/flagstat: + fn: "*.aligned.bam.flagstat" + picard/markdups: + fn: "*.markdup.sorted.metrics" + snpeff: + contents: "SnpEff_version" + max_filesize: 5000000 diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 8570149a..e9a7a178 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -114,19 +114,16 @@ def validate_unique_samples(self): """ Assert that the combination of sample name and FASTQ filename is unique. - In addition to the validation, also rename the sample if more than one sample, - FASTQ file combination exists. + In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the + number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. """ assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique." - if len({pair[0] for pair in self._seen}) < len(self._seen): - counts = Counter(pair[0] for pair in self._seen) - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - if counts[sample] > 1: - row[self._sample_col] = f"{sample}_T{seen[sample]}" + seen = Counter() + for row in self.modified: + sample = row[self._sample_col] + seen[sample] += 1 + row[self._sample_col] = f"{sample}_T{seen[sample]}" def read_head(handle, num_lines=10): diff --git a/conf/modules.config b/conf/modules.config index 721b4c05..61cf9ba1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -51,7 +51,7 @@ process { withName: 'GUNZIP_.*' { publishDir = [ path: { "${params.outdir}/genome" }, - mode: 'copy', + mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, enabled: params.save_reference ] @@ -122,7 +122,7 @@ process { path: { "${params.outdir}/reports"}, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: !params.skip_qc + enabled: !params.skip_multiqc ] } @@ -146,9 +146,9 @@ process { params.save_unaligned ? '--outReadsUnmapped Fastx' : '', params.read_length ? "--sjdbOverhang ${params.read_length - 1}" : '', params.star_twopass ? '--twopassMode Basic' : '', - params.star_limitBAMsortRAM > 0 ? "--limitBAMsortRAM ${params.star_limitBAMsortRAM}" : "", - params.star_outBAMsortingBinsN > 0 ? "--outBAMsortingBinsN ${params.star_outBAMsortingBinsN}" : "", - params.star_limitOutSJcollapsed > 0 ? "--limitOutSJcollapsed ${params.star_limitOutSJcollapsed}" : "" + params.star_max_memory_bamsort > 0 ? "--limitBAMsortRAM ${params.star_max_memory_bamsort}" : "", + params.star_bins_bamsort > 0 ? "--outBAMsortingBinsN ${params.star_bins_bamsort}" : "", + params.star_max_collapsed_junc > 0 ? "--limitOutSJcollapsed ${params.star_max_collapsed_junc}" : "" ].join(' ').trim() publishDir = [ [ @@ -206,7 +206,7 @@ process { '--SUBDIVISION_MODE BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW', '--UNIQUE true', '--SORT true', - params.scatter_count ? "--SCATTER_COUNT $params.scatter_count" : '' + params.gatk_interval_scatter_count ? "--SCATTER_COUNT $params.gatk_interval_scatter_count" : '' ].join(' ').trim() publishDir = [ enabled: false ] } @@ -261,7 +261,7 @@ process { publishDir = [ path: { "${params.outdir}/reports/stats/${meta.id}" }, mode: params.publish_dir_mode, - enabled: !params.skip_qc, + enabled: !params.skip_multiqc, pattern: "*.{stats,flagstat}" ] } @@ -311,7 +311,7 @@ process { withName: GATK4_HAPLOTYPECALLER { ext.args = [ '--dont-use-soft-clipped-bases', - params.stand_call_conf ? "--standard-min-confidence-threshold-for-calling $params.stand_call_conf" : '', + params.gatk_hc_call_conf ? "--standard-min-confidence-threshold-for-calling $params.gatk_hc_call_conf" : '', params.bam_csi_index ? "--create-output-variant-index false" : "" ].join(' ').trim() publishDir = [ enabled: false ] @@ -340,10 +340,10 @@ process { withName: GATK4_VARIANTFILTRATION { ext.prefix = {"${meta.id}.haplotypecaller.filtered"} ext.args = [ - params.window ? "--window $params.window" : '', - params.cluster ? "--cluster $params.cluster" : '', - params.fs_filter ? "--filter-name \"FS\" --filter \"FS > $params.fs_filter\" " : '', - params.qd_filter ? "--filter-name \"QD\" --filter \"QD < $params.qd_filter\" " : '', + params.gatk_vf_window_size ? "--window $params.gatk_vf_window_size" : '', + params.gatk_vf_cluster_size ? "--cluster $params.gatk_vf_cluster_size" : '', + params.gatk_vf_fs_filter ? "--filter-name \"FS\" --filter \"FS > $params.gatk_vf_fs_filter\" " : '', + params.gatk_vf_qd_filter ? "--filter-name \"QD\" --filter \"QD < $params.gatk_vf_qd_filter\" " : '', ].join(' ').trim() publishDir = [ path: { "${params.outdir}/variant_calling/${meta.id}" }, @@ -366,7 +366,7 @@ process { mode: params.publish_dir_mode, path: { "${params.outdir}/reports/EnsemblVEP/${meta.id}" }, pattern: "*html", - enabled: !params.skip_qc + enabled: !params.skip_multiqc ] } @@ -380,7 +380,7 @@ process { path: { "${params.outdir}/reports/SnpEff/${meta.id}" }, pattern: "*.{csv,html,txt}", saveAs: { params.annotate_tools.contains('snpeff') ? it : null }, - enabled: !params.skip_qc + enabled: !params.skip_multiqc ] } diff --git a/conf/test.config b/conf/test.config index e931a9a8..dd25aa41 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,7 +20,7 @@ params { max_time = '6.h' // Input data - input = "${baseDir}/tests/csv/1.0/samplesheet.csv" + input = "https://raw.githubusercontent.com/nf-core/test-datasets/rnavar/samplesheet/v1.0/samplesheet.csv" // Genome references genome = 'WBcel235' diff --git a/docs/usage.md b/docs/usage.md index 4d6b2cb0..a59c1965 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -130,18 +130,18 @@ GATK best practices has been followed in this pipeline for RNA analysis, hence i > **NB:** Base recalibration can be turned off using `--skip_baserecalibration true` option. This is useful when you are analyzing data from non-model organisms where there is no known variant datasets exist. -`GATK SplitNCigarReads` is very time consuming step, therefore we made an attempt to break the GTF file into multiple chunks (scatters) using `GATK IntervalListTools` to run the process independently on each chunk in a parallel way to speed up the analysis. The default number of splits is set to 25, that means the GTF file is split into 25 smaller files and run `GATK SplitNCigarReads` on each of them in parallel. You can modify the number of splits using parameter `--scatter_count`. +`GATK SplitNCigarReads` is very time consuming step, therefore we made an attempt to break the GTF file into multiple chunks (scatters) using `GATK IntervalListTools` to run the process independently on each chunk in a parallel way to speed up the analysis. The default number of splits is set to 25, that means the GTF file is split into 25 smaller files and run `GATK SplitNCigarReads` on each of them in parallel. You can modify the number of splits using parameter `--gatk_interval_scatter_count`. ## Variant calling and filtering -`GATK HaplotypeCaller` is used for variant calling with default minimum phred-scaled confidence threshold as 20. This value can be changed using paramerter `--stand_call_conf`. +`GATK HaplotypeCaller` is used for variant calling with default minimum phred-scaled confidence threshold as 20. This value can be changed using paramerter `--gatk_hc_call_conf`. The pipeline runs a hard-filtering step on the variants by default. It does not filter out any variants, rather it flags i.e. PASS or other flags such as FS, QD, SnpCluster, etc. in FILTER column of the VCF. The following are the default filter criteria, however it can be changed using the respective parameters. -- `--cluster` is set to 3. It is the number of SNPs which make up a cluster. -- `--window` is set to 35. The window size (in bases) in which to evaluate clustered SNPs. -- `--fs_filter` is set to 30.0. Filter based on FisherStrand > 30.0. It is the Phred-scaled probability that there is strand bias at the site. -- `--qd_filter` is set to 2.0 meaning filter variants if Quality By Depth filter is < 2.0. +- `--gatk_vf_cluster_size` is set to 3. It is the number of SNPs which make up a cluster. +- `--gatk_vf_window_size` is set to 35. The window size (in bases) in which to evaluate clustered SNPs. +- `--gatk_vf_fs_filter` is set to 30.0. Filter based on FisherStrand > 30.0. It is the Phred-scaled probability that there is strand bias at the site. +- `--gatk_vf_qd_filter` is set to 2.0 meaning filter variants if Quality By Depth filter is < 2.0. Variant filtering is an optional step. You can skip it using `--skip_variantfiltration` parameter. diff --git a/nextflow.config b/nextflow.config index 5c930df0..ee010d49 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,72 +12,72 @@ params { // Pipeline options // Mandatory option - input = null // sample sheet + input = null // sample sheet // Genome and reference options - genome = 'GRCh38' - igenomes_base = 's3://ngi-igenomes/igenomes' - igenomes_ignore = false - save_reference = false - save_merged_fastq = false + genome = 'GRCh38' + igenomes_base = 's3://ngi-igenomes/igenomes' + igenomes_ignore = false + save_reference = false + save_merged_fastq = false // Sequence read information - read_length = 150 // Required for STAR to build index and align reads + read_length = 150 // Required for STAR to build index and align reads // Alignment - aligner = 'star' // Only STAR is currently supported. - star_twopass = true - star_ignore_sjdbgtf = false // Ignore GTF file while creating index or alignment by STAR - star_limitBAMsortRAM = 0 - star_outBAMsortingBinsN = 50 - star_limitOutSJcollapsed = 1000000 - seq_center = null - seq_platform = 'illumina' // Required for preparing for BAM headers for GATK to work - bam_csi_index = false - save_unaligned = false - save_align_intermeds = false + aligner = 'star' // Only STAR is currently supported. + star_twopass = true + star_ignore_sjdbgtf = false // Ignore GTF file while creating index or alignment by STAR + star_max_memory_bamsort = 0 // STAR parameter limitBAMsortRAM to specify maximum RAM for sorting BAM + star_bins_bamsort = 50 // STAR parameter outBAMsortingBinsN to specify number of bins for sorting BAM + star_max_collapsed_junc = 1000000 // STAR parameter limitOutSJcollapsed to specify max number of collapsed junctions + seq_center = null + seq_platform = 'illumina' // Required for preparing for BAM headers for GATK to work + bam_csi_index = false + save_unaligned = false + save_align_intermeds = false // Preprocessing of alignment - remove_duplicates = false + remove_duplicates = false // Variant calling - no_intervals = false + no_intervals = false // Variant annotation - annotate_tools = null // List of annotation tools to run - snpeff or vep or merge - annotation_cache = false // Annotation cache disabled - cadd_cache = null // CADD cache disabled - cadd_indels = null // No CADD InDels file - cadd_indels_tbi = null // No CADD InDels index - cadd_wg_snvs = null // No CADD SNVs file - cadd_wg_snvs_tbi = null // No CADD SNVs index - genesplicer = null // genesplicer disabled within VEP - snpeff_cache = null // No directory for snpEff cache - snpeff_db = null // No default db for snpeff - vep_cache = null // No directory for VEP cache - vep_genome = null // No default genome for VEP - vep_cache_version = null // No default cache version for VEP + annotate_tools = null // List of annotation tools to run - snpeff or vep or merge + annotation_cache = false // Annotation cache disabled + cadd_cache = null // CADD cache disabled + cadd_indels = null // No CADD InDels file + cadd_indels_tbi = null // No CADD InDels index + cadd_wg_snvs = null // No CADD SNVs file + cadd_wg_snvs_tbi = null // No CADD SNVs index + genesplicer = null // genesplicer disabled within VEP + snpeff_cache = null // No directory for snpEff cache + snpeff_db = null // No default db for snpeff + vep_cache = null // No directory for VEP cache + vep_genome = null // No default genome for VEP + vep_cache_version = null // No default cache version for VEP // Skip steps - skip_baserecalibration = false - skip_intervallisttools = false - skip_variantfiltration = false - skip_variantannotation = false + skip_baserecalibration = false + skip_intervallisttools = false + skip_variantfiltration = false + skip_variantannotation = false // GATK intervallist parameters - scatter_count = 25 + gatk_interval_scatter_count = 25 //GATK haplotypecaller parameters - stand_call_conf = 20 + gatk_hc_call_conf = 20 //GATK variant filter parameters - window = 35 - cluster = 3 - fs_filter = 30.0 - qd_filter = 2.0 + gatk_vf_window_size = 35 + gatk_vf_cluster_size = 3 + gatk_vf_fs_filter = 30.0 + gatk_vf_qd_filter = 2.0 // QC - skip_qc = false + skip_multiqc = false // MultiQC options multiqc_config = null diff --git a/nextflow_schema.json b/nextflow_schema.json index fa2466b9..b8bfe2a1 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -186,19 +186,19 @@ "description": "Do not use GTF file during STAR index buidling step", "help_text": "Do not use parameter --sjdbGTFfile during the STAR genomeGenerate process." }, - "star_limitBAMsortRAM": { + "star_max_memory_bamsort": { "type": "integer", "default": 0, "description": "Option to limit RAM when sorting BAM file. Value to be specified in bytes. If 0, will be set to the genome index size.", "help_text": "This parameter specifies the maximum available RAM (bytes) for sorting BAM during STAR alignment." }, - "star_outBAMsortingBinsN": { + "star_bins_bamsort": { "type": "integer", "default": 50, "description": "Specifies the number of genome bins for coordinate-sorting", "help_text": "This parameter specifies the number of bins to be used for coordinate sorting during STAR alignment step." }, - "star_limitOutSJcollapsed": { + "star_max_collapsed_junc": { "type": "integer", "default": 1000000, "description": "Specifies the maximum number of collapsed junctions" @@ -253,7 +253,7 @@ "default": "", "fa_icon": "fas fa-toolbox", "properties": { - "stand_call_conf": { + "gatk_hc_call_conf": { "type": "number", "default": 20, "fa_icon": "fas fa-hammer", @@ -363,10 +363,10 @@ "description": "Skip variant annotation", "help_text": "Set this parameter if you don't want to run variant annotation." }, - "skip_qc": { + "skip_multiqc": { "type": "boolean", - "description": "Skip QC reports", - "help_text": "This parameter disable all QC reports from recalibrated BAM file." + "description": "Skip MultiQC reports", + "help_text": "This parameter disable all QC reports" } } }, @@ -376,7 +376,7 @@ "fa_icon": "fas fa-terminal", "description": "Define parameters of the tools used in the pipeline", "properties": { - "scatter_count": { + "gatk_interval_scatter_count": { "type": "integer", "default": 25, "description": "Number of times the gene interval list to be split in order to run GATK haplotype caller in parallel", @@ -387,25 +387,25 @@ "description": "Do not use gene interval file during variant calling", "help_text": "This parameter, if set to True, does not use the gene intervals during the variant calling step, which then results in variants from all regions including non-genic. Default is False" }, - "window": { + "gatk_vf_window_size": { "type": "integer", "default": 35, "description": "The window size (in bases) in which to evaluate clustered SNPs.", "help_text": "This parameter is used by GATK variant filteration step. It defines the window size (in bases) in which to evaluate clustered SNPs. It has to be used together with the other option 'cluster'." }, - "cluster": { + "gatk_vf_cluster_size": { "type": "integer", "default": 3, "description": "The number of SNPs which make up a cluster. Must be at least 2.", "help_text": "This parameter is used by GATK variant filteration step. It defines the number of SNPs which make up a cluster within a window. Must be at least 2." }, - "fs_filter": { + "gatk_vf_fs_filter": { "type": "number", "default": 30, "description": "Value to be used for the FisherStrand (FS) filter", "help_text": "This parameter defines the value to use for the FisherStrand (FS) filter in the GATK variant-filtering step. \nThe value should given in a float number format. Default is 30.0" }, - "qd_filter": { + "gatk_vf_qd_filter": { "type": "number", "default": 2, "description": "Value to be used for the QualByDepth (QD) filter", diff --git a/subworkflows/local/annotate.nf b/subworkflows/local/annotate.nf index 4c7b03ab..9555dddb 100644 --- a/subworkflows/local/annotate.nf +++ b/subworkflows/local/annotate.nf @@ -24,7 +24,11 @@ workflow ANNOTATE { ch_vcf_ann = Channel.empty() if (tools.contains('snpeff') || tools.contains('merge')) { - SNPEFF_ANNOTATE(vcf, snpeff_db, snpeff_cache) + SNPEFF_ANNOTATE ( + vcf, + snpeff_db, + snpeff_cache + ) ch_vcf_ann = ch_vcf_ann.mix(SNPEFF_ANNOTATE.out.vcf_tbi) ch_reports = ch_reports.mix(SNPEFF_ANNOTATE.out.reports) ch_versions = ch_versions.mix(SNPEFF_ANNOTATE.out.versions.first()) @@ -32,14 +36,26 @@ workflow ANNOTATE { if (tools.contains('merge')) { vcf_ann_for_merge = SNPEFF_ANNOTATE.out.vcf_tbi.map{ meta, vcf, tbi -> [meta, vcf] } - MERGE_ANNOTATE(vcf_ann_for_merge, vep_genome, vep_species, vep_cache_version, vep_cache) + MERGE_ANNOTATE ( + vcf_ann_for_merge, + vep_genome, + vep_species, + vep_cache_version, + vep_cache + ) ch_vcf_ann = ch_vcf_ann.mix(MERGE_ANNOTATE.out.vcf_tbi) ch_reports = ch_reports.mix(MERGE_ANNOTATE.out.reports) ch_versions = ch_versions.mix(MERGE_ANNOTATE.out.versions.first()) } if (tools.contains('vep')) { - ENSEMBLVEP_ANNOTATE(vcf, vep_genome, vep_species, vep_cache_version, vep_cache) + ENSEMBLVEP_ANNOTATE ( + vcf, + vep_genome, + vep_species, + vep_cache_version, + vep_cache + ) ch_vcf_ann = ch_vcf_ann.mix(ENSEMBLVEP_ANNOTATE.out.vcf_tbi) ch_reports = ch_reports.mix(ENSEMBLVEP_ANNOTATE.out.reports) ch_versions = ch_versions.mix(ENSEMBLVEP_ANNOTATE.out.versions.first()) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 0aecf87f..95fc458e 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -9,11 +9,13 @@ workflow INPUT_CHECK { samplesheet // file: /path/to/samplesheet.csv main: - SAMPLESHEET_CHECK ( samplesheet ) - .csv - .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channel(it) } - .set { reads } + SAMPLESHEET_CHECK ( + samplesheet + ) + .csv + .splitCsv ( header:true, sep:',' ) + .map { create_fastq_channel(it) } + .set { reads } emit: reads // channel: [ val(meta), [ reads ] ] diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index b0721aaf..e4760b57 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -26,7 +26,9 @@ workflow PREPARE_GENOME { // Uncompress genome fasta file if required // if (params.fasta.endsWith('.gz')) { - GUNZIP_FASTA ( Channel.fromPath(params.fasta).map{ it -> [[id:it[0].baseName], it] } ) + GUNZIP_FASTA ( + Channel.fromPath(params.fasta).map{ it -> [[id:it[0].baseName], it] } + ) ch_fasta = GUNZIP_FASTA.out.gunzip.map{ meta, fasta -> [fasta] }.collect() ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions) } else { @@ -36,11 +38,12 @@ workflow PREPARE_GENOME { // // Uncompress GTF annotation file or create from GFF3 if required // - ch_gffread_version = Channel.empty() if (params.gtf) { if (params.gtf.endsWith('.gz')) { - GUNZIP_GTF ( Channel.fromPath(params.gtf).map{ it -> [[id:it[0].baseName], it] } ) + GUNZIP_GTF ( + Channel.fromPath(params.gtf).map{ it -> [[id:it[0].baseName], it] } + ) ch_gtf = GUNZIP_GTF.out.gunzip.map{ meta, gtf -> [gtf] }.collect() ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) } else { @@ -48,13 +51,21 @@ workflow PREPARE_GENOME { } } else if (params.gff) { if (params.gff.endsWith('.gz')) { - GUNZIP_GFF ( Channel.fromPath(params.gff).map{ it -> [[id:it[0].baseName], it] } ) + GUNZIP_GFF ( + Channel.fromPath(params.gff).map{ it -> [[id:it[0].baseName], it] } + ) ch_gff = GUNZIP_GFF.out.gunzip.map{ meta, gff -> [gff] }.collect() ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions) } else { ch_gff = Channel.fromPath(params.gff).collect() } - ch_gtf = GFFREAD ( ch_gff ).gtf + + GFFREAD ( + ch_gff + ) + .gtf + .set { ch_gtf } + ch_versions = ch_versions.mix(GFFREAD.out.versions) } @@ -63,7 +74,9 @@ workflow PREPARE_GENOME { // if (params.exon_bed) { if (params.exon_bed.endsWith('.gz')) { - GUNZIP_GENE_BED ( Channel.fromPath(params.exon_bed).map{ it -> [[id:it[0].baseName], it] } ) + GUNZIP_GENE_BED ( + Channel.fromPath(params.exon_bed).map{ it -> [[id:it[0].baseName], it] } + ) ch_gene_bed = GUNZIP_GENE_BED.out.gunzip.map{ meta, bed -> [bed] }.collect() ch_versions = ch_versions.mix(GUNZIP_GENE_BED.out.versions) } else { @@ -78,7 +91,9 @@ workflow PREPARE_GENOME { ch_fasta_fai = Channel.empty() if (params.fasta_fai) ch_fasta_fai = Channel.fromPath(params.fasta_fai).collect() if (!params.fasta_fai) { - SAMTOOLS_FAIDX(ch_fasta.map{ it -> [[id:it[0].getName()], it]}) + SAMTOOLS_FAIDX( + ch_fasta.map{ it -> [[id:it[0].getName()], it]} + ) ch_fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] }.collect() ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions) } @@ -91,13 +106,13 @@ workflow PREPARE_GENOME { // // Uncompress STAR index or generate from scratch if required // - ch_star_index = Channel.empty() - if ('star' in prepare_tool_indices) { if (params.star_index) { if (params.star_index.endsWith('.tar.gz')) { - UNTAR_STAR_INDEX (Channel.fromPath(params.star_index).map{ it -> [[id:it[0].baseName], it] }) + UNTAR_STAR_INDEX ( + Channel.fromPath(params.star_index).map{ it -> [[id:it[0].baseName], it] } + ) ch_star_index = UNTAR_STAR_INDEX.out.untar.map{ meta, star_index -> [star_index] }.collect() ch_versions = ch_versions.mix(UNTAR_STAR_INDEX.out.versions) } else { @@ -105,7 +120,11 @@ workflow PREPARE_GENOME { } } else { - ch_star_index = STAR_GENOMEGENERATE(ch_fasta,ch_gtf).index + STAR_GENOMEGENERATE ( + ch_fasta,ch_gtf + ) + .index + .set { ch_star_index } ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) } diff --git a/subworkflows/nf-core/align_star.nf b/subworkflows/nf-core/align_star.nf index 87e482db..2a4069fd 100644 --- a/subworkflows/nf-core/align_star.nf +++ b/subworkflows/nf-core/align_star.nf @@ -21,13 +21,22 @@ workflow ALIGN_STAR { // // Map reads with STAR // - STAR_ALIGN ( reads, index, gtf, star_ignore_sjdbgtf, seq_platform, seq_center ) + STAR_ALIGN ( + reads, + index, + gtf, + star_ignore_sjdbgtf, + seq_platform, + seq_center + ) ch_versions = ch_versions.mix(STAR_ALIGN.out.versions.first()) // // Sort, index BAM file and run samtools stats, flagstat and idxstats // - BAM_SORT_SAMTOOLS ( STAR_ALIGN.out.bam ) + BAM_SORT_SAMTOOLS ( + STAR_ALIGN.out.bam + ) ch_versions = ch_versions.mix(BAM_SORT_SAMTOOLS.out.versions.first()) emit: diff --git a/subworkflows/nf-core/bam_sort_samtools.nf b/subworkflows/nf-core/bam_sort_samtools.nf index d735bd37..f473b759 100644 --- a/subworkflows/nf-core/bam_sort_samtools.nf +++ b/subworkflows/nf-core/bam_sort_samtools.nf @@ -14,10 +14,14 @@ workflow BAM_SORT_SAMTOOLS { ch_versions = Channel.empty() - SAMTOOLS_SORT ( ch_bam ) + SAMTOOLS_SORT ( + ch_bam + ) ch_versions = ch_versions.mix(SAMTOOLS_SORT.out.versions.first()) - SAMTOOLS_INDEX ( SAMTOOLS_SORT.out.bam ) + SAMTOOLS_INDEX ( + SAMTOOLS_SORT.out.bam + ) ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) SAMTOOLS_SORT.out.bam @@ -33,7 +37,9 @@ workflow BAM_SORT_SAMTOOLS { } .set { ch_bam_bai } - BAM_STATS_SAMTOOLS ( ch_bam_bai ) + BAM_STATS_SAMTOOLS ( + ch_bam_bai + ) ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS.out.versions.first()) emit: diff --git a/subworkflows/nf-core/bam_stats_samtools.nf b/subworkflows/nf-core/bam_stats_samtools.nf index 3d505a4d..68d632c3 100644 --- a/subworkflows/nf-core/bam_stats_samtools.nf +++ b/subworkflows/nf-core/bam_stats_samtools.nf @@ -13,13 +13,20 @@ workflow BAM_STATS_SAMTOOLS { main: ch_versions = Channel.empty() - SAMTOOLS_STATS ( ch_bam_bai, [] ) + SAMTOOLS_STATS ( + ch_bam_bai, + [] + ) ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions.first()) - SAMTOOLS_FLAGSTAT ( ch_bam_bai ) + SAMTOOLS_FLAGSTAT ( + ch_bam_bai + ) ch_versions = ch_versions.mix(SAMTOOLS_FLAGSTAT.out.versions.first()) - SAMTOOLS_IDXSTATS ( ch_bam_bai ) + SAMTOOLS_IDXSTATS ( + ch_bam_bai + ) ch_versions = ch_versions.mix(SAMTOOLS_IDXSTATS.out.versions.first()) emit: diff --git a/subworkflows/nf-core/ensemblvep_annotate.nf b/subworkflows/nf-core/ensemblvep_annotate.nf index 40572a8b..13dffee4 100644 --- a/subworkflows/nf-core/ensemblvep_annotate.nf +++ b/subworkflows/nf-core/ensemblvep_annotate.nf @@ -17,9 +17,18 @@ workflow ENSEMBLVEP_ANNOTATE { ch_versions = Channel.empty() - ENSEMBLVEP(vcf, vep_genome, vep_species, vep_cache_version, vep_cache) + ENSEMBLVEP ( + vcf, + vep_genome, + vep_species, + vep_cache_version, + vep_cache + ) ch_versions = ch_versions.mix(ENSEMBLVEP.out.versions.first()) - TABIX_BGZIPTABIX(ENSEMBLVEP.out.vcf) + + TABIX_BGZIPTABIX ( + ENSEMBLVEP.out.vcf + ) ch_versions = ch_versions.mix(TABIX_BGZIPTABIX.out.versions.first()) emit: diff --git a/subworkflows/nf-core/markduplicates.nf b/subworkflows/nf-core/markduplicates.nf index b1c41edc..85b724dd 100644 --- a/subworkflows/nf-core/markduplicates.nf +++ b/subworkflows/nf-core/markduplicates.nf @@ -14,13 +14,17 @@ workflow MARKDUPLICATES { ch_versions = Channel.empty() - GATK4_MARKDUPLICATES(bam) + GATK4_MARKDUPLICATES ( + bam + ) ch_versions = ch_versions.mix(GATK4_MARKDUPLICATES.out.versions.first()) // // Index BAM file and run samtools stats, flagstat and idxstats // - SAMTOOLS_INDEX(GATK4_MARKDUPLICATES.out.bam) + SAMTOOLS_INDEX ( + GATK4_MARKDUPLICATES.out.bam + ) ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) GATK4_MARKDUPLICATES.out.bam @@ -31,7 +35,9 @@ workflow MARKDUPLICATES { else [meta, bam, csi]} .set{ch_bam_bai} - BAM_STATS_SAMTOOLS(ch_bam_bai) + BAM_STATS_SAMTOOLS ( + ch_bam_bai + ) ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS.out.versions.first()) emit: diff --git a/subworkflows/nf-core/recalibrate.nf b/subworkflows/nf-core/recalibrate.nf index 70309891..664cf3f5 100644 --- a/subworkflows/nf-core/recalibrate.nf +++ b/subworkflows/nf-core/recalibrate.nf @@ -24,11 +24,18 @@ workflow RECALIBRATE { bam_recalibrated = Channel.empty() bam_reports = Channel.empty() - APPLYBQSR(bam, fasta, fai, dict) + APPLYBQSR ( + bam, + fasta, + fai, + dict + ) bam_recalibrated = APPLYBQSR.out.bam ch_versions = ch_versions.mix(APPLYBQSR.out.versions.first()) - SAMTOOLS_INDEX(bam_recalibrated) + SAMTOOLS_INDEX ( + bam_recalibrated + ) bam_recalibrated_index = bam_recalibrated .join(SAMTOOLS_INDEX.out.bai, by: [0], remainder: true) .join(SAMTOOLS_INDEX.out.csi, by: [0], remainder: true) diff --git a/subworkflows/nf-core/snpeff_annotate.nf b/subworkflows/nf-core/snpeff_annotate.nf index b7b618d0..9a8b65bc 100644 --- a/subworkflows/nf-core/snpeff_annotate.nf +++ b/subworkflows/nf-core/snpeff_annotate.nf @@ -15,9 +15,16 @@ workflow SNPEFF_ANNOTATE { ch_versions = Channel.empty() - SNPEFF(vcf, snpeff_db, snpeff_cache) + SNPEFF ( + vcf, + snpeff_db, + snpeff_cache + ) ch_versions = ch_versions.mix(SNPEFF.out.versions.first()) - TABIX_BGZIPTABIX(SNPEFF.out.vcf) + + TABIX_BGZIPTABIX ( + SNPEFF.out.vcf + ) ch_versions = ch_versions.mix(TABIX_BGZIPTABIX.out.versions.first()) emit: diff --git a/subworkflows/nf-core/splitncigar.nf b/subworkflows/nf-core/splitncigar.nf index 150c5bc1..1b1895f7 100644 --- a/subworkflows/nf-core/splitncigar.nf +++ b/subworkflows/nf-core/splitncigar.nf @@ -26,7 +26,12 @@ workflow SPLITNCIGAR { [new_meta, bam, bai, intervals] }.set{bam_interval} - GATK4_SPLITNCIGARREADS(bam_interval, fasta, fasta_fai, fasta_dict) + GATK4_SPLITNCIGARREADS ( + bam_interval, + fasta, + fasta_fai, + fasta_dict + ) bam_splitncigar = GATK4_SPLITNCIGARREADS.out.bam ch_versions = ch_versions.mix(GATK4_SPLITNCIGARREADS.out.versions.first()) @@ -37,11 +42,16 @@ workflow SPLITNCIGAR { [new_meta, bam] }.groupTuple().set{bam_splitncigar_interval} - SAMTOOLS_MERGE(bam_splitncigar_interval, fasta) + SAMTOOLS_MERGE ( + bam_splitncigar_interval, + fasta + ) splitncigar_bam = SAMTOOLS_MERGE.out.bam ch_versions = ch_versions.mix(SAMTOOLS_MERGE.out.versions.first()) - SAMTOOLS_INDEX(splitncigar_bam) + SAMTOOLS_INDEX ( + splitncigar_bam + ) splitncigar_bam_bai = splitncigar_bam .join(SAMTOOLS_INDEX.out.bai, by: [0], remainder: true) .join(SAMTOOLS_INDEX.out.csi, by: [0], remainder: true) diff --git a/tests/csv/1.0/samplesheet.csv b/tests/csv/1.0/samplesheet.csv deleted file mode 100644 index d6a719d9..00000000 --- a/tests/csv/1.0/samplesheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -sample,fastq_1,fastq_2,strandedness -GM12878,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz,reverse -TEST123,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz,https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz,reverse diff --git a/tests/test_annotation.yml b/tests/test_annotation.yml index cf8fc787..b6fb529d 100644 --- a/tests/test_annotation.yml +++ b/tests/test_annotation.yml @@ -8,7 +8,7 @@ - path: results/variant_annotation/GM12878/GM12878_snpEff.ann.vcf.gz.tbi - path: results/reports/multiqc_report.html - name: Run VEP - command: nextflow run main.nf -profile test,docker --annotate_tools vep --skip_qc + command: nextflow run main.nf -profile test,docker --annotate_tools vep --skip_multiqc tags: - annotation - vep @@ -16,7 +16,7 @@ - path: results/variant_annotation/GM12878/GM12878_VEP.ann.vcf.gz - path: results/variant_annotation/GM12878/GM12878_VEP.ann.vcf.gz.tbi - name: Run snpEff followed by VEP - command: nextflow run main.nf -profile test,docker --annotate_tools merge --skip_qc + command: nextflow run main.nf -profile test,docker --annotate_tools merge --skip_multiqc tags: - annotation - merge diff --git a/tests/test_bamcsiindex.yml b/tests/test_bamcsiindex.yml index e1fd170e..06fb817f 100644 --- a/tests/test_bamcsiindex.yml +++ b/tests/test_bamcsiindex.yml @@ -6,9 +6,5 @@ - path: results/reports/multiqc_report.html - path: results/preprocessing/GM12878/GM12878.markdup.sorted.bam - path: results/preprocessing/GM12878/GM12878.markdup.sorted.bam.csi - - path: results/preprocessing/TEST123/TEST123.markdup.sorted.bam - - path: results/preprocessing/TEST123/TEST123.markdup.sorted.bam.csi - path: results/variant_calling/GM12878/GM12878.haplotypecaller.vcf.gz - path: results/variant_calling/GM12878/GM12878.haplotypecaller.vcf.gz.csi - - path: results/variant_calling/TEST123/TEST123.haplotypecaller.vcf.gz - - path: results/variant_calling/TEST123/TEST123.haplotypecaller.vcf.gz.csi diff --git a/tests/test_default.yml b/tests/test_default.yml index 79264af6..36cb35fa 100644 --- a/tests/test_default.yml +++ b/tests/test_default.yml @@ -11,9 +11,3 @@ - path: results/variant_calling/GM12878/GM12878.haplotypecaller.vcf.gz.tbi - path: results/variant_calling/GM12878/GM12878.haplotypecaller.filtered.vcf.gz - path: results/variant_calling/GM12878/GM12878.haplotypecaller.filtered.vcf.gz.tbi - - path: results/preprocessing/TEST123/TEST123.recal.bam - - path: results/preprocessing/TEST123/TEST123.recal.bam.bai - - path: results/variant_calling/TEST123/TEST123.haplotypecaller.vcf.gz - - path: results/variant_calling/TEST123/TEST123.haplotypecaller.vcf.gz.tbi - - path: results/variant_calling/TEST123/TEST123.haplotypecaller.filtered.vcf.gz - - path: results/variant_calling/TEST123/TEST123.haplotypecaller.filtered.vcf.gz.tbi diff --git a/workflows/rnavar.nf b/workflows/rnavar.nf index a8a76fae..34688263 100644 --- a/workflows/rnavar.nf +++ b/workflows/rnavar.nf @@ -94,18 +94,18 @@ def seq_platform = params.seq_platform ? params.seq_platform : [] def seq_center = params.seq_center ? params.seq_center : [] // Initialize file channels based on params -dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.empty() -dbsnp_tbi = params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi).collect() : Channel.empty() -known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.empty() -known_indels_tbi = params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : Channel.empty() +ch_dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.empty() +ch_dbsnp_tbi = params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi).collect() : Channel.empty() +ch_known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.empty() +ch_known_indels_tbi = params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : Channel.empty() // Initialize varaint annotation associated channels -def snpeff_db = params.snpeff_db ?: Channel.empty() -def vep_cache_version = params.vep_cache_version ?: Channel.empty() -def vep_genome = params.vep_genome ?: Channel.empty() -def vep_species = params.vep_species ?: Channel.empty() -def snpeff_cache = params.snpeff_cache ? Channel.fromPath(params.snpeff_cache).collect() : [] -def vep_cache = params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : [] +ch_snpeff_db = params.snpeff_db ?: Channel.empty() +ch_vep_cache_version = params.vep_cache_version ?: Channel.empty() +ch_vep_genome = params.vep_genome ?: Channel.empty() +ch_vep_species = params.vep_species ?: Channel.empty() +ch_snpeff_cache = params.snpeff_cache ? Channel.fromPath(params.snpeff_cache).collect() : [] +ch_vep_cache = params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : [] // MultiQC reporting def multiqc_report = [] @@ -126,7 +126,9 @@ workflow RNAVAR { // // SUBWORKFLOW: Uncompress and prepare reference genome files // - PREPARE_GENOME (prepareToolIndices) + PREPARE_GENOME ( + prepareToolIndices + ) ch_genome_bed = Channel.from([id:'genome.bed']).combine(PREPARE_GENOME.out.exon_bed) ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions) @@ -139,8 +141,10 @@ workflow RNAVAR { .reads .map { meta, fastq -> - meta.id = meta.id.split('_')[0..meta.id.split('_').size()-2].join('_') - [ meta, fastq ] } + def meta_clone = meta.clone() + meta_clone.id = meta_clone.id.split('_')[0..-2].join('_') + [ meta_clone, fastq ] + } .groupTuple(by: [0]) .branch { meta, fastq -> @@ -164,9 +168,8 @@ workflow RNAVAR { ch_versions = ch_versions.mix(CAT_FASTQ.out.versions.first().ifEmpty(null)) // - // MODULE: Run FastQC + // MODULE: Generate QC summary using FastQC // - FASTQC ( ch_cat_fastq ) @@ -174,29 +177,31 @@ workflow RNAVAR { ch_versions = ch_versions.mix(FASTQC.out.versions.first()) // - // PREPARE THE INTERVAL LIST FROM GTF FILE + // MODULE: Prepare the interval list from the GTF file using GATK4 BedToIntervalList // - ch_interval_list = Channel.empty() - GATK4_BEDTOINTERVALLIST(ch_genome_bed, PREPARE_GENOME.out.dict) + GATK4_BEDTOINTERVALLIST( + ch_genome_bed, + PREPARE_GENOME.out.dict + ) ch_interval_list = GATK4_BEDTOINTERVALLIST.out.interval_list ch_versions = ch_versions.mix(GATK4_BEDTOINTERVALLIST.out.versions.first().ifEmpty(null)) // - // MODULE: IntervalListTools from GATK4 + // MODULE: Scatter one interval-list into many interval-files using GATK4 IntervalListTools // - ch_interval_list_split = Channel.empty() if (!params.skip_intervallisttools) { - GATK4_INTERVALLISTTOOLS(ch_interval_list) + GATK4_INTERVALLISTTOOLS( + ch_interval_list + ) ch_interval_list_split = GATK4_INTERVALLISTTOOLS.out.interval_list.map{ meta, bed -> [bed] }.flatten() } else ch_interval_list_split = ch_interval_list // - // SUBWORKFLOW: Alignment with STAR + // SUBWORKFLOW: Perform read alignment using STAR aligner // - ch_genome_bam = Channel.empty() ch_genome_bam_index = Channel.empty() ch_samtools_stats = Channel.empty() @@ -224,8 +229,12 @@ workflow RNAVAR { ch_reports = ch_reports.mix(ALIGN_STAR.out.log_final.collect{it[1]}.ifEmpty([])) ch_versions = ch_versions.mix(ALIGN_STAR.out.versions.first().ifEmpty(null)) + // // SUBWORKFLOW: Mark duplicates with GATK4 - MARKDUPLICATES(ch_genome_bam) + // + MARKDUPLICATES ( + ch_genome_bam + ) ch_genome_bam = MARKDUPLICATES.out.bam_bai //Gather QC reports @@ -233,36 +242,45 @@ workflow RNAVAR { ch_reports = ch_reports.mix(MARKDUPLICATES.out.metrics.collect{it[1]}.ifEmpty([])) ch_versions = ch_versions.mix(MARKDUPLICATES.out.versions.first().ifEmpty(null)) - // Subworkflow - SplitNCigarReads from GATK4 over the intervals + // + // SUBWORKFLOW: SplitNCigarReads from GATK4 over the intervals // Splits reads that contain Ns in their cigar string (e.g. spanning splicing events in RNAseq data). - splitncigar_bam_bai = Channel.empty() - SPLITNCIGAR(ch_genome_bam, PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.fai, PREPARE_GENOME.out.dict, ch_interval_list_split) - splitncigar_bam_bai = SPLITNCIGAR.out.bam_bai - ch_versions = ch_versions.mix(SPLITNCIGAR.out.versions.first().ifEmpty(null)) - - bam_variant_calling = Channel.empty() - + // + ch_splitncigar_bam_bai = Channel.empty() + SPLITNCIGAR ( + ch_genome_bam, + PREPARE_GENOME.out.fasta, + PREPARE_GENOME.out.fai, + PREPARE_GENOME.out.dict, + ch_interval_list_split + ) + ch_splitncigar_bam_bai = SPLITNCIGAR.out.bam_bai + ch_versions = ch_versions.mix(SPLITNCIGAR.out.versions.first().ifEmpty(null)) + + // + // MODULE: BaseRecalibrator from GATK4 + // Generates a recalibration table based on various co-variates + // + ch_bam_variant_calling = Channel.empty() if(!params.skip_baserecalibration) { - // MODULE: BaseRecalibrator from GATK4 ch_bqsr_table = Channel.empty() - // known_sites is made by grouping both the dbsnp and the known indels ressources // they can either or both be optional - known_sites = dbsnp.concat(known_indels).collect() - known_sites_tbi = dbsnp_tbi.concat(known_indels_tbi).collect() + ch_known_sites = ch_dbsnp.concat(ch_known_indels).collect() + ch_known_sites_tbi = ch_dbsnp_tbi.concat(ch_known_indels_tbi).collect() ch_interval_list_recalib = ch_interval_list.map{ meta, bed -> [bed] }.flatten() - splitncigar_bam_bai.combine(ch_interval_list_recalib) + ch_splitncigar_bam_bai.combine(ch_interval_list_recalib) .map{ meta, bam, bai, interval -> [ meta, bam, bai, interval] - }.set{splitncigar_bam_bai_interval} + }.set{ch_splitncigar_bam_bai_interval} GATK4_BASERECALIBRATOR( - splitncigar_bam_bai_interval, + ch_splitncigar_bam_bai_interval, PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.fai, PREPARE_GENOME.out.dict, - known_sites, - known_sites_tbi + ch_known_sites, + ch_known_sites_tbi ) ch_bqsr_table = GATK4_BASERECALIBRATOR.out.table @@ -270,44 +288,45 @@ workflow RNAVAR { ch_reports = ch_reports.mix(ch_bqsr_table.map{ meta, table -> table}) ch_versions = ch_versions.mix(GATK4_BASERECALIBRATOR.out.versions.first().ifEmpty(null)) - // MODULE: ApplyBaseRecalibrator from GATK4 - bam_applybqsr = splitncigar_bam_bai.join(ch_bqsr_table, by: [0]) - bam_recalibrated_qc = Channel.empty() + ch_bam_applybqsr = ch_splitncigar_bam_bai.join(ch_bqsr_table, by: [0]) + ch_bam_recalibrated_qc = Channel.empty() ch_interval_list_applybqsr = ch_interval_list.map{ meta, bed -> [bed] }.flatten() - bam_applybqsr.combine(ch_interval_list_applybqsr) + ch_bam_applybqsr.combine(ch_interval_list_applybqsr) .map{ meta, bam, bai, table, interval -> [ meta, bam, bai, table, interval] - }.set{applybqsr_bam_bai_interval} + }.set{ch_applybqsr_bam_bai_interval} + // + // MODULE: ApplyBaseRecalibrator from GATK4 + // Recalibrates the base qualities of the input reads based on the recalibration table produced by the GATK BaseRecalibrator tool. + // RECALIBRATE( - params.skip_qc, - applybqsr_bam_bai_interval, + params.skip_multiqc, + ch_applybqsr_bam_bai_interval, PREPARE_GENOME.out.dict, PREPARE_GENOME.out.fai, PREPARE_GENOME.out.fasta ) - bam_variant_calling = RECALIBRATE.out.bam - bam_recalibrated_qc = RECALIBRATE.out.qc + ch_bam_variant_calling = RECALIBRATE.out.bam + ch_bam_recalibrated_qc = RECALIBRATE.out.qc // Gather QC reports - ch_reports = ch_reports.mix(RECALIBRATE.out.qc.collect{it[1]}.ifEmpty([])) - ch_versions = ch_versions.mix(RECALIBRATE.out.versions.first().ifEmpty(null)) + ch_reports = ch_reports.mix(RECALIBRATE.out.qc.collect{it[1]}.ifEmpty([])) + ch_versions = ch_versions.mix(RECALIBRATE.out.versions.first().ifEmpty(null)) } else { - bam_variant_calling = splitncigar_bam_bai + ch_bam_variant_calling = ch_splitncigar_bam_bai } - // MODULE: HaplotypeCaller from GATK4 interval_flag = params.no_intervals // Run haplotyper even in the absence of dbSNP files if (!params.dbsnp){ - dbsnp = [] - dbsnp_tbi = [] + ch_dbsnp = [] + ch_dbsnp_tbi = [] } - haplotypecaller_vcf = Channel.empty() - - haplotypecaller_interval_bam = bam_variant_calling.combine(ch_interval_list_split) + ch_haplotypecaller_vcf = Channel.empty() + ch_haplotypecaller_interval_bam = ch_bam_variant_calling.combine(ch_interval_list_split) .map{ meta, bam, bai, interval_list -> new_meta = meta.clone() new_meta.id = meta.id + "_" + interval_list.baseName @@ -315,16 +334,20 @@ workflow RNAVAR { [new_meta, bam, bai, interval_list] } + // + // MODULE: HaplotypeCaller from GATK4 + // Calls germline SNPs and indels via local re-assembly of haplotypes. + // GATK4_HAPLOTYPECALLER( - haplotypecaller_interval_bam, + ch_haplotypecaller_interval_bam, PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.fai, PREPARE_GENOME.out.dict, - dbsnp, - dbsnp_tbi + ch_dbsnp, + ch_dbsnp_tbi ) - haplotypecaller_raw = GATK4_HAPLOTYPECALLER.out.vcf + ch_haplotypecaller_raw = GATK4_HAPLOTYPECALLER.out.vcf .map{ meta, vcf -> meta.id = meta.sample [meta, vcf]} @@ -332,18 +355,25 @@ workflow RNAVAR { ch_versions = ch_versions.mix(GATK4_HAPLOTYPECALLER.out.versions.first().ifEmpty(null)) + // + // MODULE: MergeVCFS from GATK4 + // Merge multiple VCF files into one VCF + // GATK4_MERGEVCFS( - haplotypecaller_raw, + ch_haplotypecaller_raw, PREPARE_GENOME.out.dict ) - haplotypecaller_vcf = GATK4_MERGEVCFS.out.vcf + ch_haplotypecaller_vcf = GATK4_MERGEVCFS.out.vcf ch_versions = ch_versions.mix(GATK4_MERGEVCFS.out.versions.first().ifEmpty(null)) + // + // MODULE: Index the VCF using TABIX + // TABIX( - haplotypecaller_vcf + ch_haplotypecaller_vcf ) - haplotypecaller_vcf_tbi = haplotypecaller_vcf + ch_haplotypecaller_vcf_tbi = ch_haplotypecaller_vcf .join(TABIX.out.tbi, by: [0], remainder: true) .join(TABIX.out.csi, by: [0], remainder: true) .map{meta, vcf, tbi, csi -> @@ -351,34 +381,40 @@ workflow RNAVAR { else [meta, vcf, csi] } - ch_versions = ch_versions.mix(TABIX.out.versions.first().ifEmpty(null)) - final_vcf = haplotypecaller_vcf + ch_versions = ch_versions.mix(TABIX.out.versions.first().ifEmpty(null)) + ch_final_vcf = ch_haplotypecaller_vcf + // // MODULE: VariantFiltration from GATK4 + // Filter variant calls based on certain criteria + // if (!params.skip_variantfiltration && !params.bam_csi_index ) { GATK4_VARIANTFILTRATION( - haplotypecaller_vcf_tbi, + ch_haplotypecaller_vcf_tbi, PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.fai, PREPARE_GENOME.out.dict ) - filtered_vcf = GATK4_VARIANTFILTRATION.out.vcf - final_vcf = filtered_vcf + ch_filtered_vcf = GATK4_VARIANTFILTRATION.out.vcf + ch_final_vcf = ch_filtered_vcf ch_versions = ch_versions.mix(GATK4_VARIANTFILTRATION.out.versions.first().ifEmpty(null)) } + // + // SUBWORKFLOW: Annotate variants using snpEff and Ensembl VEP if enabled. + // if((!params.skip_variantannotation) && (params.annotate_tools) && (params.annotate_tools.contains('merge') || params.annotate_tools.contains('snpeff') || params.annotate_tools.contains('vep'))) { ANNOTATE( - final_vcf, + ch_final_vcf, params.annotate_tools, - snpeff_db, - snpeff_cache, - vep_genome, - vep_species, - vep_cache_version, - vep_cache) + ch_snpeff_db, + ch_snpeff_cache, + ch_vep_genome, + ch_vep_species, + ch_vep_cache_version, + ch_vep_cache) // Gather QC reports ch_reports = ch_reports.mix(ANNOTATE.out.reports) @@ -393,9 +429,9 @@ workflow RNAVAR { // // MODULE: MultiQC + // Present summary of reads, alignment, duplicates, BSQR stats for all samples as well as workflow summary/parameters as single report // - - if (!params.skip_qc){ + if (!params.skip_multiqc){ workflow_summary = WorkflowRnavar.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) ch_multiqc_files = Channel.empty().mix(ch_version_yaml,