Merge pull request #56 from praveenraj2018/dev

Addressed the review comments
nf-core · Jun 16, 2022 · e612b8f · e612b8f
2 parents ed9e2a3 + 1ea2855
commit e612b8f
Show file tree

Hide file tree

Showing 24 changed files with 393 additions and 223 deletions.
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -36,7 +36,7 @@
 
   > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
 
-- [Tabix]()
+- [Tabix](https://pubmed.ncbi.nlm.nih.gov/21208982/)
 
   > Heng Li, Tabix: fast retrieval of sequence features from generic TAB-delimited files, Bioinformatics, Volume 27, Issue 5, 1 March 2011, Pages 718–719. doi: 10.1093/bioinformatics/btq671. PubMed PMID: 21208982; PubMed Central PMCID: PMC3042176.
 

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -9,3 +9,55 @@ report_section_order:
     order: -1001
 
 export_plots: true
+
+# Run only these modules
+run_modules:
+  - custom_content
+  - fastqc
+  - star
+  - samtools
+  - picard
+  - gatk
+  - snpeff
+  - vep
+
+# Order of modules
+module_order:
+  - fastqc:
+      name: "FastQC (raw)"
+      path_filters:
+        - "*_val_*.zip"
+  - star:
+      name: "Read Alignment (STAR)"
+  - samtools:
+      name: "Samtools Flagstat"
+  - picard:
+      name: "GATK4 MarkDuplicates"
+      info: "Metrics generated by GATK4 MarkDuplicates"
+  - qualimap:
+      name: "Qualimap"
+  - gatk:
+      name: "GATK4 BQSR"
+  - snpeff:
+      name: "SNPeff"
+  - vep:
+      name: "VEP"
+
+extra_fn_clean_exts:
+  - "_val"
+
+# Don't show % Dups in the General Stats table (we have this from Picard)
+table_columns_visible:
+  fastqc:
+    percent_duplicates: False
+
+sp:
+  samtools/stats:
+    fn: "*.aligned.bam.stats"
+  samtools/flagstat:
+    fn: "*.aligned.bam.flagstat"
+  picard/markdups:
+    fn: "*.markdup.sorted.metrics"
+  snpeff:
+    contents: "SnpEff_version"
+    max_filesize: 5000000
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -114,19 +114,16 @@ def validate_unique_samples(self):
         """
         Assert that the combination of sample name and FASTQ filename is unique.
 
-        In addition to the validation, also rename the sample if more than one sample,
-        FASTQ file combination exists.
+        In addition to the validation, also rename every sample to carry a suffix of _T{n}, where n is the
+        running count of rows seen so far with that sample name, e.g., multiple runs per experiment.
 
         """
         assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique."
-        if len({pair[0] for pair in self._seen}) < len(self._seen):
-            counts = Counter(pair[0] for pair in self._seen)
-            seen = Counter()
-            for row in self.modified:
-                sample = row[self._sample_col]
-                seen[sample] += 1
-                if counts[sample] > 1:
-                    row[self._sample_col] = f"{sample}_T{seen[sample]}"
+        seen = Counter()
+        for row in self.modified:
+            sample = row[self._sample_col]
+            seen[sample] += 1
+            row[self._sample_col] = f"{sample}_T{seen[sample]}"
 
 
 def read_head(handle, num_lines=10):

diff --git a/conf/modules.config b/conf/modules.config
@@ -51,7 +51,7 @@ process {
     withName: 'GUNZIP_.*' {
         publishDir = [
             path: { "${params.outdir}/genome" },
-            mode: 'copy',
+            mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
             enabled: params.save_reference
         ]
@@ -122,7 +122,7 @@ process {
             path: { "${params.outdir}/reports"},
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enabled: !params.skip_qc
+            enabled: !params.skip_multiqc
         ]
     }
 
@@ -146,9 +146,9 @@ process {
             params.save_unaligned ? '--outReadsUnmapped Fastx' : '',
             params.read_length ? "--sjdbOverhang ${params.read_length - 1}" : '',
             params.star_twopass ? '--twopassMode Basic' : '',
-            params.star_limitBAMsortRAM > 0 ? "--limitBAMsortRAM ${params.star_limitBAMsortRAM}" : "",
-            params.star_outBAMsortingBinsN > 0 ? "--outBAMsortingBinsN ${params.star_outBAMsortingBinsN}" : "",
-            params.star_limitOutSJcollapsed > 0 ? "--limitOutSJcollapsed ${params.star_limitOutSJcollapsed}" : ""
+            params.star_max_memory_bamsort > 0 ? "--limitBAMsortRAM ${params.star_max_memory_bamsort}" : "",
+            params.star_bins_bamsort > 0 ? "--outBAMsortingBinsN ${params.star_bins_bamsort}" : "",
+            params.star_max_collapsed_junc > 0 ? "--limitOutSJcollapsed ${params.star_max_collapsed_junc}" : ""
         ].join(' ').trim()
         publishDir = [
             [
@@ -206,7 +206,7 @@ process {
             '--SUBDIVISION_MODE BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW',
             '--UNIQUE true',
             '--SORT true',
-            params.scatter_count ? "--SCATTER_COUNT $params.scatter_count" : ''
+            params.gatk_interval_scatter_count ? "--SCATTER_COUNT $params.gatk_interval_scatter_count" : ''
         ].join(' ').trim()
         publishDir  = [ enabled: false ]
     }
@@ -261,7 +261,7 @@ process {
             publishDir  = [
                 path: { "${params.outdir}/reports/stats/${meta.id}" },
                 mode: params.publish_dir_mode,
-                enabled: !params.skip_qc,
+                enabled: !params.skip_multiqc,
                 pattern: "*.{stats,flagstat}"
             ]
     }
@@ -311,7 +311,7 @@ process {
     withName: GATK4_HAPLOTYPECALLER {
         ext.args    = [
             '--dont-use-soft-clipped-bases',
-            params.stand_call_conf ? "--standard-min-confidence-threshold-for-calling $params.stand_call_conf" : '',
+            params.gatk_hc_call_conf ? "--standard-min-confidence-threshold-for-calling $params.gatk_hc_call_conf" : '',
             params.bam_csi_index ? "--create-output-variant-index false" : ""
         ].join(' ').trim()
         publishDir  = [ enabled: false ]
@@ -340,10 +340,10 @@ process {
     withName: GATK4_VARIANTFILTRATION {
         ext.prefix = {"${meta.id}.haplotypecaller.filtered"}
         ext.args    = [
-            params.window ? "--window $params.window" : '',
-            params.cluster ? "--cluster $params.cluster" : '',
-            params.fs_filter ? "--filter-name \"FS\" --filter \"FS > $params.fs_filter\" " : '',
-            params.qd_filter ? "--filter-name \"QD\" --filter \"QD < $params.qd_filter\" " : '',
+            params.gatk_vf_window_size ? "--window $params.gatk_vf_window_size" : '',
+            params.gatk_vf_cluster_size ? "--cluster $params.gatk_vf_cluster_size" : '',
+            params.gatk_vf_fs_filter ? "--filter-name \"FS\" --filter \"FS > $params.gatk_vf_fs_filter\" " : '',
+            params.gatk_vf_qd_filter ? "--filter-name \"QD\" --filter \"QD < $params.gatk_vf_qd_filter\" " : '',
         ].join(' ').trim()
         publishDir  = [
             path: { "${params.outdir}/variant_calling/${meta.id}" },
@@ -366,7 +366,7 @@ process {
             mode: params.publish_dir_mode,
             path: { "${params.outdir}/reports/EnsemblVEP/${meta.id}" },
             pattern: "*html",
-            enabled: !params.skip_qc
+            enabled: !params.skip_multiqc
         ]
     }
 
@@ -380,7 +380,7 @@ process {
             path: { "${params.outdir}/reports/SnpEff/${meta.id}" },
             pattern: "*.{csv,html,txt}",
             saveAs: { params.annotate_tools.contains('snpeff') ? it : null },
-            enabled: !params.skip_qc
+            enabled: !params.skip_multiqc
         ]
     }
 

diff --git a/conf/test.config b/conf/test.config
@@ -20,7 +20,7 @@ params {
     max_time   = '6.h'
 
     // Input data
-    input = "${baseDir}/tests/csv/1.0/samplesheet.csv"
+    input = "https://raw.githubusercontent.com/nf-core/test-datasets/rnavar/samplesheet/v1.0/samplesheet.csv"
 
     // Genome references
     genome              = 'WBcel235'

diff --git a/docs/usage.md b/docs/usage.md
@@ -130,18 +130,18 @@ GATK best practices has been followed in this pipeline for RNA analysis, hence i
 
 > **NB:** Base recalibration can be turned off using the `--skip_baserecalibration true` option. This is useful when you are analyzing data from non-model organisms for which no known variant datasets exist.
 
-`GATK SplitNCigarReads` is very time consuming step, therefore we made an attempt to break the GTF file into multiple chunks (scatters) using `GATK IntervalListTools` to run the process independently on each chunk in a parallel way to speed up the analysis. The default number of splits is set to 25, that means the GTF file is split into 25 smaller files and run `GATK SplitNCigarReads` on each of them in parallel. You can modify the number of splits using parameter `--scatter_count`.
+`GATK SplitNCigarReads` is a very time-consuming step, so the pipeline breaks the GTF file into multiple chunks (scatters) using `GATK IntervalListTools` and runs the process independently on each chunk in parallel to speed up the analysis. The default number of splits is 25, meaning the GTF file is split into 25 smaller files and `GATK SplitNCigarReads` is run on each of them in parallel. You can modify the number of splits using the parameter `--gatk_interval_scatter_count`.
 
 ## Variant calling and filtering
 
-`GATK HaplotypeCaller` is used for variant calling with default minimum phred-scaled confidence threshold as 20. This value can be changed using paramerter `--stand_call_conf`.
+`GATK HaplotypeCaller` is used for variant calling with a default minimum phred-scaled confidence threshold of 20. This value can be changed using the parameter `--gatk_hc_call_conf`.
 
 The pipeline runs a hard-filtering step on the variants by default. It does not filter out any variants; rather, it flags them in the FILTER column of the VCF, i.e. PASS or other flags such as FS, QD, SnpCluster, etc. The following are the default filter criteria; each can be changed using the respective parameter.
 
-- `--cluster` is set to 3. It is the number of SNPs which make up a cluster.
-- `--window` is set to 35. The window size (in bases) in which to evaluate clustered SNPs.
-- `--fs_filter` is set to 30.0. Filter based on FisherStrand > 30.0. It is the Phred-scaled probability that there is strand bias at the site.
-- `--qd_filter` is set to 2.0 meaning filter variants if Quality By Depth filter is < 2.0.
+- `--gatk_vf_cluster_size` is set to 3. It is the number of SNPs which make up a cluster.
+- `--gatk_vf_window_size` is set to 35. The window size (in bases) in which to evaluate clustered SNPs.
+- `--gatk_vf_fs_filter` is set to 30.0. Filter based on FisherStrand > 30.0. It is the Phred-scaled probability that there is strand bias at the site.
+- `--gatk_vf_qd_filter` is set to 2.0, meaning variants are flagged if Quality By Depth is < 2.0.
 
 Variant filtering is an optional step. You can skip it using `--skip_variantfiltration` parameter.
 

diff --git a/nextflow.config b/nextflow.config
@@ -12,72 +12,72 @@ params {
     // Pipeline options
 
     // Mandatory option
-    input                      = null // sample sheet
+    input                           = null      // sample sheet
 
     // Genome and reference options
-    genome                     = 'GRCh38'
-    igenomes_base              = 's3://ngi-igenomes/igenomes'
-    igenomes_ignore            = false
-    save_reference             = false
-    save_merged_fastq          = false
+    genome                          = 'GRCh38'
+    igenomes_base                   = 's3://ngi-igenomes/igenomes'
+    igenomes_ignore                 = false
+    save_reference                  = false
+    save_merged_fastq               = false
 
     // Sequence read information
-    read_length                = 150 // Required for STAR to build index and align reads
+    read_length                     = 150 // Required for STAR to build index and align reads
 
     // Alignment
-    aligner                    = 'star' // Only STAR is currently supported.
-    star_twopass               = true
-    star_ignore_sjdbgtf        = false // Ignore GTF file while creating index or alignment by STAR
-    star_limitBAMsortRAM       = 0
-    star_outBAMsortingBinsN    = 50
-    star_limitOutSJcollapsed   = 1000000
-    seq_center                 = null
-    seq_platform               = 'illumina' // Required for preparing for BAM headers for GATK to work
-    bam_csi_index              = false
-    save_unaligned             = false
-    save_align_intermeds       = false
+    aligner                         = 'star'    // Only STAR is currently supported.
+    star_twopass                    = true
+    star_ignore_sjdbgtf             = false     // Ignore GTF file while creating index or alignment by STAR
+    star_max_memory_bamsort         = 0         // STAR parameter limitBAMsortRAM to specify maximum RAM for sorting BAM
+    star_bins_bamsort               = 50        // STAR parameter outBAMsortingBinsN to specify number of bins for sorting BAM
+    star_max_collapsed_junc         = 1000000   // STAR parameter limitOutSJcollapsed to specify max number of collapsed junctions
+    seq_center                      = null
+    seq_platform                    = 'illumina' // Required for preparing for BAM headers for GATK to work
+    bam_csi_index                   = false
+    save_unaligned                  = false
+    save_align_intermeds            = false
 
     // Preprocessing of alignment
-    remove_duplicates          = false
+    remove_duplicates               = false
 
     // Variant calling
-    no_intervals               = false
+    no_intervals                    = false
 
     // Variant annotation
-    annotate_tools             = null // List of annotation tools to run - snpeff or vep or merge
-    annotation_cache           = false // Annotation cache disabled
-    cadd_cache                 = null // CADD cache disabled
-    cadd_indels                = null // No CADD InDels file
-    cadd_indels_tbi            = null // No CADD InDels index
-    cadd_wg_snvs               = null // No CADD SNVs file
-    cadd_wg_snvs_tbi           = null // No CADD SNVs index
-    genesplicer                = null // genesplicer disabled within VEP
-    snpeff_cache               = null // No directory for snpEff cache
-    snpeff_db                  = null // No default db for snpeff
-    vep_cache                  = null // No directory for VEP cache
-    vep_genome                 = null // No default genome for VEP
-    vep_cache_version          = null // No default cache version for VEP
+    annotate_tools                  = null  // List of annotation tools to run - snpeff or vep or merge
+    annotation_cache                = false // Annotation cache disabled
+    cadd_cache                      = null // CADD cache disabled
+    cadd_indels                     = null // No CADD InDels file
+    cadd_indels_tbi                 = null // No CADD InDels index
+    cadd_wg_snvs                    = null // No CADD SNVs file
+    cadd_wg_snvs_tbi                = null // No CADD SNVs index
+    genesplicer                     = null // genesplicer disabled within VEP
+    snpeff_cache                    = null // No directory for snpEff cache
+    snpeff_db                       = null // No default db for snpeff
+    vep_cache                       = null // No directory for VEP cache
+    vep_genome                      = null // No default genome for VEP
+    vep_cache_version               = null // No default cache version for VEP
 
     // Skip steps
-    skip_baserecalibration     = false
-    skip_intervallisttools     = false
-    skip_variantfiltration     = false
-    skip_variantannotation     = false
+    skip_baserecalibration          = false
+    skip_intervallisttools          = false
+    skip_variantfiltration          = false
+    skip_variantannotation          = false
 
     // GATK intervallist parameters
-    scatter_count              = 25
+    gatk_interval_scatter_count     = 25
 
     //GATK haplotypecaller parameters
-    stand_call_conf            = 20
+    gatk_hc_call_conf               = 20
 
     //GATK variant filter parameters
-    window                     = 35
-    cluster                    = 3
-    fs_filter                  = 30.0
-    qd_filter                  = 2.0
+    gatk_vf_window_size             = 35
+    gatk_vf_cluster_size            = 3
+    gatk_vf_fs_filter               = 30.0
+    gatk_vf_qd_filter               = 2.0
 
     // QC
-    skip_qc                    = false
+    skip_multiqc                    = false
 
     // MultiQC options
     multiqc_config             = null