Skip to content

Commit

Permalink
Merge pull request #56 from praveenraj2018/dev
Browse files Browse the repository at this point in the history
Addressed the review comments
  • Loading branch information
maxulysse authored Jun 16, 2022
2 parents ed9e2a3 + 1ea2855 commit e612b8f
Show file tree
Hide file tree
Showing 24 changed files with 393 additions and 223 deletions.
2 changes: 1 addition & 1 deletion CITATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
- [Tabix]()
- [Tabix](https://pubmed.ncbi.nlm.nih.gov/21208982/)

> Heng Li, Tabix: fast retrieval of sequence features from generic TAB-delimited files, Bioinformatics, Volume 27, Issue 5, 1 March 2011, Pages 718–719. doi: 10.1093/bioinformatics/btq671. PubMed PMID: 21208982; PubMed Central PMCID: PMC3042176.
Expand Down
52 changes: 52 additions & 0 deletions assets/multiqc_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,55 @@ report_section_order:
order: -1001

export_plots: true

# Run only these modules
run_modules:
- custom_content
- fastqc
- star
- samtools
- picard
- gatk
- snpeff
- vep

# Order of modules
module_order:
- fastqc:
name: "FastQC (raw)"
path_filters:
- "*_val_*.zip"
- star:
name: "Read Alignment (STAR)"
- samtools:
name: "Samtools Flagstat"
- picard:
name: "GATK4 MarkDuplicates"
info: "Metrics generated either by GATK4 MarkDuplicates"
- qualimap:
name: "Qualimap"
- gatk:
name: "GATK4 BQSR"
- snpeff:
name: "SNPeff"
- vep:
name: "VEP"

extra_fn_clean_exts:
- "_val"

# Don't show % Dups in the General Stats table (we have this from Picard)
table_columns_visible:
fastqc:
percent_duplicates: False

sp:
samtools/stats:
fn: "*.aligned.bam.stats"
samtools/flagstat:
fn: "*.aligned.bam.flagstat"
picard/markdups:
fn: "*.markdup.sorted.metrics"
snpeff:
contents: "SnpEff_version"
max_filesize: 5000000
17 changes: 7 additions & 10 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,19 +114,16 @@ def validate_unique_samples(self):
"""
Assert that the combination of sample name and FASTQ filename is unique.
In addition to the validation, also rename the sample if more than one sample,
FASTQ file combination exists.
In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the
number of times the same sample exists, but with different FASTQ files, e.g., multiple runs per experiment.
"""
assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique."
if len({pair[0] for pair in self._seen}) < len(self._seen):
counts = Counter(pair[0] for pair in self._seen)
seen = Counter()
for row in self.modified:
sample = row[self._sample_col]
seen[sample] += 1
if counts[sample] > 1:
row[self._sample_col] = f"{sample}_T{seen[sample]}"
seen = Counter()
for row in self.modified:
sample = row[self._sample_col]
seen[sample] += 1
row[self._sample_col] = f"{sample}_T{seen[sample]}"


def read_head(handle, num_lines=10):
Expand Down
28 changes: 14 additions & 14 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ process {
withName: 'GUNZIP_.*' {
publishDir = [
path: { "${params.outdir}/genome" },
mode: 'copy',
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
enabled: params.save_reference
]
Expand Down Expand Up @@ -122,7 +122,7 @@ process {
path: { "${params.outdir}/reports"},
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
enabled: !params.skip_qc
enabled: !params.skip_multiqc
]
}

Expand All @@ -146,9 +146,9 @@ process {
params.save_unaligned ? '--outReadsUnmapped Fastx' : '',
params.read_length ? "--sjdbOverhang ${params.read_length - 1}" : '',
params.star_twopass ? '--twopassMode Basic' : '',
params.star_limitBAMsortRAM > 0 ? "--limitBAMsortRAM ${params.star_limitBAMsortRAM}" : "",
params.star_outBAMsortingBinsN > 0 ? "--outBAMsortingBinsN ${params.star_outBAMsortingBinsN}" : "",
params.star_limitOutSJcollapsed > 0 ? "--limitOutSJcollapsed ${params.star_limitOutSJcollapsed}" : ""
params.star_max_memory_bamsort > 0 ? "--limitBAMsortRAM ${params.star_max_memory_bamsort}" : "",
params.star_bins_bamsort > 0 ? "--outBAMsortingBinsN ${params.star_bins_bamsort}" : "",
params.star_max_collapsed_junc > 0 ? "--limitOutSJcollapsed ${params.star_max_collapsed_junc}" : ""
].join(' ').trim()
publishDir = [
[
Expand Down Expand Up @@ -206,7 +206,7 @@ process {
'--SUBDIVISION_MODE BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW',
'--UNIQUE true',
'--SORT true',
params.scatter_count ? "--SCATTER_COUNT $params.scatter_count" : ''
params.gatk_interval_scatter_count ? "--SCATTER_COUNT $params.gatk_interval_scatter_count" : ''
].join(' ').trim()
publishDir = [ enabled: false ]
}
Expand Down Expand Up @@ -261,7 +261,7 @@ process {
publishDir = [
path: { "${params.outdir}/reports/stats/${meta.id}" },
mode: params.publish_dir_mode,
enabled: !params.skip_qc,
enabled: !params.skip_multiqc,
pattern: "*.{stats,flagstat}"
]
}
Expand Down Expand Up @@ -311,7 +311,7 @@ process {
withName: GATK4_HAPLOTYPECALLER {
ext.args = [
'--dont-use-soft-clipped-bases',
params.stand_call_conf ? "--standard-min-confidence-threshold-for-calling $params.stand_call_conf" : '',
params.gatk_hc_call_conf ? "--standard-min-confidence-threshold-for-calling $params.gatk_hc_call_conf" : '',
params.bam_csi_index ? "--create-output-variant-index false" : ""
].join(' ').trim()
publishDir = [ enabled: false ]
Expand Down Expand Up @@ -340,10 +340,10 @@ process {
withName: GATK4_VARIANTFILTRATION {
ext.prefix = {"${meta.id}.haplotypecaller.filtered"}
ext.args = [
params.window ? "--window $params.window" : '',
params.cluster ? "--cluster $params.cluster" : '',
params.fs_filter ? "--filter-name \"FS\" --filter \"FS > $params.fs_filter\" " : '',
params.qd_filter ? "--filter-name \"QD\" --filter \"QD < $params.qd_filter\" " : '',
params.gatk_vf_window_size ? "--window $params.gatk_vf_window_size" : '',
params.gatk_vf_cluster_size ? "--cluster $params.gatk_vf_cluster_size" : '',
params.gatk_vf_fs_filter ? "--filter-name \"FS\" --filter \"FS > $params.gatk_vf_fs_filter\" " : '',
params.gatk_vf_qd_filter ? "--filter-name \"QD\" --filter \"QD < $params.gatk_vf_qd_filter\" " : '',
].join(' ').trim()
publishDir = [
path: { "${params.outdir}/variant_calling/${meta.id}" },
Expand All @@ -366,7 +366,7 @@ process {
mode: params.publish_dir_mode,
path: { "${params.outdir}/reports/EnsemblVEP/${meta.id}" },
pattern: "*html",
enabled: !params.skip_qc
enabled: !params.skip_multiqc
]
}

Expand All @@ -380,7 +380,7 @@ process {
path: { "${params.outdir}/reports/SnpEff/${meta.id}" },
pattern: "*.{csv,html,txt}",
saveAs: { params.annotate_tools.contains('snpeff') ? it : null },
enabled: !params.skip_qc
enabled: !params.skip_multiqc
]
}

Expand Down
2 changes: 1 addition & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ params {
max_time = '6.h'

// Input data
input = "${baseDir}/tests/csv/1.0/samplesheet.csv"
input = "https://raw.githubusercontent.com/nf-core/test-datasets/rnavar/samplesheet/v1.0/samplesheet.csv"

// Genome references
genome = 'WBcel235'
Expand Down
12 changes: 6 additions & 6 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,18 +130,18 @@ GATK best practices has been followed in this pipeline for RNA analysis, hence i

> **NB:** Base recalibration can be turned off using the `--skip_baserecalibration true` option. This is useful when you are analyzing data from non-model organisms for which no known variant datasets exist.
`GATK SplitNCigarReads` is very time consuming step, therefore we made an attempt to break the GTF file into multiple chunks (scatters) using `GATK IntervalListTools` to run the process independently on each chunk in a parallel way to speed up the analysis. The default number of splits is set to 25, that means the GTF file is split into 25 smaller files and run `GATK SplitNCigarReads` on each of them in parallel. You can modify the number of splits using parameter `--scatter_count`.
`GATK SplitNCigarReads` is a very time-consuming step, therefore we made an attempt to break the GTF file into multiple chunks (scatters) using `GATK IntervalListTools` and run the process independently on each chunk in parallel to speed up the analysis. The default number of splits is set to 25, which means the GTF file is split into 25 smaller files and `GATK SplitNCigarReads` is run on each of them in parallel. You can modify the number of splits using the parameter `--gatk_interval_scatter_count`.

## Variant calling and filtering

`GATK HaplotypeCaller` is used for variant calling with default minimum phred-scaled confidence threshold as 20. This value can be changed using paramerter `--stand_call_conf`.
`GATK HaplotypeCaller` is used for variant calling with a default minimum phred-scaled confidence threshold of 20. This value can be changed using the parameter `--gatk_hc_call_conf`.

The pipeline runs a hard-filtering step on the variants by default. It does not filter out any variants; rather, it flags each variant in the FILTER column of the VCF, either as PASS or with a filter flag such as FS, QD, SnpCluster, etc. The following are the default filter criteria; however, they can be changed using the respective parameters.

- `--cluster` is set to 3. It is the number of SNPs which make up a cluster.
- `--window` is set to 35. The window size (in bases) in which to evaluate clustered SNPs.
- `--fs_filter` is set to 30.0. Filter based on FisherStrand > 30.0. It is the Phred-scaled probability that there is strand bias at the site.
- `--qd_filter` is set to 2.0 meaning filter variants if Quality By Depth filter is < 2.0.
- `--gatk_vf_cluster_size` is set to 3. It is the number of SNPs which make up a cluster.
- `--gatk_vf_window_size` is set to 35. The window size (in bases) in which to evaluate clustered SNPs.
- `--gatk_vf_fs_filter` is set to 30.0. Filter based on FisherStrand > 30.0. It is the Phred-scaled probability that there is strand bias at the site.
- `--gatk_vf_qd_filter` is set to 2.0 meaning filter variants if Quality By Depth filter is < 2.0.

Variant filtering is an optional step. You can skip it using `--skip_variantfiltration` parameter.

Expand Down
88 changes: 44 additions & 44 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -12,72 +12,72 @@ params {
// Pipeline options

// Mandatory option
input = null // sample sheet
input = null // sample sheet

// Genome and reference options
genome = 'GRCh38'
igenomes_base = 's3://ngi-igenomes/igenomes'
igenomes_ignore = false
save_reference = false
save_merged_fastq = false
genome = 'GRCh38'
igenomes_base = 's3://ngi-igenomes/igenomes'
igenomes_ignore = false
save_reference = false
save_merged_fastq = false

// Sequence read information
read_length = 150 // Required for STAR to build index and align reads
read_length = 150 // Required for STAR to build index and align reads

// Alignment
aligner = 'star' // Only STAR is currently supported.
star_twopass = true
star_ignore_sjdbgtf = false // Ignore GTF file while creating index or alignment by STAR
star_limitBAMsortRAM = 0
star_outBAMsortingBinsN = 50
star_limitOutSJcollapsed = 1000000
seq_center = null
seq_platform = 'illumina' // Required for preparing for BAM headers for GATK to work
bam_csi_index = false
save_unaligned = false
save_align_intermeds = false
aligner = 'star' // Only STAR is currently supported.
star_twopass = true
star_ignore_sjdbgtf = false // Ignore GTF file while creating index or alignment by STAR
star_max_memory_bamsort = 0 // STAR parameter limitBAMsortRAM to specify maximum RAM for sorting BAM
star_bins_bamsort = 50 // STAR parameter outBAMsortingBinsN to specify number of bins for sorting BAM
star_max_collapsed_junc = 1000000 // STAR parameter limitOutSJcollapsed to specify max number of collapsed junctions
seq_center = null
seq_platform = 'illumina' // Required for preparing for BAM headers for GATK to work
bam_csi_index = false
save_unaligned = false
save_align_intermeds = false

// Preprocessing of alignment
remove_duplicates = false
remove_duplicates = false

// Variant calling
no_intervals = false
no_intervals = false

// Variant annotation
annotate_tools = null // List of annotation tools to run - snpeff or vep or merge
annotation_cache = false // Annotation cache disabled
cadd_cache = null // CADD cache disabled
cadd_indels = null // No CADD InDels file
cadd_indels_tbi = null // No CADD InDels index
cadd_wg_snvs = null // No CADD SNVs file
cadd_wg_snvs_tbi = null // No CADD SNVs index
genesplicer = null // genesplicer disabled within VEP
snpeff_cache = null // No directory for snpEff cache
snpeff_db = null // No default db for snpeff
vep_cache = null // No directory for VEP cache
vep_genome = null // No default genome for VEP
vep_cache_version = null // No default cache version for VEP
annotate_tools = null // List of annotation tools to run - snpeff or vep or merge
annotation_cache = false // Annotation cache disabled
cadd_cache = null // CADD cache disabled
cadd_indels = null // No CADD InDels file
cadd_indels_tbi = null // No CADD InDels index
cadd_wg_snvs = null // No CADD SNVs file
cadd_wg_snvs_tbi = null // No CADD SNVs index
genesplicer = null // genesplicer disabled within VEP
snpeff_cache = null // No directory for snpEff cache
snpeff_db = null // No default db for snpeff
vep_cache = null // No directory for VEP cache
vep_genome = null // No default genome for VEP
vep_cache_version = null // No default cache version for VEP

// Skip steps
skip_baserecalibration = false
skip_intervallisttools = false
skip_variantfiltration = false
skip_variantannotation = false
skip_baserecalibration = false
skip_intervallisttools = false
skip_variantfiltration = false
skip_variantannotation = false

// GATK intervallist parameters
scatter_count = 25
gatk_interval_scatter_count = 25

//GATK haplotypecaller parameters
stand_call_conf = 20
gatk_hc_call_conf = 20

//GATK variant filter parameters
window = 35
cluster = 3
fs_filter = 30.0
qd_filter = 2.0
gatk_vf_window_size = 35
gatk_vf_cluster_size = 3
gatk_vf_fs_filter = 30.0
gatk_vf_qd_filter = 2.0

// QC
skip_qc = false
skip_multiqc = false

// MultiQC options
multiqc_config = null
Expand Down
Loading

0 comments on commit e612b8f

Please sign in to comment.