diff --git a/CHANGELOG.md b/CHANGELOG.md index da46d0c..06df372 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## v2.3.0-dev - [date] +### `Added` +- [#44](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/44) Make interval file optional in CombineGVCFs process +- [#44](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/44) Decouple the interval file parameter from the `broad` parameter ## v2.2.0-dev diff --git a/conf/test.config b/conf/test.config index bf303cd..b6f0c09 100644 --- a/conf/test.config +++ b/conf/test.config @@ -33,7 +33,7 @@ params { // Reference data referenceGenome = "data-test/reference/Homo_sapiens_assembly38/chr22" referenceGenomeFasta = "chr22.fa" - intervalsFile = "testInterval22.list" + intervalsFile = "data-test/reference/broad/testInterval22.list" broad = "data-test/reference/broad" //Vep diff --git a/docs/reference_data.md b/docs/reference_data.md index f096b44..b97aaa0 100644 --- a/docs/reference_data.md +++ b/docs/reference_data.md @@ -12,23 +12,32 @@ This directory should contain the following files: - The reference genome FASTA file index (e.g., `Homo_sapiens_assembly38.fasta.fai`). Its location will be automatically derived by appending `.fai` to the `referenceGenomeFasta` parameter. - The reference genome dictionary file (e.g., `Homo_sapiens_assembly38.dict`). Its location will be automatically derived by replacing the `.fasta` file extension of the `referenceGenomeFasta` parameter with `.dict`. - ## Broad reference data (VQSR) -The `broad` parameter specifies the directory containing the reference data files for VQSR. We chose the name `broad` because -this data is from the [Broad Institute](https://www.broadinstitute.org/), a collaborative research institution known for its contributions to genomics and biomedical research. 
+The `broad` parameter specifies the directory containing the reference data files for VQSR. +Note that the VQSR step applies only to whole genome data, so you need to specify the `broad` parameter only if you have whole genome data. + +We chose the name `broad` because this data is from the [Broad Institute](https://www.broadinstitute.org/), a collaborative research institution known +for its contributions to genomics and biomedical research. -Files can be downloaded using this link: [GATK Ressource Bundle](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0?pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=true) +Files can be downloaded using this link: [GATK Resource Bundle](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0/) The broad directory must contain the following files: -- *Intervals Files*: The genomic interval(s) over which we operate (WES, WGS or targeted sequencing). The filename of this list must be defined with the intervalsFile parameter (e.g., `wgs_calling_regions.hg38.interval_list`). For more details, see [Gatk documentation](https://gatk.broadinstitute.org/hc/en-us/articles/360035531852-Intervals-and-interval-lists). -- Highly validated variance ressources currently required by VQSR. ***These are currently hard coded in the pipeline***: - HapMap file : hapmap_3.3.hg38.vcf.gz - 1000G omni2.5 file : 1000G_omni2.5.hg38.vcf.gz - 1000G reference file : 1000G_phase1.snps.high_confidence.hg38.vcf.gz - SNP database : Homo_sapiens_assembly38.dbsnp138.vcf.gz +These are all highly validated variant resources currently required by VQSR. +***The file names are not configurable and are currently hard coded in the pipeline***. + +Extra settings (ex: resource prior probabilities, tranches, etc.) required to run the different VQSR steps are injected through pipeline parameters or hard coded in the vqsr modules. 
The values chosen for these settings are based on NIH [Biowulf](https://hpc.nih.gov/training/gatk_tutorial/vqsr.html) + +## Interval file +The `intervalsFile` parameter specifies the path to an interval file (e.g., `broad/wgs_calling_regions.hg38.interval_list`). + +If specified, the given interval file will be used to define the genomic interval(s) over which we operate (WES, WGS or targeted sequencing). +For more details, see [GATK documentation](https://gatk.broadinstitute.org/hc/en-us/articles/360035531852-Intervals-and-interval-lists). -Extra settings (ex: resource prior probabilities, tranches, etc.) required to run the different VQSR steps are injected through pipeline parameters or hard coded in the vqsr modules. The values chosen for these settings are based on NIH [Biowulf](https://hpc.nih.gov/training/gatk_tutorial/vqsr.html) ## VEP Cache Directory The `vepCache` parameter specifies the directory for the vep cache. It is only required if `vep` is specified via the @@ -96,8 +105,8 @@ analysis file should contain only the `analysis` section. 
| --- | --- | --- | | `referenceGenome` | _Required_ | Path to the directory containing the reference genome data | | `referenceGenomeFasta` | _Required_ | Filename of the reference genome .fasta file, within the specified `referenceGenome` directory | -| `broad` | _Required_ | Path to the directory containing Broad reference data | -| `intervalsFile` | _Required_ | Filename of the genome intervals list, within the specified `broad` directory | +| `broad` | _Optional_ | Path to the directory containing Broad reference data (for VQSR) | +| `intervalsFile` | _Optional_ | Path to the file containing the genome intervals list on which to operate | | `vepCache` | _Optional_ | Path to the vep cache data directory | | `exomiser_data_dir` | _Optional_ | Path to the exomiser reference data directory | | `exomiser_genome` | _Optional_ | Genome assembly version to be used by exomiser(`hg19` or `hg38`) | diff --git a/docs/usage.md b/docs/usage.md index 915a793..d360c17 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -164,8 +164,8 @@ Parameters summary | `outdir` | _Required_ | Path to the output directoy | | `referenceGenome` | _Required_ | Path to the directory containing the reference genome data | | `referenceGenomeFasta` | _Required_ | Filename of the reference genome .fasta file, within the specified `referenceGenome` directory | -| `broad` | _Required_ | Path to the directory containing Broad reference data | -| `intervalsFile` | _Required_ | Filename of the genome intervals list, within the specified `broad` directory | +| `broad` | _Optional_ | Path to the directory containing Broad reference data (for VQSR) | +| `intervalsFile` | _Optional_ | Path to the file containing the genome intervals list on which to operate | | `tools` | _Optional_ | Additional tools to run separated by commas. 
Supported tools are `vep` and `exomiser` | | `vepCache` | _Optional_ | Path to the vep cache data directory | | `exomiser_data_dir` | _Optional_ | Path to the exomiser reference data directory | diff --git a/modules/local/combine_gvcfs/main.nf b/modules/local/combine_gvcfs/main.nf index 5e8fd9f..064349d 100644 --- a/modules/local/combine_gvcfs/main.nf +++ b/modules/local/combine_gvcfs/main.nf @@ -26,6 +26,7 @@ process COMBINEGVCFS { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def input_list = vcf.collect{"--variant $it"}.join(' ') + def interval_options = interval ? "--intervals $interval" : "" def avail_mem = 3072 if (!task.memory) { @@ -40,7 +41,7 @@ process COMBINEGVCFS { --output ${prefix}.combined.g.vcf.gz \\ --reference ${fasta} \\ --tmp-dir . \\ - --intervals $interval \\ + $interval_options \\ $args cat <<-END_VERSIONS > versions.yml diff --git a/nextflow_schema.json b/nextflow_schema.json index c831783..098b408 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -53,14 +53,14 @@ }, "broad": { "type": "string", - "description": "Directory containing the references for vqsr and the intervalsFile", - "help_text": "Path to the directory containing 5 important files: \\n1. The intervalsFile whose name is defined in the intervalsFile parameter\\n2. The Hapmap file for vqsr training\\n3. The omni2.5 file for vqsr training\\n4. The 1000G SNP reference file for vqsr training\\n5. The dbsnp database for vqsr training", + "description": "Directory containing the references for vqsr", + "help_text": "Path to the directory containing 4 important files: \\n1. The Hapmap file for vqsr training\\n2. The omni2.5 file for vqsr training\\n3. The 1000G SNP reference file for vqsr training\\n4. The dbsnp database for vqsr training", "format": "directory-path" }, "intervalsFile": { "type": "string", - "description": "Namefile of the genome interval we want to use", - "help_text": "Namefile of the genome interval. 
Used during the CombineGVCFs step to indicate the regions of interest", + "description": "Path to interval file", + "help_text": "Path to an interval file. If specified, will be used during the CombineGVCFs step to indicate the regions of interest.", "format": "file-path" }, "referenceGenome": { @@ -77,8 +77,6 @@ } }, "required": [ - "broad", - "intervalsFile", "referenceGenome", "referenceGenomeFasta" ] diff --git a/subworkflows/local/vqsr/main.nf b/subworkflows/local/vqsr/main.nf index 6f8bc6f..e5e1187 100644 --- a/subworkflows/local/vqsr/main.nf +++ b/subworkflows/local/vqsr/main.nf @@ -13,8 +13,10 @@ workflow VQSR { input // channel: (val(meta), [.vcf.gz, .vcf.gz.tbi]) main: referenceGenome = file(params.referenceGenome) - broad = file(params.broad) + //If VQSR is not used (i.e. only whole exome data), the broad parameter may be omitted. + //This code, however, will be executed anyway, so we need to handle this scenario. + broad = params.broad? file(params.broad): "" outputSNP = variantRecalibratorSNP(input, referenceGenome, broad) | join(input) diff --git a/workflows/postprocessing.nf b/workflows/postprocessing.nf index 82ee85d..c9414fe 100644 --- a/workflows/postprocessing.nf +++ b/workflows/postprocessing.nf @@ -138,8 +138,7 @@ workflow POSTPROCESSING { def referenceGenome = file(params.referenceGenome) def pathReferenceGenomeFasta = file(params.referenceGenome + "/" + params.referenceGenomeFasta) def pathReferenceGenomeFai = file(pathReferenceGenomeFasta + ".fai") - def broad = file(params.broad) - def pathIntervalFile = file(params.broad + "/" + params.intervalsFile) + def pathIntervalFile = params.intervalsFile? file(params.intervalsFile) : [] //The empty list is used if we don't want to use an interval file def pathReferenceDict = file(params.referenceGenome + "/" + params.referenceGenomeFasta.substring(0,params.referenceGenomeFasta.indexOf(".")) + ".dict") file(params.outdir).mkdirs()