Merge pull request #46 from Ferlab-Ste-Justine/feat/CLIN-36110-allow-…

…to-skip-exclude-mnps feat: CLIN-36110 allow to skip exclude mnps
Ferlab-Ste-Justine · Dec 9, 2024 · e0667e4 · e0667e4
2 parents 33db042 + 4608546
commit e0667e4
Show file tree

Hide file tree

Showing 8 changed files with 51 additions and 18 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#44](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/44) Make interval file optional in GenotypeGVCFs process
 - [#44](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/44) Decouple the interval file parameter from the broad
 - [#45](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/45) Allow to add dbsnp ids to output vcf files
+- [#46](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/46) Allow to skip the exclude mnp step
 
 ### `Known issues`
 - The nf-core modules that we are using have a potential performance flaw. Typically, the regex used to describe the output files also matches the input files (ex: "*.vcf"), which can cause unnecessary file transfers. This has already proven to cause issues on fusion. One fix could be to transfer the whole modules to local to perform the small change necessary to fix this.

diff --git a/conf/modules.config b/conf/modules.config
@@ -78,9 +78,8 @@ process {
         ext.prefix =  {"variants.${meta.id}.vep"}
     }
 
-    // Currently, tabix is used only for vep output in the post-processing pipeline.
-    // Consider creating a vep subworkflow to group vep and tabix steps, making the process name more specific.
-    withName: 'FERLAB_POSTPROCESSING:POSTPROCESSING:tabix' {
+    // To publish the vep index file in the same output folder as the vep output
+    withName: 'vep_tabix' {
          publishDir = [
             path: { "${params.outdir}/ensemblvep" },
             mode: params.publish_dir_mode,

diff --git a/docs/usage.md b/docs/usage.md
@@ -18,7 +18,7 @@ The samplesheet must contains the following columns at the minimum:
 - *familyId*: The identifier used for the sample family
 - *sample*: The identifier used for the sample
 - *sequencingType*: Must be either WES (Whole Exome Sequencing) or WGS (Whole Genome Sequencing)
-- *gvcf*: Path to the sample .gvcf file
+- *gvcf*: Path to the sample .gvcf.gz file
 
 Additionally, there is an optional *phenoFamily* column that can contain a .yml/.json file providing phenotype 
 information on the family in phenopacket format. This column is only necessary if using the exomiser tool.
@@ -77,6 +77,17 @@ If you wish to repeatedly use the same parameters for multiple runs, rather than
 Do not use `-c <file>` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
 :::
 
+### Skip exclude MNPs
+
+At the beginning of our workflow, we separate MNPs into individual SNPs.  
+
+You can optionally skip this step by setting the `exclude_mnps` parameter to `false` (default is `true`).
+
+Note that MNPs are not supported by the VQSR procedure, so you cannot skip this step if you have whole genome data.
+
+Additionally, if you skip this step, each input GVCF file must either have its index file (`<gvcf file>.tbi`) located next to it, or be compressed with bgzip. 
+If the index file is missing, the workflow will attempt to generate it, which only works if the input GVCF file is compressed with bgzip.
+
 
 ### Tools
 
@@ -170,6 +181,7 @@ Parameters summary
 | `intervalsFile` | _Optional_ | Path to the file containing the genome intervals list on which to operate |
 | `tools` | _Optional_ | Additional tools to run separated by commas. Supported tools are `vep` and `exomiser` |
 | `vepCache` | _Optional_ | Path to the vep cache data directory |
+| `exclude_mnps` | _Optional_ | Replace MNPs by individual SNPs (default: true). Must be true on whole genome data. |
 | `exomiser_data_dir` | _Optional_ | Path to the exomiser reference data directory |
 | `exomiser_genome` | _Optional_ | Genome assembly version to be used by exomiser(`hg19` or `hg38`) |
 | `exomiser_data_version` | _Optional_ | Exomiser data version (e.g., `2402`)|

diff --git a/modules/local/tabix.nf b/modules/local/tabix.nf
@@ -6,7 +6,7 @@ process tabix {
     tuple val(meta), path(vcfFile)
 
     output:
-    path "*.tbi"
+    tuple val(meta), path("*.tbi")
 
     script:
     def args = task.ext.args ?: ''

diff --git a/nextflow.config b/nextflow.config
@@ -38,6 +38,7 @@ params {
     exomiser_analysis_wgs = "${projectDir}/assets/exomiser/default_exomiser_WGS_analysis.yml"
 
 	//Process-specific parameters
+    exclude_mnps = true
     TSfilterSNP = '99'
     TSfilterINDEL = '99'
     hardFilters = [[name: 'QD2', expression: 'QD < 2.0'],

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -279,6 +279,12 @@
           "type": "string",
           "pattern": "^(vep|exomiser)?(,(vep|exomiser))*$",
           "description": "List of tools to use separate with comma.  Available tools: [vep, exomiser]"
+        },
+        "exclude_mnps": {
+          "type": "boolean",
+          "description": "If true, replace MNPs by individual SNPs at the beginning of the workflow",
+          "help_text": "If true (default), MNPs are separated into individual SNPs at the beginning of the workflow. Must be true on whole genome data.",
+          "default": true 
         }
       }
     },

diff --git a/subworkflows/local/exclude_mnps/main.nf b/subworkflows/local/exclude_mnps/main.nf
@@ -3,13 +3,12 @@ include { BCFTOOLS_NORM } from '../../../modules/nf-core/bcftools/norm/main'
 /**
 Separates MNPs into several SNP that will be analyzed separately
 
-The input and output formats are the same:
-    Input: ([meta]  [some.file.vcf.gz, some.file.vcf.gz.tbi])
-
+    Input: [meta, input.vcf.gz]
+    Output: [meta, [output.vcf.gz, output.vcf.gz.tbi]]
 */
 workflow EXCLUDE_MNPS {
     take:
-        input // channel: (val(metas),  [.gvcf.gz])
+        input // channel: [meta,  .gvcf.gz]
     main:
     versions = Channel.empty()
     def reference_path = file("${params.referenceGenome}/${params.referenceGenomeFasta}")
@@ -24,6 +23,6 @@ workflow EXCLUDE_MNPS {
     versions = versions.mix(BCFTOOLS_FILTER.out.versions)
     versions = versions.mix(BCFTOOLS_NORM.out.versions)
     emit:
-        ch_output_excludemnps // channel: (val(meta),  [.vcf.gz, .vcf.gz.tbi])
+        ch_output_excludemnps // channel: [meta,  [.vcf.gz, .vcf.gz.tbi]]
         versions
 }
diff --git a/workflows/postprocessing.nf b/workflows/postprocessing.nf
@@ -13,7 +13,8 @@ include { VQSR                   } from "../subworkflows/local/vqsr"
 include { EXOMISER               } from '../modules/local/exomiser'
 include { splitMultiAllelics     } from '../modules/local/split_multi_allelics'
 include { ENSEMBLVEP_VEP         } from '../modules/nf-core/ensemblvep/vep/main'  
-include { tabix                  } from '../modules/local/tabix'
+include { tabix  as vep_tabix    } from '../modules/local/tabix'
+include { tabix as initial_tabix } from '../modules/local/tabix' 
 include { COMBINEGVCFS           } from '../modules/local/combine_gvcfs'
 include { GATK4_GENOTYPEGVCFS    } from '../modules/nf-core/gatk4/genotypegvcfs'
 include { GATK4_VARIANTFILTRATION} from '../modules/nf-core/gatk4/variantfiltration'
@@ -132,6 +133,17 @@ process writemeta{
     """
 }
 
+// Builds the same output format as EXCLUDE_MNPS ([meta, [vcf, tbi]]) for use when the exclude MNPs step is skipped.
+//   input_channel: channel emitting [meta, vcf] items
+//   returns: channel emitting [meta, [vcf, tbi]] items
+// If "<vcf>.tbi" already exists it is reused; otherwise the index is generated with the initial_tabix process
+// (per docs/usage.md, this requires the input file to be compressed with bgzip).
+def replicate_excludemnps_output_format(input_channel) {
+    // Inputs that already have an index file next to them: pair the vcf with its existing .tbi
+    def with_tbi = input_channel.filter{meta, vcf -> file(vcf + ".tbi").exists()}
+        .map{meta, vcf -> [meta, [file(vcf), file(vcf + ".tbi")]]}
+
+    // Inputs without an index file: generate the .tbi, then pair it back with its vcf.
+    // initial_tabix emits [meta, tbi]; the join is expected to match items on their first element (meta).
+    // NOTE(review): unlike the branch above, vcf is passed through as received here (not wrapped with file()).
+    def tbi_input = input_channel.filter{meta, vcf -> !file(vcf + ".tbi").exists()}
+    def tbi_output = initial_tabix(tbi_input)
+    def with_generated_tbi = tbi_input.join(tbi_output).map{meta, vcf, tbi -> [meta, [vcf, tbi]]}
+
+    // Already-indexed inputs first, followed by those whose index was generated
+    return with_tbi.concat(with_generated_tbi)
+}
 
 workflow POSTPROCESSING {
     //Local Temp Params
@@ -156,21 +168,24 @@ workflow POSTPROCESSING {
     .collectFile(storeDir: "${params.outdir}/pipeline_info/configs",cache: false)
 
     writemeta()
-    def ch_output_from_excludemnps = EXCLUDE_MNPS(ch_samplesheet).ch_output_excludemnps
-    //Create groupkey for the grouptuple and separate the vcf (file[0]) and the index (files[1])
+
+    def ch_output_from_excludemnps = params.exclude_mnps ? 
+        EXCLUDE_MNPS(ch_samplesheet).ch_output_excludemnps : 
+        replicate_excludemnps_output_format(ch_samplesheet)
+
+    def grouped_by_family = ch_output_from_excludemnps
+         //Create a groupKey for the groupTuple and separate the vcf (files[0]) from the index (files[1])
         .map{meta, files -> tuple(groupKey(meta.familyId, meta.sampleSize),meta,files[0],files[1])}
         .groupTuple()
         .map{ familyId, meta, vcf, tbi -> 
         //now that samples are grouped together, we no longer follow sample in meta, and the id no longer needs the sampleId
             def updated_meta = meta[0].findAll{!["sample", "id"].contains(it.key) }
             updated_meta["id"] = updated_meta.familyId
             [updated_meta, vcf.flatten(), tbi.flatten()]}
-
-
 
     //Combine per-sample gVCF files into a multi-sample gVCF file
-    def filtered_one = ch_output_from_excludemnps.filter{it[0].sampleSize == 1}
-    def ch_input_for_combinegvcf = ch_output_from_excludemnps.filter{it[0].sampleSize > 1}    
+    def filtered_one = grouped_by_family.filter{it[0].sampleSize == 1}
+    def ch_input_for_combinegvcf = grouped_by_family.filter{it[0].sampleSize > 1}    
     def ch_output_from_combinegvcf = COMBINEGVCFS(ch_input_for_combinegvcf , pathReferenceGenomeFasta,pathReferenceGenomeFai,pathReferenceDict,pathIntervalFile).combined_gvcf
     .join(COMBINEGVCFS.out.tbi)
     .concat(filtered_one)
@@ -204,7 +219,7 @@ workflow POSTPROCESSING {
             vep_cache,
             params.vep_cache_version
         )
-        tabix(ch_output_from_vep.vcf)
+        vep_tabix(ch_output_from_vep.vcf)
     }
 
     if (isExomiserToolIncluded()) {