Merge pull request #54 from Ferlab-Ste-Justine/feat/CLIN-3713-allow-exomiser-to-start-from-vep

feat: CLIN-3713 allow exomiser to start from vep
Ferlab-Ste-Justine · Dec 20, 2024 · 7bbc8d2 · 7bbc8d2
2 parents f4f6489 + 93e12db
commit 7bbc8d2
Show file tree

Hide file tree

Showing 10 changed files with 68 additions and 42 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#49](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/49) Add support for local frequency source
 - [#49](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/49) Pass java -Xmx option at the command line for exomiser
 - [#53](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/53) Replace vep and tabix logic by a standard nf-core subworkflow
+- [#54](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/54) Allow exomiser to start from the vep output
+
+### `Changed`
+- [#54](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/54) Standardize exomiser output filenames
 
 ### `Fixed`
 - [#50](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/50) Use container tag 1.20 for splitMultiAllelics process

diff --git a/conf/modules.config b/conf/modules.config
@@ -68,6 +68,7 @@ process {
             enabled: true,
             pattern: 'results/*{vcf.gz,vcf.gz.tbi,tsv,json,html}'
         ])
+        ext.args = { "--output-filename=${meta.id}.exomiser" }
     }
 
     withName: ENSEMBLVEP_VEP {

diff --git a/docs/output.md b/docs/output.md
@@ -109,22 +109,22 @@ The `exomiser/results` subdirectory contains the output of the pipeline after th
 
 ```
 |_ exomiser/results
-   |_ family1.splitted-exomiser.genes.tsv
-   |_ family1.splitted-exomiser.html
-   |_ family1.splitted-exomiser.json
-   |_ family1.splitted-exomiser.variants.tsv
-   |_ family1.splitted-exomiser.vcf.gz
-   |_ family1.splitted-exomiser.vcf.gz.tbi
+   |_ family1.exomiser.genes.tsv
+   |_ family1.exomiser.html
+   |_ family1.exomiser.json
+   |_ family1.exomiser.variants.tsv
+   |_ family1.exomiser.vcf.gz
+   |_ family1.exomiser.vcf.gz.tbi
   ...   
 ```
 
 It should contain a set of 6 files per family. Specifically, we use the following naming scheme:
-- `<FAMILY_ID>.splitted-exomiser.genes.tsv`
-- `<FAMILY_ID>.splitted-exomiser.html`
-- `<FAMILY_ID>.splitted-exomiser.json`
-- `<FAMILY_ID>.splitted-exomiser.variants.tsv`
-- `<FAMILY_ID>.splitted-exomiser.vcf.gz`
-- `<FAMILY_ID>.splitted-exomiser.vcf.gz.tbi`
+- `<FAMILY_ID>.exomiser.genes.tsv`
+- `<FAMILY_ID>.exomiser.html`
+- `<FAMILY_ID>.exomiser.json`
+- `<FAMILY_ID>.exomiser.variants.tsv`
+- `<FAMILY_ID>.exomiser.vcf.gz`
+- `<FAMILY_ID>.exomiser.vcf.gz.tbi`
 
 The family ID should match the family ID in the input sample sheet.
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -98,6 +98,14 @@ Exomiser, on the other hand, is a tool specifically designed for the analysis of
 integrates phenotype data with variant information to prioritize variants that are likely to be disease-causing. 
 This can greatly assist in the identification of potential disease-causing variants in exome sequencing data.
 
+### Exomiser input data
+
+By default, the VEP and Exomiser steps, when enabled, run in parallel and both consume the output of the normalization step.
+
+To have the Exomiser step start from the VEP output instead, set the parameter `exomiser_start_from_vep` to `true`. In this case, the VEP and Exomiser steps will run sequentially.
+
+Note that the parameter `exomiser_start_from_vep` is ignored if VEP is not specified via the `tools` parameter.
+
 ### Customize versions and commands
 
 If needed, it is possible to customize the options passed to the vep command by overriding the ext.args directive for the
@@ -188,3 +196,4 @@ Parameters summary
 | `exomiser_remm_filename` | _Optional_	| Filename of the exomiser REMM data file (e.g., `ReMM.v0.3.1.post1.hg38.tsv.gz`) |
 | `exomiser_analysis_wes` | _Optional_ | Path to the exomiser analysis file for WES data, if different from the default |
 | `exomiser_analysis_wgs` | _Optional_ | Path to the exomiser analysis file for WGS data, if different from the default |
+| `exomiser_start_from_vep` | _Optional_ | If `true` (default `false`), run the exomiser analysis on the VEP-annotated VCF file. Ignored if VEP is not activated via the `tools` parameter. |
diff --git a/modules/local/exomiser/main.nf b/modules/local/exomiser/main.nf
@@ -3,7 +3,7 @@ process EXOMISER {
     label 'process_medium'
 
     input:
-    tuple val(meta), path(vcfFile), path(phenoFile), path(analysisFile)
+    tuple val(meta), path(vcfFile), path(indexFile), path(phenoFile), path(analysisFile)
     path datadir
     val exomiserGenome
     val exomiserDataVersion
@@ -33,7 +33,6 @@ process EXOMISER {
 
     script:
     def args = task.ext.args ?: ''
-    def exactVcfFile = vcfFile.find { it.name.endsWith("vcf.gz") }
 
     def localFrequencyFileArgs = "" 
     if (localFrequencyPath) {
@@ -63,11 +62,13 @@ process EXOMISER {
         avail_mem = (task.memory.mega*0.8).intValue()
     }
 
+    // Note: specifying the extra options (args) at the beginning because output options are ignored when they are passed at the end.
     """
     #!/bin/bash -eo pipefail
 
     java -Xmx${avail_mem}M -cp \$( cat /app/jib-classpath-file ) \$( cat /app/jib-main-class-file ) \\
-        --vcf ${exactVcfFile} \\
+        ${args} \\
+        --vcf ${vcfFile} \\
         --assembly "${params.exomiser_genome}" \\
         --analysis "${analysisFile}" \\
         --sample ${phenoFile} \\
@@ -77,8 +78,7 @@ process EXOMISER {
         ${remmArgs} \\
         ${caddArgs} \\
         --exomiser.${exomiserGenome}.data-version="${exomiserDataVersion}" \\
-        --exomiser.phenotype.data-version="${exomiserDataVersion}" \\
-        ${args}
+        --exomiser.phenotype.data-version="${exomiserDataVersion}"
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
@@ -91,12 +91,12 @@ process EXOMISER {
     """
     #!/bin/bash -eo pipefail
     mkdir results
-    touch results/${familyId}.splitted-exomiser.genes.tsv
-    touch results/${familyId}.splitted-exomiser.html
-    touch results/${familyId}.splitted-exomiser.json
-    touch results/${familyId}.splitted-exomiser.variants.tsv
-    touch results/${familyId}.splitted-exomiser.vcf.gz
-    touch results/${familyId}.splitted-exomiser.vcf.gz.tbi
+    touch results/${familyId}.exomiser.genes.tsv
+    touch results/${familyId}.exomiser.html
+    touch results/${familyId}.exomiser.json
+    touch results/${familyId}.exomiser.variants.tsv
+    touch results/${familyId}.exomiser.vcf.gz
+    touch results/${familyId}.exomiser.vcf.gz.tbi
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/local/exomiser/tests/main.nf.test b/modules/local/exomiser/tests/main.nf.test
@@ -17,9 +17,10 @@ nextflow_process {
             process {
                 """
                 input[0] = [ [familyId: "family1"],
-                 file("https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/gvcf/test.genome.vcf.gz"), 
-                 file("assets/exomiser/pheno/family1.yml"),
-                 file("assets/exomiser/default_exomiser_WGS_analysis.yml")]
+                file("https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/gvcf/test.genome.vcf.gz"), 
+                file("https://github.com/nf-core/test-datasets/raw/modules/data/genomics/homo_sapiens/illumina/gvcf/test.genome.vcf.gz.tbi"), 
+                file("assets/exomiser/pheno/family1.yml"),
+                file("assets/exomiser/default_exomiser_WGS_analysis.yml")]
                 input[1] = file("data-test/reference/exomiser")
                 input[2] = "hg38"
                 input[3] = "2402"
@@ -37,32 +38,32 @@ nextflow_process {
                 // vcf channel
                 assert vcf.size() == 1
                 assert vcf.get(0)[0] == expected_meta
-                assert file(vcf.get(0)[1]).name == "family1.splitted-exomiser.vcf.gz"
+                assert file(vcf.get(0)[1]).name == "family1.exomiser.vcf.gz"
 
                 // tbi channel
                 assert tbi.size() == 1
                 assert tbi.get(0)[0] == expected_meta
-                assert file(tbi.get(0)[1]).name == "family1.splitted-exomiser.vcf.gz.tbi"
+                assert file(tbi.get(0)[1]).name == "family1.exomiser.vcf.gz.tbi"
 
                 // html channel
                 assert html.size() == 1
                 assert html.get(0)[0] == expected_meta
-                assert file(html.get(0)[1]).name == "family1.splitted-exomiser.html"
+                assert file(html.get(0)[1]).name == "family1.exomiser.html"
 
                 // json channel
                 assert json.size() == 1
                 assert json.get(0)[0] == expected_meta
-                assert file(json.get(0)[1]).name == "family1.splitted-exomiser.json"
+                assert file(json.get(0)[1]).name == "family1.exomiser.json"
 
                 // genetsv channel
                 assert genetsv.size() == 1
                 assert genetsv.get(0)[0] == expected_meta
-                assert file(genetsv.get(0)[1]).name == "family1.splitted-exomiser.genes.tsv"
+                assert file(genetsv.get(0)[1]).name == "family1.exomiser.genes.tsv"
 
                 // variantstsv channel
                 assert variantstsv.size() == 1
                 assert variantstsv.get(0)[0] == expected_meta
-                assert file(variantstsv.get(0)[1]).name == "family1.splitted-exomiser.variants.tsv"
+                assert file(variantstsv.get(0)[1]).name == "family1.exomiser.variants.tsv"
 
                 // versions channel
                 assert snapshot(versions).match()

diff --git a/modules/local/split_multi_allelics.nf b/modules/local/split_multi_allelics.nf
@@ -7,7 +7,7 @@ process splitMultiAllelics{
     path referenceGenome
 
     output:
-    tuple val(meta), path("*splitted.vcf*")
+    tuple val(meta), path("*splitted.vcf.gz"), path("*splitted.vcf.gz.tbi")
 
     script:
     def familyId = meta.familyId

diff --git a/nextflow.config b/nextflow.config
@@ -38,6 +38,7 @@ params {
     exomiser_analysis_wgs = "${projectDir}/assets/exomiser/default_exomiser_WGS_analysis.yml"
     exomiser_local_frequency_path = null
     exomiser_local_frequency_index_path = null
+    exomiser_start_from_vep = false
 
 	//Process-specific parameters
     exclude_mnps = true

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -401,6 +401,11 @@
           "description": "Path to the index of the local frequency data file",
           "format": "file-path",
           "pattern": "^\\S+\\.tbi$"
+        },
+        "exomiser_start_from_vep": {
+          "type": "boolean",
+          "description": "If true, run the exomiser analysis on the VEP annotated VCF file",
+          "default": false
         }
       },
       "allOf": [

diff --git a/workflows/postprocessing.nf b/workflows/postprocessing.nf
@@ -67,9 +67,10 @@ def exomiser(inputChannel,
     cadd_snv_filename,
     cadd_indel_filename
     ) {
-    def ch_input_for_exomiser = inputChannel.map{meta, files -> [
+    def ch_input_for_exomiser = inputChannel.map{meta, vcf, tbi -> [
         meta,
-        files,
+        vcf,
+        tbi,
         meta.familypheno, 
         meta.sequencingType == "WES"? file(analysis_wes_path) : file(analysis_wgs_path)
     ]}
@@ -94,11 +95,7 @@ def exomiser(inputChannel,
 
 def vep(input_channel, vep_genome, vep_species, path_fasta, vep_cache, vep_cache_version) {
 
-    def ch_input_for_vep  = input_channel.map{meta, files ->
-        def vcf_file = files.find { it.name.endsWith("vcf.gz") }
-        def custom_extra_files = [] 
-        [meta, vcf_file, custom_extra_files]
-    }
+    def ch_input_for_vep  = input_channel.map{meta, vcf, tbi -> [meta, vcf, []]}
 
     return VCF_ANNOTATE_ENSEMBLVEP(
         ch_input_for_vep,  //  meta, vcf, optional_custom_files
@@ -226,11 +223,13 @@ workflow POSTPROCESSING {
     //normalize variants
     def ch_output_from_splitMultiAllelics = splitMultiAllelics(ch_output_from_tagArtifacts, referenceGenome)
 
+
+    def ch_output_from_vep //declaring vep output channel early so that it can be accessed outside the if block
     //Annotating variants with VEP
     if (isVepToolIncluded()) {
         def vep_cache = file(params.vep_cache)
 
-        def ch_output_from_vep = vep(
+        ch_output_from_vep = vep(
             ch_output_from_splitMultiAllelics, 
             params.vep_genome,
             HOMO_SAPIENS_SPECIES,
@@ -241,7 +240,13 @@ workflow POSTPROCESSING {
     }
 
     if (isExomiserToolIncluded()) {
-        exomiser(ch_output_from_splitMultiAllelics, 
+        def ch_exomiser_input = ch_output_from_splitMultiAllelics
+        if (isVepToolIncluded() && params.exomiser_start_from_vep){
+            log.info("Running the exomiser analysis using the vep annotated vcf file as input")
+            ch_exomiser_input = ch_output_from_vep
+        }
+        exomiser(
+            ch_exomiser_input,
             params.exomiser_genome,
             params.exomiser_data_version,
             params.exomiser_data_dir,