Merge pull request #41 from Ferlab-Ste-Justine/feat/CLIN-3407-make-ve…

…p-command-configurable feat: CLIN-3406 make vep command configurable
Ferlab-Ste-Justine · Dec 3, 2024 · c43ca3d · c43ca3d
2 parents 16eeed0 + a3ecf05
commit c43ca3d
Show file tree

Hide file tree

Showing 24 changed files with 530 additions and 134 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -23,7 +23,7 @@ jobs:
       matrix:
         NXF_VER:
           - "23.10.1"
-          - "latest-everything"
+          - "latest-stable"
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## v2.2.0-dev - [date]
 
+### `Added`
+- [#41](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/41) Allow customizing the vep command
+- [#41](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/41) Improve parameter schema for params max_disk, max_memory, max_time
+- [#41](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/41) Consider only stable nextflow versions for ci test
+
+### `Known issues`
+- The nf-core modules that we are using have a potential performance flaw. Typically, the glob pattern used to declare the output files also matches the input files (e.g. "*.vcf"), which can cause unnecessary file transfers. This has already proven to cause issues on fusion. One fix could be to copy the affected modules to local modules and make the small change necessary to fix this.
+- The VEP cache version used in the CQDG environment (112) does not match the default configured VEP version (111). This issue can be avoided by overriding the Docker container of the ensemblevep process. If no project is using VEP version 111, it should not be used as the default value.
+
+
+### `Fixed`
+- [#41](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/41) Fix vep url pointing to the wrong vep version in the reference data documentation.
 
 ## v2.1.0dev
 
@@ -18,6 +30,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#35](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/35) Fix incorrect assumption about assets folder location in github ci workflow
 - [#36](https://github.com/Ferlab-Ste-Justine/Post-processing-Pipeline/pull/36) Fix variable input in process BCFTOOLS_NORM causing resume problems
 
+### `Known issues`
+- The nf-core modules genotypeGVCFs and VARIANTFILTRATION have a potential performance flaw. Their output globs for the vcf and tbi files are *.vcf and *.vcf.tbi respectively. These globs also match the input files, which can cause unnecessary file transfers. This has already proven to cause issues on fusion. One fix could be to copy these modules to local modules and make the small change necessary to fix this (change the globs to *${prefix}.vcf).
+
+
 ## v2.0.0dev
 
 ### `Added`

diff --git a/conf/modules.config b/conf/modules.config
@@ -57,11 +57,32 @@ process {
         ]
     }
 
+    withName: ENSEMBLVEP_VEP { // settings applied only to the ENSEMBLVEP_VEP process
+        container = 'ensemblorg/ensembl-vep:release_111.0' // Pinned to VEP release 111 for now; TODO: upgrade. The vep cache data release must match this version (see docs/reference_data.md).
+        def args_list =  [ // one vep command-line option per entry; joined into a single string below
+            "--offline",
+            "--format vcf",
+            "--vcf",
+            "--xref_refseq",
+            "--variant_class",
+            "--numbers",
+            "--hgvs",
+            "--hgvsg",
+            "--canonical",
+            "--symbol",
+            "--flag_pick",
+            "--no_stats",
+            "--fields \"Allele,Consequence,IMPACT,SYMBOL,Feature_type,Gene,PICK,Feature,EXON,BIOTYPE,INTRON,HGVSc,HGVSp,STRAND,CDS_position,cDNA_position,Protein_position,Amino_acids,Codons,VARIANT_CLASS,HGVSg,CANONICAL,RefSeq\""
+        ]
+        ext.args = args_list.join(" ") // space-separated option string; override ext.args to customize the vep command (see docs/usage.md)
+        ext.prefix =  {"variants.${meta.id}.vep"} // closure building the prefix from meta.id; presumably used by the module to name its output files — confirm in the module
+    }
+
     // Currently, tabix is used only for vep output in the post-processing pipeline.
     // Consider creating a vep subworkflow to group vep and tabix steps, making the process name more specific.
     withName: 'FERLAB_POSTPROCESSING:POSTPROCESSING:tabix' {
          publishDir = [
-            path: { "${params.outdir}/vep" },
+            path: { "${params.outdir}/ensemblvep" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]

diff --git a/conf/test.config b/conf/test.config
@@ -35,7 +35,11 @@ params {
     referenceGenomeFasta = "chr22.fa"
     intervalsFile = "testInterval22.list"
     broad = "data-test/reference/broad"
-    vepCache = "data-test/reference/annotation/.vep"
+
+    //Vep
+    vep_cache = "data-test/reference/annotation/.vep"
+    vep_cache_version = "111"
+    vep_genome = "GRCh38"
 
     // Filters for hard filtering
     hardFilters = [[name: 'QD2', expression: 'QD < 2.0'],

diff --git a/docs/output.md b/docs/output.md
@@ -20,7 +20,7 @@ The directories listed below will be created in the output directory after the p
   - Copies of the configuration files used: `config/*.config`. This includes the default `nextflow.config` file as well as any additional configuration files passed as parameters.
   - Other metadata relevant for reproducibility: `metadata.txt` . It contains information such as the original command line, the name of the branch and revision used, the username of the person who submitted the job, a list of configuration files passed, the nextflow work directory, etc.
 - `splitmultiallelics/`: pipeline output before running the tools specified via the `tools` parameter.
-- `vep/`: vep output
+- `ensemblvep/`: vep output
 - `exomiser/results`: exomiser output
 
 You might see other folders named after different pipeline processes. These are considered intermediate pipeline outputs.

diff --git a/docs/reference_data.md b/docs/reference_data.md
@@ -12,6 +12,7 @@ This directory should contain the following files:
 - The reference genome FASTA file index (e.g., `Homo_sapiens_assembly38.fasta.fai`). Its location will be automatically derived by appending `.fai` to the `referenceGenomeFasta` parameter.
 - The reference genome dictionary file (e.g., `Homo_sapiens_assembly38.dict`). Its location will be automatically derived by replacing the `.fasta` file extension of the `referenceGenomeFasta` parameter with `.dict`.
 
+
 ## Broad reference data (VQSR)
 The `broad` parameter specifies the directory containing the reference data files for VQSR. We chose the name `broad` because
 this data is from the [Broad Institute](https://www.broadinstitute.org/), a collaborative research institution known for its contributions to genomics and biomedical research.
@@ -34,7 +35,8 @@ The `vepCache` parameter specifies the directory for the vep cache. It is only r
 `tools` parameter.
 
 The vep cache is not automatically populated by the pipeline. It must be pre-downloaded. You can obtain a copy of the 
-data by following the [vep installation procedure](https://github.com/Ensembl/ensembl-vep). Generally, we only need the human files obtainable from [Ensembl](https://ftp.ensembl.org/pub/release-112/variation/vep/homo_sapiens_vep_112_GRCh38.tar.gz).
+data by following the [vep installation procedure](https://github.com/Ensembl/ensembl-vep). Generally, we only need the human files obtainable from [Ensembl](https://ftp.ensembl.org/pub/release-111/variation/vep/homo_sapiens_vep_111_GRCh38.tar.gz).
+Make sure to use the data release that matches the vep version in use (i.e. the docker container configured for the vep process).
 
 ## Exomiser reference data
 The exomiser reference data is only required if `exomiser` is specified via the `tools` parameter.

diff --git a/docs/usage.md b/docs/usage.md
@@ -91,6 +91,11 @@ Exomiser, on the other hand, is a tool specifically designed for the analysis of
 integrates phenotype data with variant information to prioritize variants that are likely to be disease-causing. 
 This can greatly assist in the identification of potential disease-causing variants in exome sequencing data.
 
+### Customize versions and commands
+
+If needed, it is possible to customize the options passed to the vep command by overriding the ext.args directive for the
+ENSEMBLVEP_VEP process. See [conf/modules.config](../conf/modules.config).
+
 
 ### Stub mode and quick tests
 

diff --git a/modules.json b/modules.json
@@ -15,6 +15,11 @@
             "git_sha": "33ef773a7ea36e88323902f63662aa53c9b88988",
             "installed_by": ["modules"]
           },
+          "ensemblvep/vep": {
+            "branch": "master",
+            "git_sha": "6e3585d9ad20b41adc7d271009f8cb5e191ecab4",
+            "installed_by": ["modules"]
+          },
           "gatk4/genotypegvcfs": {
             "branch": "master",
             "git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48",

diff --git a/modules/local/split_multi_allelics.nf b/modules/local/split_multi_allelics.nf
@@ -0,0 +1,31 @@
+// Local module (does not follow nf-core standards): removes FORMAT/PRI, normalizes and splits multiallelic records with bcftools, keeps records with allele count >= 1, then tabix-indexes the result. We plan to fix or replace it with an nf-core module in the future.
+process splitMultiAllelics{
+    label 'medium'
+
+    container 'staphb/bcftools' // NOTE(review): no image tag is given — consider pinning a version for reproducibility
+
+    input:
+    tuple val(meta), path(vcfFile) // vcfFile is searched as a collection; the script picks the entry whose name ends with "vcf.gz"
+    path referenceGenome // directory that contains the fasta named by params.referenceGenomeFasta
+
+    output:
+    tuple val(meta), path("*splitted.vcf*") // glob covers both <familyId>.splitted.vcf.gz and its .tbi index
+
+    script:
+    def familyId = meta.familyId // used as the prefix of every file written by the script
+    def exactVcfFile = vcfFile.find { it.name.endsWith("vcf.gz") } // first staged input whose name ends with "vcf.gz"
+    """
+    set -e
+    echo $familyId > file
+    bcftools annotate -x FORMAT/PRI ${exactVcfFile} | bcftools norm -c w -m -any -f $referenceGenome/${params.referenceGenomeFasta} --old-rec-tag OLD_RECORD --output-type z --output ${familyId}.normed.vcf.gz  
+    bcftools view --min-ac 1 --output-type z --output ${familyId}.splitted.vcf.gz ${familyId}.normed.vcf.gz
+    bcftools index -t ${familyId}.splitted.vcf.gz
+    """
+
+    stub:
+    def familyId = meta.familyId // same prefix as in the real script so stub outputs match the output glob
+    """
+    touch ${familyId}.splitted.vcf.gz
+    touch ${familyId}.splitted.vcf.gz.tbi
+    """
+}
diff --git a/modules/local/tabix.nf b/modules/local/tabix.nf
@@ -0,0 +1,23 @@
+// Local module (does not follow nf-core standards): runs tabix on the input file and emits any resulting *.tbi file. We plan to fix or replace it with an nf-core module in the future.
+process tabix {
+    label 'tiny'
+
+    input:
+    tuple val(meta), path(vcfFile) // meta is accepted but not referenced by this process
+
+    output:
+    path "*.tbi" // only the index file is emitted, without meta
+
+    script:
+    def args = task.ext.args ?: '' // optional extra tabix arguments from config, empty when unset. NOTE(review): they are placed after the file name — confirm tabix accepts options in that position
+
+    """
+    tabix \\
+        $vcfFile \\
+        $args
+    """
+    stub:
+    """
+    touch ${vcfFile}.tbi
+    """
+} 
diff --git a/modules/local/vep.nf b/modules/local/vep.nf
diff --git a/modules/local/vqsr.nf b/modules/local/vqsr.nf
@@ -1,3 +1,5 @@
+// This module does not follow nf-core standards. We plan to fix or replace it with nf-core modules in the future.
+
 
 /**
 Build a recalibration model to score SNP variant quality for filtering purposes

diff --git a/modules/nf-core/ensemblvep/vep/environment.yml b/modules/nf-core/ensemblvep/vep/environment.yml
diff --git a/modules/nf-core/ensemblvep/vep/main.nf b/modules/nf-core/ensemblvep/vep/main.nf