Merge pull request #157 from LouisLeNezet/chunk_model
Add chunk_model as parameter
LouisLeNezet authored Nov 10, 2024
2 parents fe2213c + e34c5a2 commit 06d6b60
Showing 19 changed files with 544 additions and 83 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -64,6 +64,7 @@ Initial release of nf-core/phaseimpute, created with the [nf-core](https://nf-co
- [#148](https://github.com/nf-core/phaseimpute/pull/148) - Fix awsfulltest github action for manual dispatch
- [#149](https://github.com/nf-core/phaseimpute/pull/149) - Remove the map file from the awsfulltest
- [#152](https://github.com/nf-core/phaseimpute/pull/152) - Fix URLs in the documentation, remove the tools citation in the README, and use a white background for all images in the documentation.
- [#157](https://github.com/nf-core/phaseimpute/pull/157) - Add `chunk_model` as a parameter for better control over `GLIMPSE2_CHUNK`, and set the window size in `GLIMPSE1_CHUNK` and `GLIMPSE2_CHUNK` to 4 Mb to reduce the number of chunks (empirically determined).

### `Fixed`

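The new parameter defaults to `sequential` (see the `nextflow.config` change below) and is validated against the pipeline schema. A minimal sketch of overriding it at launch, assuming a custom config file passed with `-c` (file name hypothetical):

    // custom.config -- pass with `nextflow run ... -c custom.config`
    params {
        chunk_model = 'recursive'   // allowed values per the schema: 'recursive' or 'sequential'
    }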
15 changes: 9 additions & 6 deletions conf/steps/panel_prep.config
@@ -41,8 +41,8 @@ process {
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_NORM' {
ext.args = ["-m +any", "--no-version", "--output-type z", "--write-index=tbi"].join(' ')
ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" }
ext.args = ["-m +any", "--no-version", "--output-type z", "--write-index=tbi"].join(' ')
ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" }
publishDir = [ enabled: false ]
}

@@ -62,7 +62,7 @@ process {
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:VCFLIB_VCFFIXUP' {
ext.prefix = { "${meta.id}_${meta.chr}_fixup" }
ext.prefix = { "${meta.id}_${meta.chr}_fixup" }
publishDir = [
path: { "${params.outdir}/prep_panel/panel" },
mode: params.publish_dir_mode,
@@ -72,7 +72,7 @@ process {
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX' {
ext.args = "--tbi"
ext.args = "--tbi"
publishDir = [
path: { "${params.outdir}/prep_panel/panel" },
mode: params.publish_dir_mode,
@@ -88,6 +88,7 @@ process {
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_SHAPEIT5:GLIMPSE2_CHUNK' {
ext.args = "--window-mb 4"
ext.prefix = { "${meta.id}_chunks" }
}

@@ -123,7 +124,7 @@ process {
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:BCFTOOLS_CONVERT' {
ext.args = {"--haplegendsample ${meta.id}_${meta.chr}"}
ext.args = {"--haplegendsample ${meta.id}_${meta.chr}"}
publishDir = [
path: { "${params.outdir}/prep_panel/haplegend/" },
mode: params.publish_dir_mode,
@@ -184,6 +185,7 @@ process {
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHUNK_GLIMPSE:GLIMPSE_CHUNK' {
ext.args = "--window-size 4"
ext.prefix = { "${meta.id}_${meta.chr}_chunks_glimpse1" }
publishDir = [
path: { "${params.outdir}/prep_panel/chunks/glimpse1/" },
@@ -194,7 +196,8 @@ process {
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHUNK_GLIMPSE:GLIMPSE2_CHUNK' {
ext.prefix = { "${meta.id}_${meta.chr}_chunks_glimpse2" }
ext.args = "--window-mb 4"
ext.prefix = { "${meta.id}_${meta.chr}_chunks_glimpse2" }
publishDir = [
path: { "${params.outdir}/prep_panel/chunks/glimpse2/" },
mode: params.publish_dir_mode,
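Because the chunk window sizes above are wired through `ext.args`, a user config can override them without modifying the pipeline. A hedged sketch (process selector copied from the diff above; the 2 Mb value is purely illustrative):

    process {
        withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHUNK_GLIMPSE:GLIMPSE2_CHUNK' {
            ext.args = "--window-mb 2"   // PR default is 4; larger windows mean fewer chunks
        }
    }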
1 change: 1 addition & 0 deletions conf/test_all.config
@@ -33,6 +33,7 @@ params {
phase = true
normalize = true
compute_freq = false
chunk_model = "recursive"

// Pipeline steps
steps = "all"
1 change: 1 addition & 0 deletions conf/test_dog.config
@@ -32,6 +32,7 @@ params {
normalize = false
compute_freq = false
rename_chr = true
chunk_model = "recursive"

// Input data
input = params.pipelines_testdata_base_path + "dog_data/csv/sample_dog.csv"
1 change: 1 addition & 0 deletions conf/test_panelprep.config
@@ -32,6 +32,7 @@ params {
normalize = true
compute_freq = true
remove_samples = "HG00096,HG00097,HG00099,HG00100"
chunk_model = "recursive"

// Pipeline steps
steps = "panelprep"
3 changes: 3 additions & 0 deletions main.nf
@@ -43,6 +43,7 @@ workflow NFCORE_PHASEIMPUTE {
ch_map // channel: map file for imputation
ch_posfile // channel: samplesheet read in from --posfile
ch_chunks // channel: samplesheet read in from --chunks
chunk_model // parameter: chunk model
ch_versions // channel: versions of software used

main:
@@ -101,6 +102,7 @@ workflow NFCORE_PHASEIMPUTE {
ch_map,
ch_posfile,
ch_chunks,
chunk_model,
ch_versions
)
emit:
@@ -141,6 +143,7 @@ workflow {
PIPELINE_INITIALISATION.out.gmap,
PIPELINE_INITIALISATION.out.posfile,
PIPELINE_INITIALISATION.out.chunks,
PIPELINE_INITIALISATION.out.chunk_model,
PIPELINE_INITIALISATION.out.versions
)
//
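The change above threads `chunk_model` through the workflow boundary as a plain value rather than a channel, which Nextflow DSL2 `take:` blocks support. A minimal standalone sketch of that mechanism (all names hypothetical):

    workflow DEMO {
        take:
        ch_files   // channel: input files
        model      // value: a plain string, the same for every item

        main:
        ch_files.view { f -> "chunking ${f} with the ${model} model" }

        emit:
        files = ch_files
    }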
3 changes: 2 additions & 1 deletion nextflow.config
@@ -10,7 +10,7 @@
params {

// steps
steps = null
steps = null

// Input options
input = null
@@ -24,6 +24,7 @@ params {
normalize = true
compute_freq = false
remove_samples = null
chunk_model = 'sequential'

// ChrCheck parameters
rename_chr = false
7 changes: 7 additions & 0 deletions nextflow_schema.json
@@ -132,6 +132,13 @@
"binaryref": {
"type": "string",
"description": "Whether to generate a binary reference file to be used with GLIMPSE2"
},
"chunk_model": {
"type": "string",
"description": "Model type to use for GLIMPSE2_CHUNK",
"enum": ["recursive", "sequential"],
"default": "sequential",
"hidden": true
}
}
},
7 changes: 7 additions & 0 deletions subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf
@@ -279,6 +279,9 @@ workflow PIPELINE_INITIALISATION {
// Check that all input files have the correct index
checkFileIndex(ch_input.mix(ch_input_truth, ch_ref_gen, ch_panel))

// Chunk model
chunk_model = params.chunk_model

emit:
input = ch_input // [ [meta], file, index ]
input_truth = ch_input_truth // [ [meta], file, index ]
@@ -289,6 +292,7 @@ workflow PIPELINE_INITIALISATION {
gmap = ch_map // [ [map], map ]
posfile = ch_posfile // [ [panel, chr], vcf, index, hap, legend ]
chunks = ch_chunks // [ [chr], txt ]
chunk_model = chunk_model
versions = ch_versions
}

@@ -407,6 +411,9 @@ def validateInputParameters() {
error("To use `--remove_samples` you need to include `--normalize`.")
}
}

// Check that the chunk model is provided
assert params.chunk_model : "No chunk model provided"
}

//
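The bare `assert` above only guards against a missing value; the enum in `nextflow_schema.json` already rejects unknown values at launch, but an equivalent in-code check could look like the following sketch (helper name hypothetical; `error()` is the same function used earlier in this file):

    def validateChunkModel(model) {
        def allowed = ['recursive', 'sequential']   // mirrors the schema enum
        if (!(model in allowed)) {
            error("Invalid --chunk_model '${model}'. Valid options: ${allowed.join(', ')}")
        }
    }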
@@ -17,6 +17,7 @@ nextflow_workflow {
chunks = "../../../tests/csv/chunks.csv"
posfile = "../../../tests/csv/posfile.csv"
panel = "../../../tests/csv/panel.csv"
chunk_model = "recursive"
}
workflow {
"""
@@ -49,6 +50,7 @@
posfile = "../../../tests/csv/posfile.csv"
panel = "../../../tests/csv/panel.csv"
input_region = "$moduleTestDir/region.csv"
chunk_model = "sequential"
}
workflow {
"""
@@ -80,6 +82,7 @@
chunks = "../../../tests/csv/chunks.csv"
panel = "../../../tests/csv/panel.csv"
input_region = "$moduleTestDir/region.csv"
chunk_model = "recursive"
}
workflow {
"""
4 changes: 1 addition & 3 deletions subworkflows/local/vcf_chunk_glimpse/main.nf
@@ -7,6 +7,7 @@ workflow VCF_CHUNK_GLIMPSE {
take:
ch_reference // channel: [ [panel, chr], vcf, csi ]
ch_map // channel (optional): [ [chr], map ]
chunk_model // parameter: chunk model

main:

@@ -36,9 +37,6 @@
)
.map { metaPC, it -> [metaPC, it["RegionIn"], it["RegionOut"]]}

// Make chunks with Glimpse2 (does not work with "sequential" mode)
chunk_model = "recursive"

ch_input_glimpse2 = ch_vcf_csi_chr
.map{
metaPC, vcf, csi, chr -> [metaPC.subMap("chr"), metaPC, vcf, csi, chr]
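With the hard-coded `chunk_model = "recursive"` removed, the model now arrives through the `take:` block and the caller decides it. Presumably it is forwarded to the chunking process as a plain value input, along these lines (a sketch, assuming the nf-core `glimpse2/chunk` module declares a matching `val` input):

    // inside the main: block of VCF_CHUNK_GLIMPSE
    GLIMPSE2_CHUNK ( ch_input_glimpse2, chunk_model )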
65 changes: 57 additions & 8 deletions subworkflows/local/vcf_chunk_glimpse/tests/main.nf.test
@@ -47,17 +47,20 @@ nextflow_workflow {
file(params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38_21.map", checkIfExists: true)
]
)
input[2] = "recursive"
"""
}
}

then {
assertAll(
{ assert workflow.success },
{ assert snapshot(workflow.out).match() },
{ assert snapshot(workflow.out.chunks.collect{
path(it[1]).readLines()
}).match("chunksWithMap")
{ assert snapshot(
workflow.out,
workflow.out.chunks.collect{
path(it[1]).readLines()
}
).match()
}
)
}
@@ -87,17 +90,63 @@
[[chr: "chr22"], []],
[[chr: "chr21"], []]
)
input[2] = "recursive"
"""
}
}

then {
assertAll(
{ assert workflow.success },
{ assert snapshot(workflow.out).match() },
{ assert snapshot(workflow.out.chunks.collect{
path(it[1]).readLines()
}).match("chunksWithoutMap")
{ assert snapshot(
workflow.out,
workflow.out.chunks.collect{
path(it[1]).readLines()
}
).match()
}
)
}
}

test("Chunks with sequential model") {
when {
params {
max_cpus = 2
max_memory = '2.GB'
}
workflow {
"""
input[0] = Channel.of(
[
[id: "1000GP", chr: "chr22"],
                    file(params.pipelines_testdata_base_path + "hum_data/panel/chr22/1000GP.chr22.s.norel.vcf.gz", checkIfExists: true),
                    file(params.pipelines_testdata_base_path + "hum_data/panel/chr22/1000GP.chr22.s.norel.vcf.gz.csi", checkIfExists: true),
],
[
[id: "1000GP", chr: "chr21"],
                    file(params.pipelines_testdata_base_path + "hum_data/panel/chr21/1000GP.chr21.s.norel.vcf.gz", checkIfExists: true),
                    file(params.pipelines_testdata_base_path + "hum_data/panel/chr21/1000GP.chr21.s.norel.vcf.gz.csi", checkIfExists: true),
]
)
input[1] = Channel.of(
[[chr: "chr22"], []],
[[chr: "chr21"], []]
)
input[2] = "sequential"
"""
}
}

then {
assertAll(
{ assert workflow.success },
{ assert snapshot(
workflow.out,
workflow.out.chunks.collect{
path(it[1]).readLines()
}
).match()
}
)
}
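In nf-test, the positional `input[n]` assignments map one-to-one onto the subworkflow's `take:` block, which is why `input[2]` carries the model string in the tests above. A minimal sketch of that wiring (channel definitions elided):

    workflow {
        """
        input[0] = ch_reference   // take: ch_reference, [ [panel, chr], vcf, csi ]
        input[1] = ch_map         // take: ch_map, [ [chr], map ]
        input[2] = "sequential"   // take: chunk_model, a plain string
        """
    }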