From a89cf5f891c6c86fad04cb3f8aafb7293ca4f1e8 Mon Sep 17 00:00:00 2001 From: Louis Le Nezet Date: Tue, 26 Nov 2024 14:39:59 +0100 Subject: [PATCH 01/17] Improve fulltest --- conf/steps/imputation_glimpse1.config | 2 +- conf/steps/imputation_glimpse2.config | 2 +- conf/steps/simulation.config | 1 + conf/steps/validation.config | 1 + conf/test_full.config | 7 ++++--- tests/csv/sample_sim_full_truth.csv | 2 ++ 6 files changed, 10 insertions(+), 5 deletions(-) create mode 100644 tests/csv/sample_sim_full_truth.csv diff --git a/conf/steps/imputation_glimpse1.config b/conf/steps/imputation_glimpse1.config index d28ad992..7e31ce3d 100644 --- a/conf/steps/imputation_glimpse1.config +++ b/conf/steps/imputation_glimpse1.config @@ -61,7 +61,7 @@ process { withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_IMPUTE_GLIMPSE1:GLIMPSE_PHASE' { ext.args = ["--impute-reference-only-variants"].join(' ') ext.prefix = { "${meta.id}.batch${meta.batch}.${meta.chunk.replace(':','_')}.glimpse1" } - ext.suffix = "bcf" + ext.suffix = "bcf.gz" publishDir = [ enabled: false ] } diff --git a/conf/steps/imputation_glimpse2.config b/conf/steps/imputation_glimpse2.config index 76ef02ce..bec0e5c9 100644 --- a/conf/steps/imputation_glimpse2.config +++ b/conf/steps/imputation_glimpse2.config @@ -21,7 +21,7 @@ process { cache = "lenient" ext.prefix = { "${meta.id}.batch${meta.batch}.${meta.chunk.replace(':','_')}.glimpse2" } ext.args = { "--keep-monomorphic-ref-sites" } - ext.suffix = "bcf" + ext.suffix = "bcf.gz" publishDir = [ enabled: false ] } diff --git a/conf/steps/simulation.config b/conf/steps/simulation.config index 5b4ea805..71172319 100644 --- a/conf/steps/simulation.config +++ b/conf/steps/simulation.config @@ -17,6 +17,7 @@ process { tag = {"${meta.id} ${meta.chr}"} } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_EXTRACT_REGION_SAMTOOLS:SAMTOOLS_VIEW' { + label = 'process_medium' ext.args = ["--output-fmt bam", "--write-index"].join(' ') ext.prefix = { 
"${meta.id}_R${meta.region.replace(':','_')}" } } diff --git a/conf/steps/validation.config b/conf/steps/validation.config index b5ca4fde..a95795b8 100644 --- a/conf/steps/validation.config +++ b/conf/steps/validation.config @@ -26,6 +26,7 @@ process { } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GL_TRUTH:BCFTOOLS_MPILEUP' { + label = 'process_high' ext.args = [ "-I", "-E", diff --git a/conf/test_full.config b/conf/test_full.config index f4048ae4..61cfe3df 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -27,8 +27,9 @@ params { genome = "GRCh38" // Input data - input = "${projectDir}/tests/csv/sample_sim_full.csv" - panel = "${projectDir}/tests/csv/panel_full.csv" + input = "${projectDir}/tests/csv/sample_sim_full.csv" + input_truth = "${projectDir}/tests/csv/sample_sim_full_truth.csv" + panel = "${projectDir}/tests/csv/panel_full.csv" // Pipeline steps steps = "all" @@ -40,5 +41,5 @@ params { phase = false // Impute tools - tools = "glimpse1" + tools = "glimpse2" } diff --git a/tests/csv/sample_sim_full_truth.csv b/tests/csv/sample_sim_full_truth.csv new file mode 100644 index 00000000..2b089769 --- /dev/null +++ b/tests/csv/sample_sim_full_truth.csv @@ -0,0 +1,2 @@ +sample,file,index +NA12878,ftp://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz,ftp://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.csi From a9eafb3a1a5cf3747f07a4409eaa08012a1fdbf4 Mon Sep 17 00:00:00 2001 From: Louis Le Nezet Date: Tue, 26 Nov 2024 14:45:10 +0100 Subject: [PATCH 02/17] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d147605d..afb17bd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -67,6 +67,7 @@ Special thanks to [Matthias Hörtenhuber](https://github.com/mashehu), [Mazzalab - [#160](https://github.com/nf-core/phaseimpute/pull/160) 
- Improve `CHANGELOG.md` and add details to `usage.md` - [#158](https://github.com/nf-core/phaseimpute/pull/158) - Remove frequency computation and phasing from full test to reduce cost and computational time. - [#164](https://github.com/nf-core/phaseimpute/pull/164) - Rename `BAM_REGION_SAMTOOLS` to `BAM_EXTRACT_REGION_SAMTOOLS`. Remove `GLIMPSE2_SPLITREFERENCE` as it is not used. Add more steps to `test_all` profile for more exhaustivity. +- [#163](https://github.com/nf-core/phaseimpute/pull/163) - Improve configuration for demanding processes. Use Genome in a Bottle VCF benchmarking file for AWS full test. ### `Fixed` From 79b7dfb517e9e9a1bfd5c501ed93f87a7efe9b57 Mon Sep 17 00:00:00 2001 From: Louis Le Nezet Date: Tue, 26 Nov 2024 14:53:05 +0100 Subject: [PATCH 03/17] Revert extension change --- conf/steps/imputation_glimpse2.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/steps/imputation_glimpse2.config b/conf/steps/imputation_glimpse2.config index bec0e5c9..76ef02ce 100644 --- a/conf/steps/imputation_glimpse2.config +++ b/conf/steps/imputation_glimpse2.config @@ -21,7 +21,7 @@ process { cache = "lenient" ext.prefix = { "${meta.id}.batch${meta.batch}.${meta.chunk.replace(':','_')}.glimpse2" } ext.args = { "--keep-monomorphic-ref-sites" } - ext.suffix = "bcf.gz" + ext.suffix = "bcf" publishDir = [ enabled: false ] } From 7a36d0447caed4bf127d49619f127e63ee2dc6ec Mon Sep 17 00:00:00 2001 From: Louis Le Nezet Date: Tue, 26 Nov 2024 14:54:14 +0100 Subject: [PATCH 04/17] Revert extension change --- conf/steps/imputation_glimpse1.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/steps/imputation_glimpse1.config b/conf/steps/imputation_glimpse1.config index 7e31ce3d..d28ad992 100644 --- a/conf/steps/imputation_glimpse1.config +++ b/conf/steps/imputation_glimpse1.config @@ -61,7 +61,7 @@ process { withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_IMPUTE_GLIMPSE1:GLIMPSE_PHASE' { ext.args = 
["--impute-reference-only-variants"].join(' ') ext.prefix = { "${meta.id}.batch${meta.batch}.${meta.chunk.replace(':','_')}.glimpse1" } - ext.suffix = "bcf.gz" + ext.suffix = "bcf" publishDir = [ enabled: false ] } From 14d8aee023100f5e0d1529cdd502857670e00fe3 Mon Sep 17 00:00:00 2001 From: LouisLeNezet Date: Wed, 27 Nov 2024 17:56:38 +0100 Subject: [PATCH 05/17] Fix multiqc file names --- conf/modules.config | 29 ++++++++- conf/steps/panel_prep.config | 27 ++++++-- conf/steps/simulation.config | 3 + conf/steps/validation.config | 21 ++++++- modules/nf-core/gawk/main.nf | 10 +-- .../local/bam_extract_region_samtools/main.nf | 2 +- .../local/vcf_concordance_glimpse2/main.nf | 18 +++++- workflows/phaseimpute/main.nf | 63 ++++++++++++++----- 8 files changed, 142 insertions(+), 31 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 74062fa8..0e75eb81 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -32,11 +32,22 @@ process { // Coverage process withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:SAMTOOLS_COVERAGE_INP' { cache = "lenient" - ext.prefix = { "${meta.id}.truth" } + ext.prefix = { "${meta.id}.truth.allchr" } publishDir = [ enabled: false ] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:SAMTOOLS_COVERAGE_DWN' { cache = "lenient" + ext.prefix = { "${meta.id}.allchr" } + publishDir = [ enabled: false ] + } + + // Filter chromosomes in coverage + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:FILTER_CHR_INP' { + ext.prefix = { "${meta.id}.truth" } + publishDir = [ enabled: false ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:FILTER_CHR_DWN' { ext.prefix = { "${meta.id}" } publishDir = [ enabled: false ] } @@ -86,8 +97,22 @@ process { ] } + // Compute sample files for renaming + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BCFTOOLS_QUERY_IMPUTED' { + tag = { "${meta.id} Batch ${meta.batch} ${meta.tools}" } + ext.args = '--list-samples' + publishDir = [enabled: false] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GAWK_IMPUTED' { + tag = { 
"${meta.id} Batch ${meta.batch} ${meta.tools}" } + ext.prefix = { "${meta.id}_samples"} + ext.args2 = { "-v tools=\"${meta.tools}\" " + "'BEGIN { OFS = \"\\t\" } { print \$1, \"-\", \$1\".\"tools }'" } + publishDir = [enabled: false] + } + // Split by samples for each tool - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SPLIT_BCFTOOLS:BCFTOOLS_PLUGINSPLIT' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:SPLIT_IMPUTED:BCFTOOLS_PLUGINSPLIT' { tag = { "${meta.id} Batch ${meta.batch} ${meta.tools}" } ext.args = ["--output-type z", "--write-index=tbi"].join(' ') publishDir = [ diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index bd68b78e..99efa33b 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -56,8 +56,13 @@ process { publishDir = [ path: { "${params.outdir}/prep_panel/panel" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: { !params.compute_freq && !params.phase } + saveAs: { + if ( !params.compute_freq && !params.phase ) { + filename -> filename.equals('versions.yml') ? null : filename + } else { + null + } + } ] } @@ -66,8 +71,13 @@ process { publishDir = [ path: { "${params.outdir}/prep_panel/panel" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: { !params.phase } + saveAs: { + if ( !params.phase ) { + filename -> filename.equals('versions.yml') ? null : filename + } else { + null + } + } ] } @@ -76,8 +86,13 @@ process { publishDir = [ path: { "${params.outdir}/prep_panel/panel" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: { !params.phase } + saveAs: { + if ( !params.phase ) { + filename -> filename.equals('versions.yml') ? 
null : filename + } else { + null + } + } ] } diff --git a/conf/steps/simulation.config b/conf/steps/simulation.config index 71172319..17035abb 100644 --- a/conf/steps/simulation.config +++ b/conf/steps/simulation.config @@ -23,6 +23,7 @@ process { } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_EXTRACT_REGION_SAMTOOLS:SAMTOOLS_MERGE' { + cache = "lenient" ext.prefix = { "${meta.id}" } tag = {"${meta.id} ${meta.chr}"} } @@ -34,6 +35,7 @@ process { withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_DOWNSAMPLE_SAMTOOLS:.*' { tag = {"${meta.id} ${meta.chr}"} } + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_DOWNSAMPLE_SAMTOOLS:SAMTOOLS_DEPTH' { publishDir = [enabled: false] ext.prefix = { "${meta1.id}_C${meta1.chr ?: "all"}.depth" } @@ -49,6 +51,7 @@ process { } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_DOWNSAMPLE_SAMTOOLS:SAMTOOLS_VIEW' { + cache = "lenient" ext.args = ["--output-fmt bam", "--write-index"].join(' ') ext.prefix = { "${meta.id}.depth_${meta.depth}x" } publishDir = [ diff --git a/conf/steps/validation.config b/conf/steps/validation.config index a95795b8..58103506 100644 --- a/conf/steps/validation.config +++ b/conf/steps/validation.config @@ -27,6 +27,7 @@ process { withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GL_TRUTH:BCFTOOLS_MPILEUP' { label = 'process_high' + cache = "lenient" ext.args = [ "-I", "-E", @@ -66,13 +67,13 @@ process { } // Compute sample files for renaming - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BCFTOOLS_QUERY' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BCFTOOLS_QUERY_TRUTH' { tag = { "${meta.id} Batch ${meta.batch} ${meta.tools}" } ext.args = '--list-samples' publishDir = [enabled: false] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GAWK' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GAWK_TRUTH' { tag = { "${meta.id} Batch ${meta.batch} ${meta.tools}" } ext.prefix = { "${meta.id}_samples"} ext.args2 = "'BEGIN { OFS = \"\\t\" } { print \$1, \"-\", \$1\".truth\" }'" @@ -104,6 +105,22 @@ process { publishDir = [ enabled: false ] } + 
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CONCORDANCE_GLIMPSE2:GAWK_ERROR_SPL' { + tag = { "${meta.id} Batch ${meta.batch} ${meta.tools}" } + ext.prefix = { "${meta.id}${meta.panel ? '_P' + meta.panel : ''}${meta.tools ? '_T' + meta.tools : ''}.concordance.renamed.error.spl" } + ext.suffix = "txt.gz" + ext.args2 = { "-v tool=\"${meta.tools}\" " + "'BEGIN { OFS = \" \" } !/^#/ { \$3 = \$3\".\"tool } { print }'" } + publishDir = [enabled: false] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CONCORDANCE_GLIMPSE2:GAWK_RSQUARE_SPL' { + tag = { "${meta.id} Batch ${meta.batch} ${meta.tools}" } + ext.prefix = { "${meta.id}${meta.panel ? '_P' + meta.panel : ''}${meta.tools ? '_T' + meta.tools : ''}.concordance.renamed.rsquare.spl" } + ext.suffix = "txt.gz" + ext.args2 = { "-v tool=\"${meta.tools}\" " + "'BEGIN { OFS = \" \" } !/^#/ { \$1 = \$1\".\"tool } { print }'" } + publishDir = [enabled: false] + } + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CONCORDANCE_GLIMPSE2:GUNZIP' { ext.prefix = { "${meta.id}${meta.panel ? '_P' + meta.panel : ''}${meta.tools ? '_T' + meta.tools : ''}" } publishDir = [ enabled: false ] diff --git a/modules/nf-core/gawk/main.nf b/modules/nf-core/gawk/main.nf index 7514246e..dca68b69 100644 --- a/modules/nf-core/gawk/main.nf +++ b/modules/nf-core/gawk/main.nf @@ -24,10 +24,11 @@ process GAWK { prefix = task.ext.prefix ?: "${meta.id}" suffix = task.ext.suffix ?: "${input.collect{ it.getExtension()}.get(0)}" // use the first extension of the input files - program = program_file ? "-f ${program_file}" : "${args2}" - lst_gz = input.collect{ it.getExtension().endsWith("gz") } - unzip = lst_gz.contains(false) ? "" : "find ${input} -exec zcat {} \\; | \\" - input_cmd = unzip ? "" : "${input}" + program = program_file ? "-f ${program_file}" : "${args2}" + lst_gz = input.collect{ it.getExtension().endsWith("gz") } + unzip = lst_gz.contains(false) ? "" : "find ${input} -exec zcat {} \\; | \\" + input_cmd = unzip ? 
"" : "${input}" + output_cmd = suffix.endsWith("gz") ? "| gzip" : "" """ ${unzip} @@ -35,6 +36,7 @@ process GAWK { ${args} \\ ${program} \\ ${input_cmd} \\ + ${output_cmd} \\ > ${prefix}.${suffix} cat <<-END_VERSIONS > versions.yml diff --git a/subworkflows/local/bam_extract_region_samtools/main.nf b/subworkflows/local/bam_extract_region_samtools/main.nf index 42f6a218..a8248ef8 100644 --- a/subworkflows/local/bam_extract_region_samtools/main.nf +++ b/subworkflows/local/bam_extract_region_samtools/main.nf @@ -35,7 +35,7 @@ workflow BAM_EXTRACT_REGION_SAMTOOLS { .map{ metaICR, bam, index -> [metaICR.subMap("id", "batch") + [chr: "all"], bam, index] } - .groupTuple(), + .groupTuple(sort: true), ch_fasta ) ch_versions = ch_versions.mix(SAMTOOLS_MERGE.out.versions.first()) diff --git a/subworkflows/local/vcf_concordance_glimpse2/main.nf b/subworkflows/local/vcf_concordance_glimpse2/main.nf index 82ba6592..ec4b04a0 100644 --- a/subworkflows/local/vcf_concordance_glimpse2/main.nf +++ b/subworkflows/local/vcf_concordance_glimpse2/main.nf @@ -2,6 +2,8 @@ include { GLIMPSE2_CONCORDANCE } from '../../../modules/nf-core/glimpse2/ include { GAWK } from '../../../modules/nf-core/gawk' include { ADD_COLUMNS } from '../../../modules/local/add_columns' include { GUNZIP } from '../../../modules/nf-core/gunzip' +include { GAWK as GAWK_ERROR_SPL } from '../../../modules/nf-core/gawk' +include { GAWK as GAWK_RSQUARE_SPL } from '../../../modules/nf-core/gawk' workflow VCF_CONCORDANCE_GLIMPSE2 { @@ -35,11 +37,23 @@ workflow VCF_CONCORDANCE_GLIMPSE2 { ) ch_versions = ch_versions.mix(GLIMPSE2_CONCORDANCE.out.versions.first()) + GAWK_ERROR_SPL( + GLIMPSE2_CONCORDANCE.out.errors_spl, + [] + ) + ch_versions = ch_versions.mix(GAWK_ERROR_SPL.out.versions.first()) + + GAWK_RSQUARE_SPL( + GLIMPSE2_CONCORDANCE.out.rsquare_spl, + [] + ) + ch_versions = ch_versions.mix(GAWK_RSQUARE_SPL.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix(GLIMPSE2_CONCORDANCE.out.errors_cal.map{ _meta, txt
-> [txt]}) ch_multiqc_files = ch_multiqc_files.mix(GLIMPSE2_CONCORDANCE.out.errors_grp.map{ _meta, txt -> [txt]}) - ch_multiqc_files = ch_multiqc_files.mix(GLIMPSE2_CONCORDANCE.out.errors_spl.map{ _meta, txt -> [txt]}) + ch_multiqc_files = ch_multiqc_files.mix(GAWK_ERROR_SPL.out.output.map{ _meta, txt -> [txt]}) ch_multiqc_files = ch_multiqc_files.mix(GLIMPSE2_CONCORDANCE.out.rsquare_grp.map{ _meta, txt -> [txt]}) - ch_multiqc_files = ch_multiqc_files.mix(GLIMPSE2_CONCORDANCE.out.rsquare_spl.map{ _meta, txt -> [txt]}) + ch_multiqc_files = ch_multiqc_files.mix(GAWK_RSQUARE_SPL.out.output.map{ _meta, txt -> [txt]}) ch_multiqc_files = ch_multiqc_files.mix(GLIMPSE2_CONCORDANCE.out.rsquare_per_site.map{ _meta, txt -> [txt]}) GUNZIP(GLIMPSE2_CONCORDANCE.out.errors_grp) diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 8e91d7f8..dbd0ac04 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -25,6 +25,8 @@ include { BAM_EXTRACT_REGION_SAMTOOLS } from '../../subworkflows/ include { BAM_DOWNSAMPLE_SAMTOOLS } from '../../subworkflows/local/bam_downsample_samtools' include { SAMTOOLS_COVERAGE as SAMTOOLS_COVERAGE_INP } from '../../modules/nf-core/samtools/coverage' include { SAMTOOLS_COVERAGE as SAMTOOLS_COVERAGE_DWN } from '../../modules/nf-core/samtools/coverage' +include { GAWK as FILTER_CHR_INP } from '../../modules/nf-core/gawk' +include { GAWK as FILTER_CHR_DWN } from '../../modules/nf-core/gawk' // Panelprep subworkflows include { VCF_NORMALIZE_BCFTOOLS } from '../../subworkflows/local/vcf_normalize_bcftools' @@ -36,7 +38,9 @@ include { BCFTOOLS_STATS as BCFTOOLS_STATS_PANEL } from '../../modules/nf-co // Imputation include { LIST_TO_FILE } from '../../modules/local/list_to_file' -include { VCF_SPLIT_BCFTOOLS } from '../../subworkflows/local/vcf_split_bcftools' +include { BCFTOOLS_QUERY as BCFTOOLS_QUERY_IMPUTED } from '../../modules/nf-core/bcftools/query' +include { GAWK as GAWK_IMPUTED } from 
'../../modules/nf-core/gawk' +include { VCF_SPLIT_BCFTOOLS as SPLIT_IMPUTED } from '../../subworkflows/local/vcf_split_bcftools' // GLIMPSE1 subworkflows include { BAM_GL_BCFTOOLS as GL_GLIMPSE1 } from '../../subworkflows/local/bam_gl_bcftools' @@ -61,8 +65,8 @@ include { BCFTOOLS_STATS as BCFTOOLS_STATS_TOOLS } from '../../modules/nf-co // Concordance subworkflows include { BAM_GL_BCFTOOLS as GL_TRUTH } from '../../subworkflows/local/bam_gl_bcftools' -include { BCFTOOLS_QUERY } from '../../modules/nf-core/bcftools/query' -include { GAWK } from '../../modules/nf-core/gawk' +include { BCFTOOLS_QUERY as BCFTOOLS_QUERY_TRUTH } from '../../modules/nf-core/bcftools/query' +include { GAWK as GAWK_TRUTH } from '../../modules/nf-core/gawk' include { VCF_SPLIT_BCFTOOLS as SPLIT_TRUTH } from '../../subworkflows/local/vcf_split_bcftools' include { BCFTOOLS_STATS as BCFTOOLS_STATS_TRUTH } from '../../modules/nf-core/bcftools/stats' include { VCF_CONCATENATE_BCFTOOLS as CONCAT_TRUTH } from '../../subworkflows/local/vcf_concatenate_bcftools' @@ -116,10 +120,28 @@ workflow PHASEIMPUTE { // Use input for simulation as truth for validation step ch_input_truth = ch_input_sim + // Program to filter chromosomes + filter_chr_program = ch_region + .collect{ meta, region -> meta.chr } + .map { chr -> + "BEGIN { FS=\"\t\";\nsplit(\"" + chr.join(" ") + '", chr, " ");\n' + + 'for (i in chr) {\nchr_map[chr[i]] = 1;\n}\n}\n' + + 'NR == 1 || (\$1 in chr_map){\nprint \$0;\n}' + } + .collectFile(name:"program.txt") + .collect() + // Compute coverage of input files SAMTOOLS_COVERAGE_INP(ch_input_sim, ch_fasta) - ch_versions = ch_versions.mix(SAMTOOLS_COVERAGE_INP.out.versions) - ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_COVERAGE_INP.out.coverage.map{it[1]}) + ch_versions = ch_versions.mix(SAMTOOLS_COVERAGE_INP.out.versions) + ch_coverage = SAMTOOLS_COVERAGE_INP.out.coverage + + FILTER_CHR_INP( + SAMTOOLS_COVERAGE_INP.out.coverage, + filter_chr_program + ) + ch_versions = 
ch_versions.mix(FILTER_CHR_INP.out.versions) + ch_multiqc_files = ch_multiqc_files.mix(FILTER_CHR_INP.out.output.map{ it[1] }) if (params.depth) { // Downsample input to desired depth @@ -129,10 +151,17 @@ workflow PHASEIMPUTE { // Compute coverage of input files SAMTOOLS_COVERAGE_DWN(BAM_DOWNSAMPLE_SAMTOOLS.out.bam_emul, ch_fasta) - ch_versions = ch_versions.mix(SAMTOOLS_COVERAGE_DWN.out.versions) - ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_COVERAGE_DWN.out.coverage.map{it[1]}) - } + ch_versions = ch_versions.mix(SAMTOOLS_COVERAGE_DWN.out.versions) + FILTER_CHR_DWN( + SAMTOOLS_COVERAGE_DWN.out.coverage, + filter_chr_program + ) + ch_versions = ch_versions.mix(FILTER_CHR_DWN.out.versions) + ch_multiqc_files = ch_multiqc_files.mix(FILTER_CHR_DWN.out.output.map{ it[1] }) + } + + if (params.genotype) { error "Genotype simulation not yet implemented" } @@ -369,9 +398,15 @@ workflow PHASEIMPUTE { ch_input_validate = ch_input_validate.mix(CONCAT_QUILT.out.vcf_tbi) } + // Prepare renaming file + BCFTOOLS_QUERY_IMPUTED(ch_input_validate, [], [], []) + GAWK_IMPUTED(BCFTOOLS_QUERY_IMPUTED.out.output, []) + ch_split_imputed = ch_input_validate.join(GAWK_IMPUTED.out.output) + // Split result by samples - VCF_SPLIT_BCFTOOLS(ch_input_validate.map{ [it[0], it[1], it[2], []] }) - ch_input_validate = VCF_SPLIT_BCFTOOLS.out.vcf_tbi + SPLIT_IMPUTED(ch_split_imputed) + ch_versions = ch_versions.mix(SPLIT_IMPUTED.out.versions) + ch_input_validate = SPLIT_IMPUTED.out.vcf_tbi // Compute stats on imputed files BCFTOOLS_STATS_TOOLS( @@ -442,12 +477,12 @@ workflow PHASEIMPUTE { ch_versions = ch_versions.mix(CONCAT_TRUTH.out.versions) // Prepare renaming file - BCFTOOLS_QUERY(CONCAT_TRUTH.out.vcf_tbi, [], [], []) - GAWK(BCFTOOLS_QUERY.out.output, []) - ch_pluginsplit = CONCAT_TRUTH.out.vcf_tbi.join(GAWK.out.output.view()) + BCFTOOLS_QUERY_TRUTH(CONCAT_TRUTH.out.vcf_tbi, [], [], []) + GAWK_TRUTH(BCFTOOLS_QUERY_TRUTH.out.output, []) + ch_split_truth = 
CONCAT_TRUTH.out.vcf_tbi.join(GAWK_TRUTH.out.output) // Split truth vcf by samples - SPLIT_TRUTH(ch_pluginsplit) + SPLIT_TRUTH(ch_split_truth) ch_versions = ch_versions.mix(SPLIT_TRUTH.out.versions) // Compute stats on truth files From 298296bf53369fc69d81be25d3b8965ef49178d9 Mon Sep 17 00:00:00 2001 From: LouisLeNezet Date: Wed, 27 Nov 2024 17:58:26 +0100 Subject: [PATCH 06/17] Update CHANGELOG --- CHANGELOG.md | 1 + workflows/phaseimpute/main.nf | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index afb17bd3..8078f3cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -81,6 +81,7 @@ Special thanks to [Matthias Hörtenhuber](https://github.com/mashehu), [Mazzalab - [#153](https://github.com/nf-core/phaseimpute/pull/153) - Fix getFileExtension function. Fix image in `usage.md`. Fix small warnings and errors with updated language server. `def` has been added when necessary, `:` use instead of `,` in assertions, `_` added to variables not used in closures, `for` loop replaced by `.each{}`, remove unused code / input. - [#161](https://github.com/nf-core/phaseimpute/pull/161) - Fix `VCF_SPLIT_BCFTOOLS` when only one sample present by updating `BCFTOOLS_PLUGINSPLIT` and adding `BCFTOOLS_QUERY` to get truth samples names for renaming the resulting files. - [#162](https://github.com/nf-core/phaseimpute/pull/162) - Fix `fai` usage when provided by `genomes` parameter. +- [#163](https://github.com/nf-core/phaseimpute/pull/163) - Fix MULTIQC samples names. Fix output panel `publishDir`.
- [#164](https://github.com/nf-core/phaseimpute/pull/164) - Improve documentation writing ### `Dependencies` diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index dbd0ac04..1e914b56 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -160,8 +160,7 @@ workflow PHASEIMPUTE { ch_versions = ch_versions.mix(FILTER_CHR_DWN.out.versions) ch_multiqc_files = ch_multiqc_files.mix(FILTER_CHR_DWN.out.output.map{ it[1] }) } - - + if (params.genotype) { error "Genotype simulation not yet implemented" } From 21656bb9b852098b49308f0b2d7ab460fd56db97 Mon Sep 17 00:00:00 2001 From: LouisLeNezet Date: Wed, 27 Nov 2024 18:48:47 +0100 Subject: [PATCH 07/17] Update snapshot --- workflows/phaseimpute/tests/main.nf.test.snap | 146 +++++++++--------- 1 file changed, 73 insertions(+), 73 deletions(-) diff --git a/workflows/phaseimpute/tests/main.nf.test.snap b/workflows/phaseimpute/tests/main.nf.test.snap index 8553f530..43ece590 100644 --- a/workflows/phaseimpute/tests/main.nf.test.snap +++ b/workflows/phaseimpute/tests/main.nf.test.snap @@ -10,12 +10,12 @@ "imputation/stats/NA20359.stitch.bcftools_stats.txt", "imputation/stitch/concat/all.batch0.stitch.vcf.gz", "imputation/stitch/concat/all.batch0.stitch.vcf.gz.tbi", - "imputation/stitch/samples/NA12878.vcf.gz", - "imputation/stitch/samples/NA12878.vcf.gz.tbi", - "imputation/stitch/samples/NA19401.vcf.gz", - "imputation/stitch/samples/NA19401.vcf.gz.tbi", - "imputation/stitch/samples/NA20359.vcf.gz", - "imputation/stitch/samples/NA20359.vcf.gz.tbi" + "imputation/stitch/samples/NA12878.stitch.vcf.gz", + "imputation/stitch/samples/NA12878.stitch.vcf.gz.tbi", + "imputation/stitch/samples/NA19401.stitch.vcf.gz", + "imputation/stitch/samples/NA19401.stitch.vcf.gz.tbi", + "imputation/stitch/samples/NA20359.stitch.vcf.gz", + "imputation/stitch/samples/NA20359.stitch.vcf.gz.tbi" ], "VcfFile [chromosomes=[chr21, chr22], sampleCount=3, variantCount=1739, phased=false]", [ @@ -28,7 +28,7 @@ 
"nf-test": "0.9.2", "nextflow": "24.10.1" }, - "timestamp": "2024-11-22T13:07:42.012993182" + "timestamp": "2024-11-27T18:17:15.103980096" }, "Check test_all": { "content": [ @@ -47,28 +47,28 @@ "imputation/csv/impute.csv", "imputation/glimpse1/concat/all.batch0.glimpse1.vcf.gz", "imputation/glimpse1/concat/all.batch0.glimpse1.vcf.gz.tbi", - "imputation/glimpse1/samples/NA12878.vcf.gz", - "imputation/glimpse1/samples/NA12878.vcf.gz.tbi", - "imputation/glimpse1/samples/NA19401.vcf.gz", - "imputation/glimpse1/samples/NA19401.vcf.gz.tbi", - "imputation/glimpse1/samples/NA20359.vcf.gz", - "imputation/glimpse1/samples/NA20359.vcf.gz.tbi", + "imputation/glimpse1/samples/NA12878.glimpse1.vcf.gz", + "imputation/glimpse1/samples/NA12878.glimpse1.vcf.gz.tbi", + "imputation/glimpse1/samples/NA19401.glimpse1.vcf.gz", + "imputation/glimpse1/samples/NA19401.glimpse1.vcf.gz.tbi", + "imputation/glimpse1/samples/NA20359.glimpse1.vcf.gz", + "imputation/glimpse1/samples/NA20359.glimpse1.vcf.gz.tbi", "imputation/glimpse2/concat/all.batch0.glimpse2.vcf.gz", "imputation/glimpse2/concat/all.batch0.glimpse2.vcf.gz.tbi", - "imputation/glimpse2/samples/NA12878.vcf.gz", - "imputation/glimpse2/samples/NA12878.vcf.gz.tbi", - "imputation/glimpse2/samples/NA19401.vcf.gz", - "imputation/glimpse2/samples/NA19401.vcf.gz.tbi", - "imputation/glimpse2/samples/NA20359.vcf.gz", - "imputation/glimpse2/samples/NA20359.vcf.gz.tbi", + "imputation/glimpse2/samples/NA12878.glimpse2.vcf.gz", + "imputation/glimpse2/samples/NA12878.glimpse2.vcf.gz.tbi", + "imputation/glimpse2/samples/NA19401.glimpse2.vcf.gz", + "imputation/glimpse2/samples/NA19401.glimpse2.vcf.gz.tbi", + "imputation/glimpse2/samples/NA20359.glimpse2.vcf.gz", + "imputation/glimpse2/samples/NA20359.glimpse2.vcf.gz.tbi", "imputation/quilt/concat/all.batch0.quilt.vcf.gz", "imputation/quilt/concat/all.batch0.quilt.vcf.gz.tbi", - "imputation/quilt/samples/NA12878.vcf.gz", - "imputation/quilt/samples/NA12878.vcf.gz.tbi", - 
"imputation/quilt/samples/NA19401.vcf.gz", - "imputation/quilt/samples/NA19401.vcf.gz.tbi", - "imputation/quilt/samples/NA20359.vcf.gz", - "imputation/quilt/samples/NA20359.vcf.gz.tbi", + "imputation/quilt/samples/NA12878.quilt.vcf.gz", + "imputation/quilt/samples/NA12878.quilt.vcf.gz.tbi", + "imputation/quilt/samples/NA19401.quilt.vcf.gz", + "imputation/quilt/samples/NA19401.quilt.vcf.gz.tbi", + "imputation/quilt/samples/NA20359.quilt.vcf.gz", + "imputation/quilt/samples/NA20359.quilt.vcf.gz.tbi", "imputation/stats/NA12878.glimpse1.bcftools_stats.txt", "imputation/stats/NA12878.glimpse2.bcftools_stats.txt", "imputation/stats/NA12878.quilt.bcftools_stats.txt", @@ -83,12 +83,12 @@ "imputation/stats/NA20359.stitch.bcftools_stats.txt", "imputation/stitch/concat/all.batch0.stitch.vcf.gz", "imputation/stitch/concat/all.batch0.stitch.vcf.gz.tbi", - "imputation/stitch/samples/NA12878.vcf.gz", - "imputation/stitch/samples/NA12878.vcf.gz.tbi", - "imputation/stitch/samples/NA19401.vcf.gz", - "imputation/stitch/samples/NA19401.vcf.gz.tbi", - "imputation/stitch/samples/NA20359.vcf.gz", - "imputation/stitch/samples/NA20359.vcf.gz.tbi" + "imputation/stitch/samples/NA12878.stitch.vcf.gz", + "imputation/stitch/samples/NA12878.stitch.vcf.gz.tbi", + "imputation/stitch/samples/NA19401.stitch.vcf.gz", + "imputation/stitch/samples/NA19401.stitch.vcf.gz.tbi", + "imputation/stitch/samples/NA20359.stitch.vcf.gz", + "imputation/stitch/samples/NA20359.stitch.vcf.gz.tbi" ], [ "prep_panel/chunks/glimpse1/1000GP_chr21_chunks_glimpse1.txt", @@ -169,7 +169,7 @@ "nf-test": "0.9.2", "nextflow": "24.10.1" }, - "timestamp": "2024-11-22T13:58:54.188813202" + "timestamp": "2024-11-27T18:23:38.85114929" }, "Check test_validate": { "content": [ @@ -207,22 +207,22 @@ "imputation/glimpse2/concat/all.batch0.glimpse2.vcf.gz.tbi", "imputation/glimpse2/concat/all.batch1.glimpse2.vcf.gz", "imputation/glimpse2/concat/all.batch1.glimpse2.vcf.gz.tbi", - "imputation/glimpse2/samples/NA12878.vcf.gz", - 
"imputation/glimpse2/samples/NA12878.vcf.gz.tbi", - "imputation/glimpse2/samples/NA19401.vcf.gz", - "imputation/glimpse2/samples/NA19401.vcf.gz.tbi", - "imputation/glimpse2/samples/NA20359.vcf.gz", - "imputation/glimpse2/samples/NA20359.vcf.gz.tbi", + "imputation/glimpse2/samples/NA12878.glimpse2.vcf.gz", + "imputation/glimpse2/samples/NA12878.glimpse2.vcf.gz.tbi", + "imputation/glimpse2/samples/NA19401.glimpse2.vcf.gz", + "imputation/glimpse2/samples/NA19401.glimpse2.vcf.gz.tbi", + "imputation/glimpse2/samples/NA20359.glimpse2.vcf.gz", + "imputation/glimpse2/samples/NA20359.glimpse2.vcf.gz.tbi", "imputation/quilt/concat/all.batch0.quilt.vcf.gz", "imputation/quilt/concat/all.batch0.quilt.vcf.gz.tbi", "imputation/quilt/concat/all.batch1.quilt.vcf.gz", "imputation/quilt/concat/all.batch1.quilt.vcf.gz.tbi", - "imputation/quilt/samples/NA12878.vcf.gz", - "imputation/quilt/samples/NA12878.vcf.gz.tbi", - "imputation/quilt/samples/NA19401.vcf.gz", - "imputation/quilt/samples/NA19401.vcf.gz.tbi", - "imputation/quilt/samples/NA20359.vcf.gz", - "imputation/quilt/samples/NA20359.vcf.gz.tbi", + "imputation/quilt/samples/NA12878.quilt.vcf.gz", + "imputation/quilt/samples/NA12878.quilt.vcf.gz.tbi", + "imputation/quilt/samples/NA19401.quilt.vcf.gz", + "imputation/quilt/samples/NA19401.quilt.vcf.gz.tbi", + "imputation/quilt/samples/NA20359.quilt.vcf.gz", + "imputation/quilt/samples/NA20359.quilt.vcf.gz.tbi", "imputation/stats/NA12878.glimpse2.bcftools_stats.txt", "imputation/stats/NA12878.quilt.bcftools_stats.txt", "imputation/stats/NA19401.glimpse2.bcftools_stats.txt", @@ -271,7 +271,7 @@ "nf-test": "0.9.2", "nextflow": "24.10.1" }, - "timestamp": "2024-11-22T14:06:57.642618122" + "timestamp": "2024-11-27T18:30:51.51345352" }, "Check test_quilt": { "content": [ @@ -281,12 +281,12 @@ "imputation/csv/impute.csv", "imputation/quilt/concat/all.batch0.quilt.vcf.gz", "imputation/quilt/concat/all.batch0.quilt.vcf.gz.tbi", - "imputation/quilt/samples/NA12878.vcf.gz", - 
"imputation/quilt/samples/NA12878.vcf.gz.tbi", - "imputation/quilt/samples/NA19401.vcf.gz", - "imputation/quilt/samples/NA19401.vcf.gz.tbi", - "imputation/quilt/samples/NA20359.vcf.gz", - "imputation/quilt/samples/NA20359.vcf.gz.tbi", + "imputation/quilt/samples/NA12878.quilt.vcf.gz", + "imputation/quilt/samples/NA12878.quilt.vcf.gz.tbi", + "imputation/quilt/samples/NA19401.quilt.vcf.gz", + "imputation/quilt/samples/NA19401.quilt.vcf.gz.tbi", + "imputation/quilt/samples/NA20359.quilt.vcf.gz", + "imputation/quilt/samples/NA20359.quilt.vcf.gz.tbi", "imputation/stats/NA12878.quilt.bcftools_stats.txt", "imputation/stats/NA19401.quilt.bcftools_stats.txt", "imputation/stats/NA20359.quilt.bcftools_stats.txt" @@ -302,7 +302,7 @@ "nf-test": "0.9.2", "nextflow": "24.10.1" }, - "timestamp": "2024-11-23T19:41:54.261988901" + "timestamp": "2024-11-27T18:15:27.194211063" }, "Check test_sim": { "content": [ @@ -362,12 +362,12 @@ "imputation/csv/impute.csv", "imputation/glimpse2/concat/all.batch0.glimpse2.vcf.gz", "imputation/glimpse2/concat/all.batch0.glimpse2.vcf.gz.tbi", - "imputation/glimpse2/samples/NA12878.vcf.gz", - "imputation/glimpse2/samples/NA12878.vcf.gz.tbi", - "imputation/glimpse2/samples/NA19401.vcf.gz", - "imputation/glimpse2/samples/NA19401.vcf.gz.tbi", - "imputation/glimpse2/samples/NA20359.vcf.gz", - "imputation/glimpse2/samples/NA20359.vcf.gz.tbi", + "imputation/glimpse2/samples/NA12878.glimpse2.vcf.gz", + "imputation/glimpse2/samples/NA12878.glimpse2.vcf.gz.tbi", + "imputation/glimpse2/samples/NA19401.glimpse2.vcf.gz", + "imputation/glimpse2/samples/NA19401.glimpse2.vcf.gz.tbi", + "imputation/glimpse2/samples/NA20359.glimpse2.vcf.gz", + "imputation/glimpse2/samples/NA20359.glimpse2.vcf.gz.tbi", "imputation/stats/NA12878.glimpse2.bcftools_stats.txt", "imputation/stats/NA19401.glimpse2.bcftools_stats.txt", "imputation/stats/NA20359.glimpse2.bcftools_stats.txt" @@ -383,7 +383,7 @@ "nf-test": "0.9.2", "nextflow": "24.10.1" }, - "timestamp": 
"2024-11-22T13:03:28.516026252" + "timestamp": "2024-11-27T18:13:04.376764506" }, "Check test": { "content": [ @@ -393,12 +393,12 @@ "imputation/csv/impute.csv", "imputation/glimpse1/concat/all.batch0.glimpse1.vcf.gz", "imputation/glimpse1/concat/all.batch0.glimpse1.vcf.gz.tbi", - "imputation/glimpse1/samples/NA12878.vcf.gz", - "imputation/glimpse1/samples/NA12878.vcf.gz.tbi", - "imputation/glimpse1/samples/NA19401.vcf.gz", - "imputation/glimpse1/samples/NA19401.vcf.gz.tbi", - "imputation/glimpse1/samples/NA20359.vcf.gz", - "imputation/glimpse1/samples/NA20359.vcf.gz.tbi", + "imputation/glimpse1/samples/NA12878.glimpse1.vcf.gz", + "imputation/glimpse1/samples/NA12878.glimpse1.vcf.gz.tbi", + "imputation/glimpse1/samples/NA19401.glimpse1.vcf.gz", + "imputation/glimpse1/samples/NA19401.glimpse1.vcf.gz.tbi", + "imputation/glimpse1/samples/NA20359.glimpse1.vcf.gz", + "imputation/glimpse1/samples/NA20359.glimpse1.vcf.gz.tbi", "imputation/stats/NA12878.glimpse1.bcftools_stats.txt", "imputation/stats/NA19401.glimpse1.bcftools_stats.txt", "imputation/stats/NA20359.glimpse1.bcftools_stats.txt" @@ -414,7 +414,7 @@ "nf-test": "0.9.2", "nextflow": "24.10.1" }, - "timestamp": "2024-11-23T19:36:03.277999895" + "timestamp": "2024-11-27T18:11:44.567996758" }, "Check test_dog": { "content": [ @@ -447,16 +447,16 @@ "imputation/csv/impute.csv", "imputation/glimpse1/concat/dog_1735.batch0.glimpse1.vcf.gz", "imputation/glimpse1/concat/dog_1735.batch0.glimpse1.vcf.gz.tbi", - "imputation/glimpse1/samples/dog_1735.vcf.gz", - "imputation/glimpse1/samples/dog_1735.vcf.gz.tbi", + "imputation/glimpse1/samples/dog_1735.glimpse1.vcf.gz", + "imputation/glimpse1/samples/dog_1735.glimpse1.vcf.gz.tbi", "imputation/glimpse2/concat/all.batch0.glimpse2.vcf.gz", "imputation/glimpse2/concat/all.batch0.glimpse2.vcf.gz.tbi", - "imputation/glimpse2/samples/dog_1735.vcf.gz", - "imputation/glimpse2/samples/dog_1735.vcf.gz.tbi", + "imputation/glimpse2/samples/dog_1735.glimpse2.vcf.gz", + 
"imputation/glimpse2/samples/dog_1735.glimpse2.vcf.gz.tbi", "imputation/quilt/concat/all.batch0.quilt.vcf.gz", "imputation/quilt/concat/all.batch0.quilt.vcf.gz.tbi", - "imputation/quilt/samples/1735.vcf.gz", - "imputation/quilt/samples/1735.vcf.gz.tbi", + "imputation/quilt/samples/1735.quilt.vcf.gz", + "imputation/quilt/samples/1735.quilt.vcf.gz.tbi", "imputation/stats/1735.quilt.bcftools_stats.txt", "imputation/stats/dog_1735.glimpse1.bcftools_stats.txt", "imputation/stats/dog_1735.glimpse2.bcftools_stats.txt" @@ -466,6 +466,6 @@ "nf-test": "0.9.2", "nextflow": "24.10.1" }, - "timestamp": "2024-11-22T13:16:12.803136748" + "timestamp": "2024-11-27T18:25:56.980470563" } } \ No newline at end of file From 8d97f749c0df657215e3d139258eb45583332470 Mon Sep 17 00:00:00 2001 From: Louis Le Nezet Date: Thu, 28 Nov 2024 16:12:40 +0100 Subject: [PATCH 08/17] Update gawk --- modules.json | 5 +- modules/nf-core/gawk/main.nf | 12 ++- modules/nf-core/gawk/tests/main.nf.test | 77 +++++++++++++++++-- modules/nf-core/gawk/tests/main.nf.test.snap | 35 ++++++++- modules/nf-core/gawk/tests/nextflow.config | 4 +- .../tests/nextflow_with_program_file.config | 5 -- 6 files changed, 115 insertions(+), 23 deletions(-) delete mode 100644 modules/nf-core/gawk/tests/nextflow_with_program_file.config diff --git a/modules.json b/modules.json index b40eb508..f3cd1cb6 100644 --- a/modules.json +++ b/modules.json @@ -72,9 +72,8 @@ }, "gawk": { "branch": "master", - "git_sha": "97321eded31a12598837a476d3615300af413bb7", - "installed_by": ["modules"], - "patch": "modules/nf-core/gawk/gawk.diff" + "git_sha": "caab1314ca62679b629da4c79afa9a4cab2bb8ee", + "installed_by": ["modules"] }, "glimpse/chunk": { "branch": "master", diff --git a/modules/nf-core/gawk/main.nf b/modules/nf-core/gawk/main.nf index dca68b69..b9df2b8c 100644 --- a/modules/nf-core/gawk/main.nf +++ b/modules/nf-core/gawk/main.nf @@ -24,12 +24,16 @@ process GAWK { prefix = task.ext.prefix ?: "${meta.id}" suffix = task.ext.suffix ?: 
"${input.collect{ it.getExtension()}.get(0)}" // use the first extension of the input files - program = program_file ? "-f ${program_file}" : "${args2}" - lst_gz = input.collect{ it.getExtension().endsWith("gz") } - unzip = lst_gz.contains(false) ? "" : "find ${input} -exec zcat {} \\; | \\" - input_cmd = unzip ? "" : "${input}" + program = program_file ? "-f ${program_file}" : "${args2}" + lst_gz = input.collect{ it.getExtension().endsWith("gz") } + unzip = lst_gz.contains(false) ? "" : "find ${input} -exec zcat {} \\; | \\" + input_cmd = unzip ? "" : "${input}" output_cmd = suffix.endsWith("gz") ? "| gzip" : "" + input.collect{ + assert it.name != "${prefix}.${suffix}" : "Input and output names are the same, set prefix in module configuration to disambiguate!" + } + """ ${unzip} awk \\ diff --git a/modules/nf-core/gawk/tests/main.nf.test b/modules/nf-core/gawk/tests/main.nf.test index 5952e9a2..b3cde8bf 100644 --- a/modules/nf-core/gawk/tests/main.nf.test +++ b/modules/nf-core/gawk/tests/main.nf.test @@ -8,10 +8,14 @@ nextflow_process { tag "modules_nfcore" tag "gawk" - test("Convert fasta to bed") { - config "./nextflow.config" + config "./nextflow.config" + test("Convert fasta to bed") { when { + params { + gawk_suffix = "bed" + gawk_args2 = '\'BEGIN {FS="\t"}; {print \$1 FS "0" FS \$2}\'' + } process { """ input[0] = [ @@ -32,9 +36,11 @@ nextflow_process { } test("Convert fasta to bed with program file") { - config "./nextflow_with_program_file.config" - when { + params { + gawk_suffix = "bed" + gawk_args2 = "" + } process { """ input[0] = [ @@ -55,9 +61,11 @@ nextflow_process { } test("Extract first column from multiple files") { - config "./nextflow_with_program_file.config" - tag "test" when { + params { + gawk_suffix = "bed" + gawk_args2 = "" + } process { """ input[0] = [ @@ -79,9 +87,11 @@ nextflow_process { } test("Unzip files before processing") { - config "./nextflow_with_program_file.config" - when { + params { + gawk_suffix = "bed" + gawk_args2 = "" 
+ } process { """ input[0] = [ @@ -101,4 +111,55 @@ nextflow_process { ) } } + + test("Compress after processing") { + when { + params { + gawk_suffix = "txt.gz" + gawk_args2 = '\'BEGIN {FS="\t"}; {print \$1 FS "0" FS \$2}\'' + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("Input and output files are similar") { + when { + params { + gawk_suffix = "txt" + gawk_args2 = "" + } + process { + """ + input[0] = [ + [ id:'hello' ], // meta map + [file(params.modules_testdata_base_path + 'generic/txt/hello.txt', checkIfExists: true), + file(params.modules_testdata_base_path + 'generic/txt/species_names.txt', checkIfExists: true)] + ] + input[1] = Channel.of('BEGIN {FS=" "}; {print \$1}').collectFile(name:"program.txt") + """ + } + } + + then { + assertAll( + { assert process.failed }, + { assert process.errorReport.contains("Input and output names are the same, set prefix in module configuration to disambiguate!") } + ) + } + } } \ No newline at end of file diff --git a/modules/nf-core/gawk/tests/main.nf.test.snap b/modules/nf-core/gawk/tests/main.nf.test.snap index d396f738..1b3c2f71 100644 --- a/modules/nf-core/gawk/tests/main.nf.test.snap +++ b/modules/nf-core/gawk/tests/main.nf.test.snap @@ -1,4 +1,37 @@ { + "Compress after processing": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.txt.gz:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "1": [ + "versions.yml:md5,842acc9870dc8ac280954047cb2aa23a" + ], + "output": [ + [ + { + "id": "test" + }, + "test.txt.gz:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "versions": [ + "versions.yml:md5,842acc9870dc8ac280954047cb2aa23a" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.1" + }, + "timestamp": 
"2024-11-27T17:11:20.054143406" + }, "Convert fasta to bed": { "content": [ { @@ -131,4 +164,4 @@ }, "timestamp": "2024-10-19T22:08:19.533527657" } -} +} \ No newline at end of file diff --git a/modules/nf-core/gawk/tests/nextflow.config b/modules/nf-core/gawk/tests/nextflow.config index 6e5d43a3..895709a7 100644 --- a/modules/nf-core/gawk/tests/nextflow.config +++ b/modules/nf-core/gawk/tests/nextflow.config @@ -1,6 +1,6 @@ process { withName: GAWK { - ext.suffix = "bed" - ext.args2 = '\'BEGIN {FS="\t"}; {print \$1 FS "0" FS \$2}\'' + ext.suffix = params.gawk_suffix + ext.args2 = params.gawk_args2 } } diff --git a/modules/nf-core/gawk/tests/nextflow_with_program_file.config b/modules/nf-core/gawk/tests/nextflow_with_program_file.config deleted file mode 100644 index 693ad419..00000000 --- a/modules/nf-core/gawk/tests/nextflow_with_program_file.config +++ /dev/null @@ -1,5 +0,0 @@ -process { - withName: GAWK { - ext.suffix = "bed" - } -} From 8e271b4cebd26424ec53c52a53169c7223cd3ec3 Mon Sep 17 00:00:00 2001 From: Louis Le Nezet Date: Mon, 2 Dec 2024 09:32:04 +0100 Subject: [PATCH 09/17] Update CHANGELOG --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8078f3cc..fb034907 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -67,7 +67,7 @@ Special thanks to [Matthias Hörtenhuber](https://github.com/mashehu), [Mazzalab - [#160](https://github.com/nf-core/phaseimpute/pull/160) - Improve `CHANGELOG.md` and add details to `usage.md` - [#158](https://github.com/nf-core/phaseimpute/pull/158) - Remove frequency computation and phasing from full test to reduce cost and computational time. - [#164](https://github.com/nf-core/phaseimpute/pull/164) - Rename `BAM_REGION_SAMTOOLS` to `BAM_EXTRACT_REGION_SAMTOOLS`. Remove `GLIMPSE2_SPLITREFERENCE` as it is not used. Add more steps to `test_all` profile for more exhaustivity. 
-- [#163](https://github.com/nf-core/phaseimpute/pull/163) - Improve configuration for demanding processes. Use Genome in a Bottle VCF benchmarking file for AWS full test. +- [#163](https://github.com/nf-core/phaseimpute/pull/163) - Improve configuration for demanding processes. Use Genome in a Bottle VCF benchmarking file for AWS full test. Moved from `glimpse1` to `glimpse2` for the full test profile. ### `Fixed` @@ -81,7 +81,7 @@ Special thanks to [Matthias Hörtenhuber](https://github.com/mashehu), [Mazzalab - [#153](https://github.com/nf-core/phaseimpute/pull/153) - Fix getFileExtension function. Fix image in `usage.md`. Fix small warnings and errors with updated language server. `def` has been added when necessary, `:` use instead of `,` in assertions, `_` added to variables not used in closures, `for` loop replaced by `.each{}`, remove unused code / input. - [#161](https://github.com/nf-core/phaseimpute/pull/161) - Fix `VCF_SPLIT_BCFTOOLS` when only one sample present by updating `BCFTOOLS_PLUGINSPLIT` and adding `BCFTOOLS_QUERY` to get truth samples names for renaming the resulting files. - [#162](https://github.com/nf-core/phaseimpute/pull/162) - Fix `fai` usage when provided by `genomes` parameter. -- [#163](https://github.com/nf-core/phaseimpute/pull/163) - Fix MULTIQC samples names. Fix output panel `publisDir`. +- [#163](https://github.com/nf-core/phaseimpute/pull/163) - Fix MULTIQC samples names (add post-processing for clean up `FILTER_CHR_DWN`, `FILTER_CHR_INP`, `GAWK_ERROR_SPL`, `GAWK_RSQUARE_SPL`). Fix output panel `publisDir`. 
- [#164](https://github.com/nf-core/phaseimpute/pull/164) - Improve documentation writing ### `Dependencies` From 1f6c784fd88a8fc58b9e0107bf8f22095fd29d95 Mon Sep 17 00:00:00 2001 From: Louis Le Nezet Date: Mon, 2 Dec 2024 16:58:35 +0100 Subject: [PATCH 10/17] Fix configuration --- conf/steps/panel_prep.config | 12 ++++++------ conf/test_full.config | 12 ++++++++++++ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index 99efa33b..23675e9b 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -56,9 +56,9 @@ process { publishDir = [ path: { "${params.outdir}/prep_panel/panel" }, mode: params.publish_dir_mode, - saveAs: { + saveAs: { filename -> if ( !params.compute_freq && !params.phase ) { - filename -> filename.equals('versions.yml') ? null : filename + filename.equals('versions.yml') ? null : filename } else { null } @@ -71,9 +71,9 @@ process { publishDir = [ path: { "${params.outdir}/prep_panel/panel" }, mode: params.publish_dir_mode, - saveAs: { + saveAs: { filename -> if ( !params.phase ) { - filename -> filename.equals('versions.yml') ? null : filename + filename.equals('versions.yml') ? null : filename } else { null } @@ -86,9 +86,9 @@ process { publishDir = [ path: { "${params.outdir}/prep_panel/panel" }, mode: params.publish_dir_mode, - saveAs: { + saveAs: { filename -> if ( !params.phase ) { - filename -> filename.equals('versions.yml') ? null : filename + filename.equals('versions.yml') ? 
null : filename } else { null } diff --git a/conf/test_full.config b/conf/test_full.config index 61cfe3df..94e04dd9 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -42,4 +42,16 @@ params { // Impute tools tools = "glimpse2" + + // Concordance arguments + min_val_gl = null + min_val_dp = null +} + +process { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CONCORDANCE_GLIMPSE2:GLIMPSE2_CONCORDANCE' { + ext.args = "--gt-val --af-tag AF" + ext.prefix = { "${meta.id}${meta.panel ? '_P' + meta.panel : ''}${meta.tools ? '_T' + meta.tools : ''}.concordance" } + publishDir = [ enabled: false ] + } } From 98fbeaa736c15ddff40d5af58d9c748883c147fd Mon Sep 17 00:00:00 2001 From: Louis Le Nezet Date: Mon, 2 Dec 2024 17:00:30 +0100 Subject: [PATCH 11/17] Update sample samplesheet --- tests/csv/sample_sim_full.csv | 2 +- tests/csv/sample_sim_full_truth.csv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/csv/sample_sim_full.csv b/tests/csv/sample_sim_full.csv index cc9f4828..5100bfa1 100644 --- a/tests/csv/sample_sim_full.csv +++ b/tests/csv/sample_sim_full.csv @@ -1,2 +1,2 @@ sample,file,index -NA12878,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239334/NA12878.final.cram,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239334/NA12878.final.cram.crai +HG001,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239334/NA12878.final.cram,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239334/NA12878.final.cram.crai diff --git a/tests/csv/sample_sim_full_truth.csv b/tests/csv/sample_sim_full_truth.csv index 2b089769..e65884d9 100644 --- a/tests/csv/sample_sim_full_truth.csv +++ b/tests/csv/sample_sim_full_truth.csv @@ -1,2 +1,2 @@ sample,file,index -NA12878,ftp://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz,ftp://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.csi 
+HG001,https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz,https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz.tbi From 275d31d994c77dc3e1fadeadd25103fc3036acb4 Mon Sep 17 00:00:00 2001 From: Louis Le Nezet Date: Mon, 2 Dec 2024 17:01:25 +0100 Subject: [PATCH 12/17] Fix input truth vcf usage --- main.nf | 7 ------- workflows/phaseimpute/main.nf | 7 +++++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/main.nf b/main.nf index 0c3f01ec..b31db4b1 100644 --- a/main.nf +++ b/main.nf @@ -80,13 +80,6 @@ workflow NFCORE_PHASEIMPUTE { ch_input_validate = ch_input } - if (params.steps.split(',').contains("all")) { - ch_input_truth.map{ - error "Cannot run all steps with --input-truth" - } - ch_input_truth = ch_input - } - // // WORKFLOW: Run pipeline // diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 1e914b56..47724cc4 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -118,7 +118,10 @@ workflow PHASEIMPUTE { } // Use input for simulation as truth for validation step - ch_input_truth = ch_input_sim + // if no truth is provided + if (!params.input_truth) { + ch_input_truth = ch_input_sim + } // Program to filter chromosomes filter_chr_program = ch_region @@ -491,7 +494,7 @@ workflow PHASEIMPUTE { [[],[]], [[],[]], [[],[]], - ch_fasta.map{ [it[0], it[1]] } + [[],[]] //ch_fasta.map{ [it[0], it[1]] } ) ch_versions = ch_versions.mix(BCFTOOLS_STATS_TRUTH.out.versions) ch_multiqc_files = ch_multiqc_files.mix(BCFTOOLS_STATS_TRUTH.out.stats.map{ [it[1]] }) From 69306d93e1c63f444651aee39f9f55926d73fb1e Mon Sep 17 00:00:00 2001 From: Louis Le Nezet Date: Mon, 2 Dec 2024 17:01:54 +0100 Subject: [PATCH 13/17] Fix glimpse2 concordance usage --- modules/nf-core/glimpse2/concordance/main.nf | 30 +++++++++++-------- 
.../local/vcf_concordance_glimpse2/main.nf | 3 +- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/modules/nf-core/glimpse2/concordance/main.nf b/modules/nf-core/glimpse2/concordance/main.nf index 06eb139b..f96f5c38 100644 --- a/modules/nf-core/glimpse2/concordance/main.nf +++ b/modules/nf-core/glimpse2/concordance/main.nf @@ -9,9 +9,7 @@ process GLIMPSE2_CONCORDANCE { input: tuple val(meta), path(estimate), path(estimate_index), path(truth), path(truth_index), path(freq), path(freq_index), path(samples), val(region) - tuple val(meta2), path(groups), val(bins), val(ac_bins), val(allele_counts) - val(min_val_gl) - val(min_val_dp) + tuple val(meta2), path(groups), val(bins), val(ac_bins), val(allele_counts), val(min_val_gl), val(min_val_dp) output: tuple val(meta), path("*.error.cal.txt.gz") , emit: errors_cal @@ -26,17 +24,23 @@ process GLIMPSE2_CONCORDANCE { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def samples_cmd = samples ? "--samples ${samples}" : "" - def groups_cmd = groups ? "--groups ${groups}" : "" - def bins_cmd = bins ? "--bins ${bins}" : "" - def ac_bins_cmd = ac_bins ? "--ac-bins ${ac_bins}" : "" - def ale_ct_cmd = allele_counts ? "--allele-counts ${allele_counts}" : "" - def region_str = region instanceof List ? region.join('\\n') : region + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def samples_cmd = samples ? "--samples ${samples}" : "" + def groups_cmd = groups ? "--groups ${groups}" : "" + def bins_cmd = bins ? "--bins ${bins}" : "" + def ac_bins_cmd = ac_bins ? "--ac-bins ${ac_bins}" : "" + def ale_ct_cmd = allele_counts ? "--allele-counts ${allele_counts}" : "" + def min_val_gl_cmd = min_val_gl ? "--min-val-gl ${min_val_gl}" : "" + def min_val_dp_cmd = min_val_dp ? "--min-val-dp ${min_val_dp}" : "" + def region_str = region instanceof List ? region.join('\\n') : region if (((groups ? 1:0) + (bins ? 
1:0) + (ac_bins ? 1:0) + (allele_counts ? 1:0)) != 1) error "One and only one argument should be selected between groups, bins, ac_bins, allele_counts" + if (args.contains("--gt-val")) { + assert !(min_val_gl || min_val_dp) : "If --gt-val is set, --min-val-gl nor --min-val-dp must be set" + } + """ printf '$region_str' > regions.txt sed 's/\$/ $freq $truth $estimate/' regions.txt > input.txt @@ -47,8 +51,8 @@ process GLIMPSE2_CONCORDANCE { $bins_cmd \\ $ac_bins_cmd \\ $ale_ct_cmd \\ - --min-val-gl $min_val_gl \\ - --min-val-dp $min_val_dp \\ + $min_val_gl_cmd \\ + $min_val_dp_cmd \\ --input input.txt \\ --thread $task.cpus \\ --output ${prefix} diff --git a/subworkflows/local/vcf_concordance_glimpse2/main.nf b/subworkflows/local/vcf_concordance_glimpse2/main.nf index ec4b04a0..8545d6a0 100644 --- a/subworkflows/local/vcf_concordance_glimpse2/main.nf +++ b/subworkflows/local/vcf_concordance_glimpse2/main.nf @@ -32,8 +32,7 @@ workflow VCF_CONCORDANCE_GLIMPSE2 { GLIMPSE2_CONCORDANCE ( ch_concordance, - [[], [], params.bins, [], []], - params.min_val_gl, params.min_val_dp + [[], [], params.bins, [], [], params.min_val_gl, params.min_val_dp] ) ch_versions = ch_versions.mix(GLIMPSE2_CONCORDANCE.out.versions.first()) From 0d90bb25d8ad1c5666449c5daa747af3c29a6e3f Mon Sep 17 00:00:00 2001 From: LouisLeNezet Date: Mon, 2 Dec 2024 19:09:32 +0100 Subject: [PATCH 14/17] Reverse modification --- workflows/phaseimpute/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 47724cc4..111fc315 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -494,7 +494,7 @@ workflow PHASEIMPUTE { [[],[]], [[],[]], [[],[]], - [[],[]] //ch_fasta.map{ [it[0], it[1]] } + ch_fasta.map{ [it[0], it[1]] } ) ch_versions = ch_versions.mix(BCFTOOLS_STATS_TRUTH.out.versions) ch_multiqc_files = ch_multiqc_files.mix(BCFTOOLS_STATS_TRUTH.out.stats.map{ [it[1]] }) From 
9b630c3163b6352fc046560f326460fa6c243b9a Mon Sep 17 00:00:00 2001 From: LouisLeNezet Date: Wed, 4 Dec 2024 11:26:02 +0100 Subject: [PATCH 15/17] Update glimpse2 concordance --- modules.json | 2 +- modules/nf-core/glimpse2/concordance/meta.yml | 4 +- .../glimpse2/concordance/tests/main.nf.test | 155 ++++-------------- .../concordance/tests/main.nf.test.snap | 45 +---- .../concordance/tests/nextflow.config | 4 +- .../concordance/tests/nextflow_R2.config | 7 - 6 files changed, 49 insertions(+), 168 deletions(-) delete mode 100644 modules/nf-core/glimpse2/concordance/tests/nextflow_R2.config diff --git a/modules.json b/modules.json index f3cd1cb6..25071320 100644 --- a/modules.json +++ b/modules.json @@ -98,7 +98,7 @@ }, "glimpse2/concordance": { "branch": "master", - "git_sha": "cc64e71652f67ce627064af51008fe0a00850987", + "git_sha": "6aed50284f6b208fd8eff1ec1dae4b25bf03c432", "installed_by": ["modules"] }, "glimpse2/ligate": { diff --git a/modules/nf-core/glimpse2/concordance/meta.yml b/modules/nf-core/glimpse2/concordance/meta.yml index f286f6e9..a9e27c90 100644 --- a/modules/nf-core/glimpse2/concordance/meta.yml +++ b/modules/nf-core/glimpse2/concordance/meta.yml @@ -79,12 +79,12 @@ input: description: | Default allele count bins used for rsquared computations. AN field must be defined in the frequency file. - - - min_val_gl: + - min_val_gl: type: float description: | Minimum genotype likelihood probability P(G|R) in validation data. Set to zero to have no filter of if using –gt-validation - - - min_val_dp: + - min_val_dp: type: integer description: | Minimum coverage in validation data. 
diff --git a/modules/nf-core/glimpse2/concordance/tests/main.nf.test b/modules/nf-core/glimpse2/concordance/tests/main.nf.test index 9c562520..d1c68bc2 100644 --- a/modules/nf-core/glimpse2/concordance/tests/main.nf.test +++ b/modules/nf-core/glimpse2/concordance/tests/main.nf.test @@ -10,78 +10,36 @@ nextflow_process { tag "modules_nfcore" tag "modules" - setup { - run("GLIMPSE2_PHASE") { - script "../../phase/main.nf" - process { - """ - input_vcf = Channel.of([ - [ id:'input'], // meta map - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/NA12878.chr21.s.1x.vcf.gz", checkIfExists: true), - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/NA12878.chr21.s.1x.vcf.gz.csi", checkIfExists: true), - [] - ]) - - samples_infos = Channel.of('NA12878 2').collectFile(name: 'sampleinfos.txt') - region = Channel.of(["chr21:16600000-16800000","chr21:16650000-16750000"]) - - ch_ref_panel = Channel.of([ - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/1000GP.chr21.noNA12878.s.bcf", checkIfExists: true), - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/1000GP.chr21.noNA12878.s.bcf.csi", checkIfExists: true) - ]) - - ch_map = Channel.of([ - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/chr21.b38.gmap.gz", checkIfExists: true), - ]) - - // [meta, vcf, index, sample_infos, regionin, regionout, regionindex, ref, ref_index, map] - input[0] = input_vcf - .combine(samples_infos) - .combine(region) - .combine( ch_ref_panel ) - .combine( ch_map ) - input[1]= Channel.of([[],[],[]]) - """ - } - } - - run("BCFTOOLS_INDEX") { - script "../../../bcftools/index/main.nf" - process { - """ - input[0] = GLIMPSE2_PHASE.output.phased_variants - """ - } - } - } - - - test("test_glimpse2_concordance") { + test("test glimpse2 concordance") { config "./nextflow.config" when { + params { + glimpse2_concordance_args = "--gt-val 
--af-tag AF" + } process { """ - allele_freq = Channel.of([ - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/1000GP.chr21.noNA12878.s.sites.vcf.gz",checkIfExists:true), - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/1000GP.chr21.noNA12878.s.sites.vcf.gz.csi",checkIfExists:true) + target = Channel.of([ + [id: "input"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878.chr22.1X.vcf.gz",checkIfExists:true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878.chr22.1X.vcf.gz.csi",checkIfExists:true) ]) - truth = Channel.of([ - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/NA12878.chr21.s.bcf",checkIfExists:true), - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/NA12878.chr21.s.bcf.csi",checkIfExists:true) + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878_GIAB.chr22.vcf.gz",checkIfExists:true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878_GIAB.chr22.vcf.gz.csi",checkIfExists:true) ]) - list_inputs = GLIMPSE2_PHASE.output.phased_variants - .join( BCFTOOLS_INDEX.out.csi ) + allele_freq = Channel.of([ + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.sites.vcf.gz",checkIfExists:true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.sites.vcf.gz.csi",checkIfExists:true) + ]) + list_inputs = target .combine( truth ) .combine( allele_freq ) .combine( Channel.of([[]]) ) - .combine( Channel.of(["chr21"]) ) + .combine( Channel.of(["chr22"]) ) input[0] = list_inputs - input[1] = Channel.of([[id:"params"],[],"0 0.01 0.05 0.1 0.2 0.5",[],[]]) - input[2] = 0.9999 - input[3] = 8 + input[1] = Channel.of([[id:"params"], [],"0 0.01 0.05 0.1 0.2 0.5", [], [], [], []]) """ } } @@ -102,78 +60,37 @@ nextflow_process { } - 
test("test_list_region") { + test("test list of region and rsquare per site") { config "./nextflow.config" when { + params { + glimpse2_concordance_args = "--gt-val --af-tag AF --out-r2-per-site" + } process { """ - allele_freq = Channel.of([ - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/1000GP.chr21.noNA12878.s.sites.vcf.gz",checkIfExists:true), - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/1000GP.chr21.noNA12878.s.sites.vcf.gz.csi",checkIfExists:true) + target = Channel.of([ + [id: "input"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878.chr22.1X.vcf.gz",checkIfExists:true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878.chr22.1X.vcf.gz.csi",checkIfExists:true) ]) - truth = Channel.of([ - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/NA12878.chr21.s.bcf",checkIfExists:true), - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/NA12878.chr21.s.bcf.csi",checkIfExists:true) + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878_GIAB.chr22.vcf.gz",checkIfExists:true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878_GIAB.chr22.vcf.gz.csi",checkIfExists:true) ]) - list_inputs = GLIMPSE2_PHASE.output.phased_variants - .join( BCFTOOLS_INDEX.out.csi ) - .combine( truth ) - .combine( allele_freq ) - .combine( Channel.of([[]]) ) - .combine( Channel.of(["chr21", "chr21"]) ) - - input[0] = list_inputs - input[1] = Channel.of([[id:"params"],[],"0 0.01 0.05 0.1 0.2 0.5",[],[]]) - input[2] = 0.9999 - input[3] = 8 - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - process.out.errors_cal.collect{ file(it[1]).name }, - process.out.errors_grp.collect{ file(it[1]).name }, - process.out.errors_spl.collect{ file(it[1]).name }, - 
process.out.rsquare_grp.collect{ file(it[1]).name }, - process.out.rsquare_spl.collect{ file(it[1]).name }, - process.out.versions - ).match() - } - ) - } - - } - - test("test_r2_per_site") { - config "./nextflow_R2.config" - - when { - process { - """ allele_freq = Channel.of([ - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/1000GP.chr21.noNA12878.s.sites.vcf.gz",checkIfExists:true), - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/1000GP.chr21.noNA12878.s.sites.vcf.gz.csi",checkIfExists:true) - ]) - - truth = Channel.of([ - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/NA12878.chr21.s.bcf",checkIfExists:true), - file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/glimpse/NA12878.chr21.s.bcf.csi",checkIfExists:true) + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.sites.vcf.gz",checkIfExists:true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.sites.vcf.gz.csi",checkIfExists:true) ]) - list_inputs = GLIMPSE2_PHASE.output.phased_variants - .join( BCFTOOLS_INDEX.out.csi ) + list_inputs = target .combine( truth ) .combine( allele_freq ) .combine( Channel.of([[]]) ) - .combine( Channel.of(["chr21"]) ) + .combine( Channel.of([["chr22", "chr22"]]) ) + .view() + input[0] = list_inputs - input[1] = Channel.of([[id:"params"],[],"0 0.01 0.05 0.1 0.2 0.5",[],[]]) - input[2] = 0.9999 - input[3] = 8 + input[1] = Channel.of([[id:"params"], [],"0 0.01 0.05 0.1 0.2 0.5", [], [], [], []]) """ } } @@ -189,11 +106,9 @@ nextflow_process { process.out.rsquare_spl.collect{ file(it[1]).name }, process.out.rsquare_per_site.collect{ file(it[1]).name }, process.out.versions - ).match() - } + ).match() } ) } } - } diff --git a/modules/nf-core/glimpse2/concordance/tests/main.nf.test.snap b/modules/nf-core/glimpse2/concordance/tests/main.nf.test.snap index 12bded93..5e81d44e 100644 --- 
a/modules/nf-core/glimpse2/concordance/tests/main.nf.test.snap +++ b/modules/nf-core/glimpse2/concordance/tests/main.nf.test.snap @@ -1,5 +1,5 @@ { - "test_r2_per_site": { + "test glimpse2 concordance": { "content": [ [ "input.error.cal.txt.gz" @@ -16,20 +16,17 @@ [ "input.rsquare.spl.txt.gz" ], - [ - "input_r2_sites.txt.gz" - ], [ "versions.yml:md5,ba729289bab6b9fbb8c36a620c86bb82" ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" + "nf-test": "0.9.1", + "nextflow": "24.10.1" }, - "timestamp": "2024-10-22T16:23:00.623182365" + "timestamp": "2024-12-03T16:22:35.440086384" }, - "test_glimpse2_concordance": { + "test list of region and rsquare per site": { "content": [ [ "input.error.cal.txt.gz" @@ -47,40 +44,16 @@ "input.rsquare.spl.txt.gz" ], [ - "versions.yml:md5,ba729289bab6b9fbb8c36a620c86bb82" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" - }, - "timestamp": "2024-10-22T16:22:26.823581573" - }, - "test_list_region": { - "content": [ - [ - "input.error.cal.txt.gz" - ], - [ - "input.error.grp.txt.gz" - ], - [ - "input.error.spl.txt.gz" - ], - [ - "input.rsquare.grp.txt.gz" - ], - [ - "input.rsquare.spl.txt.gz" + "input_r2_sites.txt.gz" ], [ "versions.yml:md5,ba729289bab6b9fbb8c36a620c86bb82" ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" + "nf-test": "0.9.1", + "nextflow": "24.10.1" }, - "timestamp": "2024-10-22T16:22:43.352729014" + "timestamp": "2024-12-03T16:26:35.009071185" } } \ No newline at end of file diff --git a/modules/nf-core/glimpse2/concordance/tests/nextflow.config b/modules/nf-core/glimpse2/concordance/tests/nextflow.config index e5721995..f9242b4b 100644 --- a/modules/nf-core/glimpse2/concordance/tests/nextflow.config +++ b/modules/nf-core/glimpse2/concordance/tests/nextflow.config @@ -1,6 +1,6 @@ process { - withName: GLIMPSE2_CHUNK { + withName: GLIMPSE2_CONCORDANCE { + ext.args = { params.glimpse2_concordance_args} ext.prefix = { "${meta.id}" } } - publishDir = { 
"${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } } \ No newline at end of file diff --git a/modules/nf-core/glimpse2/concordance/tests/nextflow_R2.config b/modules/nf-core/glimpse2/concordance/tests/nextflow_R2.config deleted file mode 100644 index 0ceee00d..00000000 --- a/modules/nf-core/glimpse2/concordance/tests/nextflow_R2.config +++ /dev/null @@ -1,7 +0,0 @@ -process { - publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } - - withName:GLIMPSE2_CONCORDANCE { - ext.args = "--out-r2-per-site" - } -} From d1e289c920558450d6b3606b9a518ab2579af2b7 Mon Sep 17 00:00:00 2001 From: LouisLeNezet Date: Wed, 4 Dec 2024 11:41:53 +0100 Subject: [PATCH 16/17] Update java version --- .github/workflows/ci.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d93a3b47..16b89e86 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,6 +59,11 @@ jobs: - name: Check out pipeline code uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + - uses: actions/setup-java@8df1039502a15bceb9433410b1a100fbe190c53b # v4 + with: + distribution: "temurin" + java-version: "17" + - name: Set up Nextflow uses: nf-core/setup-nextflow@v2 with: From 4c846d0ee7af0f0aea7a572e88e513540dbc439b Mon Sep 17 00:00:00 2001 From: Louis LE NEZET <58640615+LouisLeNezet@users.noreply.github.com> Date: Wed, 4 Dec 2024 11:52:09 +0100 Subject: [PATCH 17/17] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fb034907..65359551 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -81,8 +81,8 @@ Special thanks to [Matthias Hörtenhuber](https://github.com/mashehu), [Mazzalab - [#153](https://github.com/nf-core/phaseimpute/pull/153) - Fix getFileExtension function. Fix image in `usage.md`. Fix small warnings and errors with updated language server. 
`def` has been added when necessary, `:` use instead of `,` in assertions, `_` added to variables not used in closures, `for` loop replaced by `.each{}`, remove unused code / input. - [#161](https://github.com/nf-core/phaseimpute/pull/161) - Fix `VCF_SPLIT_BCFTOOLS` when only one sample present by updating `BCFTOOLS_PLUGINSPLIT` and adding `BCFTOOLS_QUERY` to get truth samples names for renaming the resulting files. - [#162](https://github.com/nf-core/phaseimpute/pull/162) - Fix `fai` usage when provided by `genomes` parameter. -- [#163](https://github.com/nf-core/phaseimpute/pull/163) - Fix MULTIQC samples names (add post-processing for clean up `FILTER_CHR_DWN`, `FILTER_CHR_INP`, `GAWK_ERROR_SPL`, `GAWK_RSQUARE_SPL`). Fix output panel `publisDir`. - [#164](https://github.com/nf-core/phaseimpute/pull/164) - Improve documentation writing +- [#163](https://github.com/nf-core/phaseimpute/pull/163) - Fix MULTIQC samples names (add post-processing for clean up `FILTER_CHR_DWN`, `FILTER_CHR_INP`, `GAWK_ERROR_SPL`, `GAWK_RSQUARE_SPL`). Fix output panel `publishDir`. Fix java version to `17` in `ci.yml` due to new nextflow version. ### `Dependencies`