diff --git a/CHANGELOG.md b/CHANGELOG.md index cb015936..4d437c83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ Initial release of nf-core/phaseimpute, created with the [nf-core](https://nf-co - correct meta map propagation - Test impute and test sim works - [#19](https://github.com/nf-core/phaseimpute/pull/19) - Changed reference panel to accept a csv, update modules and subworkflows (glimpse1/2 and shapeit5) +- [#20](https://github.com/nf-core/phaseimpute/pull/20) - Added automatic detection of vcf contigs for the reference panel and automatic renaming available ### `Fixed` diff --git a/conf/modules.config b/conf/modules.config index a04bf589..6664154e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -56,12 +56,12 @@ process { ext.prefix = { "${meta.id}_${meta.region}" } } - withName: BCFTOOLS_ANNOTATE { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHR_CHECK:VCF_CHR_RENAME:BCFTOOLS_ANNOTATE' { ext.args = [ "-Oz", "--no-version" ].join(' ') - ext.prefix = { "${meta.id}_chrDel_${meta.region}" } + ext.prefix = { "${meta.id}_chrrename" } } withName: VIEW_VCF_SNPS { diff --git a/docs/development.md b/docs/development.md index 8126332d..5258ca73 100644 --- a/docs/development.md +++ b/docs/development.md @@ -2,7 +2,8 @@ ## Features and tasks -- [] Add automatic detection of chromosome name to create a renaming file for the vcf +- [x] Add automatic detection of chromosome name to create a renaming file for the vcf files +- [] Add automatic detection of chromosome name to create a renaming file for the bam files - [] Make the different tests workflows work - [] Simulation - [] Validation diff --git a/modules.json b/modules.json index 0a99d93d..bfc15ccf 100644 --- a/modules.json +++ b/modules.json @@ -48,6 +48,11 @@ "git_sha": "de45447d060b8c8b98575bc637a4a575fd0638e1", "installed_by": ["modules"] }, + "gawk": { + "branch": "master", + "git_sha": "dc3527855e7358c6d8400828754c0caa5f11698f", + "installed_by": ["modules"] + }, "glimpse/chunk": 
{ "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", diff --git a/modules/local/faitochr/main.nf b/modules/local/faitochr/main.nf deleted file mode 100644 index 9ae36d98..00000000 --- a/modules/local/faitochr/main.nf +++ /dev/null @@ -1,49 +0,0 @@ -process FAITOCHR { - tag "$meta.id" - label 'process_single' - - input: - tuple val(meta), path(fai), val(addchr) - - output: - tuple val(meta), path("*.txt"), emit: annot_chr - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - """ - # Take the fai file and add the chr prefix to the chromosome names - if [ "${addchr}" = true ]; then - col1="" - col2="chr" - else - col1="chr" - col2="" - fi - awk -F'\t' '{print \$1}' ${fai} | \ - sed 's/chr//g' | \ - awk -v col1=\${col1} -v col2=\${col2} 'BEGIN {OFS=" "} {print col1\$1, col2\$1}' > ${prefix}.txt - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - awk: \$(awk --version | grep -o 'GNU Awk [0-9.]*' | cut -d ' ' -f 3) - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.txt - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - awk: \$(awk --version | grep -o 'GNU Awk [0-9.]*' | cut -d ' ' -f 3) - END_VERSIONS - """ -} diff --git a/modules/local/faitochr/tests/main.nf.test b/modules/local/faitochr/tests/main.nf.test deleted file mode 100644 index 1b066c5c..00000000 --- a/modules/local/faitochr/tests/main.nf.test +++ /dev/null @@ -1,57 +0,0 @@ -nextflow_process { - - name "Test Process FAITOCHR" - script "../main.nf" - process "FAITOCHR" - - tag "modules" - tag "modules_local" - tag "faitochr" - - test("fai add chr") { - - when { - process { - """ - input[0] = [ - [ id:'test' ], // meta map - file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta.fai", 
checkIfExists: true), - true - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("fai remove chr") { - - when { - process { - """ - input[0] = [ - [ id:'test' ], // meta map - file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta.fai", checkIfExists: true), - false - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - -} diff --git a/modules/local/faitochr/tests/main.nf.test.snap b/modules/local/faitochr/tests/main.nf.test.snap deleted file mode 100644 index 3a5c5379..00000000 --- a/modules/local/faitochr/tests/main.nf.test.snap +++ /dev/null @@ -1,68 +0,0 @@ -{ - "fai add chr": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.txt:md5,dc360653d0d1848e6cc01661dbff389c" - ] - ], - "1": [ - "versions.yml:md5,0d85e18b9c36aa2db49ad51930d9a5e6" - ], - "annot_chr": [ - [ - { - "id": "test" - }, - "test.txt:md5,dc360653d0d1848e6cc01661dbff389c" - ] - ], - "versions": [ - "versions.yml:md5,0d85e18b9c36aa2db49ad51930d9a5e6" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-03-08T17:33:55.801913" - }, - "fai remove chr": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.txt:md5,c8aa996df2a03384617fff85d911f401" - ] - ], - "1": [ - "versions.yml:md5,0d85e18b9c36aa2db49ad51930d9a5e6" - ], - "annot_chr": [ - [ - { - "id": "test" - }, - "test.txt:md5,c8aa996df2a03384617fff85d911f401" - ] - ], - "versions": [ - "versions.yml:md5,0d85e18b9c36aa2db49ad51930d9a5e6" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-03-08T17:34:01.901705" - } -} diff --git a/modules/local/faitochr/tests/tags.yml b/modules/local/faitochr/tests/tags.yml deleted file mode 100644 index 5de9b9a1..00000000 --- a/modules/local/faitochr/tests/tags.yml +++ 
/dev/null @@ -1,2 +0,0 @@ -faitochr: - - "modules/local/faitochr/**" diff --git a/modules/local/vcfchrextract/environment.yml b/modules/local/vcfchrextract/environment.yml new file mode 100644 index 00000000..3280dfaf --- /dev/null +++ b/modules/local/vcfchrextract/environment.yml @@ -0,0 +1,7 @@ +name: vcfchrextract +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.18 diff --git a/modules/local/vcfchrextract/main.nf b/modules/local/vcfchrextract/main.nf new file mode 100644 index 00000000..b458bb0e --- /dev/null +++ b/modules/local/vcfchrextract/main.nf @@ -0,0 +1,49 @@ +process VCFCHREXTRACT { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.18--h8b25389_0': + 'biocontainers/bcftools:1.18--h8b25389_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.txt"), emit: chr + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + bcftools \\ + head \\ + $input \\ + \| grep -o -E '^##contig=<ID=([^,>]*)' | cut -d'=' -f3 \\ + > ${prefix}.txt + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$( bcftools --version |& sed '1!d; s/^.*bcftools //' ) + grep: \$( grep --help |& grep -o -E '[0-9]+\\.[0-9]+\\.[0-9]+' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$( bcftools --version |& sed '1!d; s/^.*bcftools //' ) + grep: \$( grep --help |& grep -o -E '[0-9]+\\.[0-9]+\\.[0-9]+' ) + END_VERSIONS + """ +} diff --git a/modules/local/vcfchrextract/meta.yml b/modules/local/vcfchrextract/meta.yml new file mode 100644 index 
00000000..19d523d4 --- /dev/null +++ b/modules/local/vcfchrextract/meta.yml @@ -0,0 +1,41 @@ +name: vcfchrextract +description: Extract all contigs name into txt file +keywords: + - bcftools + - vcf + - head + - contig +tools: + - head: + description: Extract header from variant calling file. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: https://samtools.github.io/bcftools/bcftools.html#head + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: Query VCF or BCF file, can be either uncompressed or compressed +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - chr: + type: file + description: List of contigs in the VCF file + pattern: "*{txt}" +authors: + - "@louislenezet" +maintainers: + - "@louislenezet" diff --git a/modules/local/vcfchrextract/tests/main.nf.test b/modules/local/vcfchrextract/tests/main.nf.test new file mode 100644 index 00000000..a004135b --- /dev/null +++ b/modules/local/vcfchrextract/tests/main.nf.test @@ -0,0 +1,32 @@ +nextflow_process { + + name "Test Process VCFCHREXTRACT" + script "../main.nf" + process "VCFCHREXTRACT" + + tag "modules" + tag "modules_local" + tag "vcfchrextract" + + test("Extract chr from vcf") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_vcf_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +} diff --git a/modules/local/vcfchrextract/tests/main.nf.test.snap b/modules/local/vcfchrextract/tests/main.nf.test.snap new file mode 100644 index 00000000..3431bbe9 
--- /dev/null +++ b/modules/local/vcfchrextract/tests/main.nf.test.snap @@ -0,0 +1,35 @@ +{ + "Extract chr from vcf": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.txt:md5,3a9ea6d336e113a74d7fdca5e7b623fc" + ] + ], + "1": [ + "versions.yml:md5,7e6d75a47df5ce3a975172dcd47fd247" + ], + "chr": [ + [ + { + "id": "test" + }, + "test.txt:md5,3a9ea6d336e113a74d7fdca5e7b623fc" + ] + ], + "versions": [ + "versions.yml:md5,7e6d75a47df5ce3a975172dcd47fd247" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-22T15:09:21.585363234" + } +} \ No newline at end of file diff --git a/modules/local/vcfchrextract/tests/tags.yml b/modules/local/vcfchrextract/tests/tags.yml new file mode 100644 index 00000000..429a601f --- /dev/null +++ b/modules/local/vcfchrextract/tests/tags.yml @@ -0,0 +1,2 @@ +vcfchrextract: + - "modules/local/vcfchrextract/**" diff --git a/modules/nf-core/gawk/environment.yml b/modules/nf-core/gawk/environment.yml new file mode 100644 index 00000000..34513c7f --- /dev/null +++ b/modules/nf-core/gawk/environment.yml @@ -0,0 +1,7 @@ +name: gawk +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - anaconda::gawk=5.1.0 diff --git a/modules/nf-core/gawk/main.nf b/modules/nf-core/gawk/main.nf new file mode 100644 index 00000000..f856a1f8 --- /dev/null +++ b/modules/nf-core/gawk/main.nf @@ -0,0 +1,54 @@ +process GAWK { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'biocontainers/gawk:5.1.0' }" + + input: + tuple val(meta), path(input) + path(program_file) + + output: + tuple val(meta), path("${prefix}.${suffix}"), emit: output + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' // args is used for the main arguments of the tool + def args2 = task.ext.args2 ?: '' // args2 is used to specify a program when no program file has been given + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.getExtension()}" + + program = program_file ? "-f ${program_file}" : "${args2}" + + """ + awk \\ + ${args} \\ + ${program} \\ + ${input} \\ + > ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.getExtension()}" + + """ + touch ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gawk/meta.yml b/modules/nf-core/gawk/meta.yml new file mode 100644 index 00000000..2b6033b0 --- /dev/null +++ b/modules/nf-core/gawk/meta.yml @@ -0,0 +1,50 @@ +name: "gawk" +description: | + If you are like many computer users, you would frequently like to make changes in various text files + wherever certain patterns appear, or extract data from parts of certain lines while discarding the rest. + The job is easy with awk, especially the GNU implementation gawk. 
+keywords: + - gawk + - awk + - txt + - text + - file parsing +tools: + - "gawk": + description: "GNU awk" + homepage: "https://www.gnu.org/software/gawk/" + documentation: "https://www.gnu.org/software/gawk/manual/" + tool_dev_url: "https://www.gnu.org/prep/ftp.html" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: The input file - Specify the logic that needs to be executed on this file on the `ext.args2` or in the program file + pattern: "*" + - program_file: + type: file + description: Optional file containing logic for awk to execute. If you don't wish to use a file, you can use `ext.args2` to specify the logic. + pattern: "*" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - output: + type: file + description: The output file - specify the name of this file using `ext.prefix` and the extension using `ext.suffix` + pattern: "*" +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/nextflow.config b/nextflow.config index 882f501c..ba21f779 100644 --- a/nextflow.config +++ b/nextflow.config @@ -22,7 +22,7 @@ params { panel = null panel_index = null phased = null - panel_chr_rename = null + rename_chr = false // References genome = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 2e36b1c4..b1552439 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -52,13 +52,6 @@ "description": "Is the reference panel phased", "type": "boolean", "pattern": "true|false" - }, - "panel_chr_rename": { - "type": "string", - "description": "Rename the chromosome of the panel", - "pattern": "^\\S+\\.(csv|tsv|txt)$", - "format": "file-path", - "mimetype": "text/csv" } } }, @@ -93,6 +86,11 @@ "description": "The output 
directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" }, + "rename_chr": { + "type": "boolean", + "description": "Should the panel vcf files be renamed to match the reference genome (e.g. 'chr1' -> '1')", + "pattern": "true|false" + }, "email": { "type": "string", "description": "Email address for completion summary.", diff --git a/subworkflows/local/vcf_chr_check/main.nf b/subworkflows/local/vcf_chr_check/main.nf new file mode 100644 index 00000000..db5960b8 --- /dev/null +++ b/subworkflows/local/vcf_chr_check/main.nf @@ -0,0 +1,76 @@ +include { VCFCHREXTRACT as VCFCHRBFR } from '../../../modules/local/vcfchrextract/main.nf' +include { VCFCHREXTRACT as VCFCHRAFT } from '../../../modules/local/vcfchrextract/main.nf' +include { VCF_CHR_RENAME } from '../vcf_chr_rename/main.nf' + +workflow VCF_CHR_CHECK { + take: + ch_vcf // channel: [ [id], vcf, index ] + ch_fasta // channel: [ [id], fasta, fai ] + + main: + + ch_versions = Channel.empty() + + // Get contig names from the VCF + VCFCHRBFR(ch_vcf.map{ metaV, vcf, csi -> [metaV, vcf] }) + ch_versions = ch_versions.mix(VCFCHRBFR.out.versions.first()) + + // Check if the contig names are the same as the reference + chr_disjoint = check_chr(VCFCHRBFR.out.chr, ch_vcf, ch_fasta) + + if (params.rename_chr == true) { + // Generate the chromosome renaming file + VCF_CHR_RENAME( + chr_disjoint.to_rename.map{meta, vcf, index, nb -> [meta, vcf, index]}, + ch_fasta + ) + ch_versions = ch_versions.mix(VCF_CHR_RENAME.out.versions.first()) + + // Check if modification has solved the problem + VCFCHRAFT(VCF_CHR_RENAME.out.vcf_renamed.map{ metaV, vcf, csi -> [metaV, vcf] }) + ch_versions = ch_versions.mix(VCFCHRAFT.out.versions.first()) + + chr_disjoint_after = check_chr(VCFCHRAFT.out.chr, VCF_CHR_RENAME.out.vcf_renamed, ch_fasta) + + chr_disjoint_after.to_rename.map{ + error 'Even after renaming errors are still present. 
Please check that contigs name in vcf and fasta file are equivalent.' + } + ch_vcf_renamed = VCF_CHR_RENAME.out.vcf_renamed + + } else { + chr_disjoint.to_rename.map { + error 'Some contig names in the VCF do not match the reference genome. Please set `rename_chr` to `true` to rename the contigs.' + } + ch_vcf_renamed = Channel.empty() + } + + ch_vcf_out = chr_disjoint.no_rename + .map{meta, vcf, csi, chr -> [meta, vcf, csi]} + .mix(ch_vcf_renamed) + + emit: + vcf = ch_vcf_out // [ meta, vcf, csi ] + versions = ch_versions // channel: [ versions.yml ] +} + + +def check_chr(ch_chr, ch_vcf, ch_fasta){ + chr_checked = ch_chr + .combine(ch_vcf, by:0) + .combine(ch_fasta) + .map{metaI, chr, vcf, csi, metaG, fasta, fai -> + [ + metaI, vcf, csi, + chr.readLines()*.split(' ').collect{it[0]}, + fai.readLines()*.split('\t').collect{it[0]} + ] + } + .map { meta, vcf, csi, chr, fai -> + [meta, vcf, csi, (chr-fai).size()] + } + .branch{ + no_rename: it[3] == 0 + to_rename: it[3] > 0 + } + return chr_checked +} diff --git a/subworkflows/local/vcf_chr_check/tests/main.nf.test b/subworkflows/local/vcf_chr_check/tests/main.nf.test new file mode 100644 index 00000000..da76fdc9 --- /dev/null +++ b/subworkflows/local/vcf_chr_check/tests/main.nf.test @@ -0,0 +1,174 @@ +nextflow_workflow { + + name "Test Subworkflow VCF_CHR_CHECK" + script "../main.nf" + + workflow "VCF_CHR_CHECK" + + tag "subworkflows" + tag "subworkflows_local" + tag "subworkflows/vcf_chr_check" + tag "vcf_chr_check" + + tag "bcftools" + tag "bcftools/annotate" + tag "bcftools/index" + tag "gawk" + + test("Rename: panel chr + fasta chr") { + config "./nextflow_rename.config" + when { + workflow { + """ + fai_file = Channel.of('chr22\t10000\t7\t60\t61', 'chr21\t10000\t7\t60\t61').collectFile(name: 'chr21_22.fai', newLine: true) + input[0] = Channel.fromList([ + [ + [id: "chr22"], + 
file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz",checkIfExists:true), + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi",checkIfExists:true) + ], + [ + [id: "chr21"], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz",checkIfExists:true), + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz.tbi",checkIfExists:true) + ] + ]) + input[1] = Channel.of([[id:"GRCh37"],[]]) + .combine(fai_file) + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + } + + test("Rename: panel chr + fasta no chr") { + config "./nextflow_rename.config" + when { + workflow { + """ + fai_file = Channel.of('22\t10000\t7\t60\t61', '21\t10000\t7\t60\t61').collectFile(name: '21_22.fai', newLine: true) + input[0] = Channel.fromList([ + [ + [id: "chr22"], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz",checkIfExists:true), + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi",checkIfExists:true) + ], + [ + [id: "chr21"], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz",checkIfExists:true), + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz.tbi",checkIfExists:true) + ] + ]) + input[1] = Channel.of([[id:"GRCh37"],[]]) + .combine(fai_file) + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + } + + 
test("Rename: panel no chr + fasta chr") { + config "./nextflow_rename.config" + when { + workflow { + """ + fai_file = Channel.of( + 'chr1\t10000\t7\t60\t61','chr2\t10000\t7\t60\t61','chr3\t10000\t7\t60\t61','chr4\t10000\t7\t60\t61','chr5\t10000\t7\t60\t61','chr6\t10000\t7\t60\t61', + 'chr7\t10000\t7\t60\t61','chr8\t10000\t7\t60\t61','chr9\t10000\t7\t60\t61','chr10\t10000\t7\t60\t61','chr11\t10000\t7\t60\t61','chr12\t10000\t7\t60\t61', + 'chr13\t10000\t7\t60\t61','chr14\t10000\t7\t60\t61','chr15\t10000\t7\t60\t61','chr16\t10000\t7\t60\t61','chr17\t10000\t7\t60\t61','chr18\t10000\t7\t60\t61', + 'chr19\t10000\t7\t60\t61','chr20\t10000\t7\t60\t61','chr21\t10000\t7\t60\t61','chr22\t10000\t7\t60\t61', + 'chrX\t10000\t7\t60\t61','chrY\t10000\t7\t60\t61', 'chrMT\t10000\t7\t60\t61' + ).collectFile(name: 'chr.fai', newLine: true) + input[0] = Channel.fromList([ + [ + [id: "22"], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/test_models.vcf.gz",checkIfExists:true), + [] + ] + ]) + input[1] = Channel.of([[id:"GRCh37"],[]]) + .combine(fai_file) + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + } + + test("Error : missing renaming params") { + config "./nextflow.config" + when { + workflow { + """ + input[0] = Channel.fromList([ + [ + [id: "multi"], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/test.rnaseq.vcf.gz",checkIfExists:true), + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/test.rnaseq.vcf.gz.tbi",checkIfExists:true) + ], + [ + [id: "chr21"], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz",checkIfExists:true), + 
file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz.tbi",checkIfExists:true) + ] + ]) + input[1] = Channel.of([[id:"GRCh37"],[], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta.fai",checkIfExists:true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.failed }, + { assert workflow.errorReport.contains("Some contig names in the VCF do not match the reference genome. Please set `rename_chr` to `true` to rename the contigs.")} + ) + } + } + test("Error : still difference after renaming"){ + config "./nextflow_rename.config" + when { + workflow { + """ + input[0] = Channel.fromList([ + [ + [id: "multi"], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz",checkIfExists:true), + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz.tbi",checkIfExists:true) + ] + ]) // Error due to multiple contigs name in header not present in fasta file + input[1] = Channel.of([ + [id:"GRCh37"], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta",checkIfExists:true), + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta.fai",checkIfExists:true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.failed }, + { assert workflow.errorReport.contains("Even after renaming errors are still present. 
Please check that contigs name in vcf and fasta file are equivalent.")} + ) + } + } +} diff --git a/subworkflows/local/vcf_chr_check/tests/main.nf.test.snap b/subworkflows/local/vcf_chr_check/tests/main.nf.test.snap new file mode 100644 index 00000000..10f7f443 --- /dev/null +++ b/subworkflows/local/vcf_chr_check/tests/main.nf.test.snap @@ -0,0 +1,145 @@ +{ + "Rename: panel chr + fasta chr": { + "content": [ + { + "0": [ + [ + { + "id": "chr21" + }, + "/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz", + "/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz.tbi" + ], + [ + { + "id": "chr22" + }, + "/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz", + "/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi" + ] + ], + "1": [ + "versions.yml:md5,395e1cde3f38a30f5d80769972ba23d8", + "versions.yml:md5,ad4c5338cd27e20789c70e28b8c74a42" + ], + "vcf": [ + [ + { + "id": "chr21" + }, + "/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz", + "/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz.tbi" + ], + [ + { + "id": "chr22" + }, + "/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz", + "/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi" + ] + ], + "versions": [ + "versions.yml:md5,395e1cde3f38a30f5d80769972ba23d8", + "versions.yml:md5,ad4c5338cd27e20789c70e28b8c74a42" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-27T17:21:13.588561053" + }, + "Rename: panel no chr + fasta chr": { + "content": [ + { + "0": [ + [ + { + "id": "22" + }, + "22_chrrename.vcf.gz:md5,070a96d1053a64f2de2132ee8800847c", + "22_chrrename.vcf.gz.csi:md5,e190b690b4b0a4d088231862e5408582" + ] + ], + "1": [ 
+ "versions.yml:md5,395e1cde3f38a30f5d80769972ba23d8", + "versions.yml:md5,ad4c5338cd27e20789c70e28b8c74a42", + "versions.yml:md5,e576f40503c3506c782228485d06fbf1" + ], + "vcf": [ + [ + { + "id": "22" + }, + "22_chrrename.vcf.gz:md5,070a96d1053a64f2de2132ee8800847c", + "22_chrrename.vcf.gz.csi:md5,e190b690b4b0a4d088231862e5408582" + ] + ], + "versions": [ + "versions.yml:md5,395e1cde3f38a30f5d80769972ba23d8", + "versions.yml:md5,ad4c5338cd27e20789c70e28b8c74a42", + "versions.yml:md5,e576f40503c3506c782228485d06fbf1" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-27T17:21:39.92481538" + }, + "Rename: panel chr + fasta no chr": { + "content": [ + { + "0": [ + [ + { + "id": "chr21" + }, + "chr21_chrrename.vcf.gz:md5,22785a5d7ec1132f766efae5f8e00adf", + "chr21_chrrename.vcf.gz.csi:md5,b5b5fd753ee54ebd3c8e4b1fe2261cdb" + ], + [ + { + "id": "chr22" + }, + "chr22_chrrename.vcf.gz:md5,23de9b4db1406415806e627969cec749", + "chr22_chrrename.vcf.gz.csi:md5,ba370ca13289fee4be59253a1f4609e2" + ] + ], + "1": [ + "versions.yml:md5,395e1cde3f38a30f5d80769972ba23d8", + "versions.yml:md5,ad4c5338cd27e20789c70e28b8c74a42", + "versions.yml:md5,e576f40503c3506c782228485d06fbf1" + ], + "vcf": [ + [ + { + "id": "chr21" + }, + "chr21_chrrename.vcf.gz:md5,22785a5d7ec1132f766efae5f8e00adf", + "chr21_chrrename.vcf.gz.csi:md5,b5b5fd753ee54ebd3c8e4b1fe2261cdb" + ], + [ + { + "id": "chr22" + }, + "chr22_chrrename.vcf.gz:md5,23de9b4db1406415806e627969cec749", + "chr22_chrrename.vcf.gz.csi:md5,ba370ca13289fee4be59253a1f4609e2" + ] + ], + "versions": [ + "versions.yml:md5,395e1cde3f38a30f5d80769972ba23d8", + "versions.yml:md5,ad4c5338cd27e20789c70e28b8c74a42", + "versions.yml:md5,e576f40503c3506c782228485d06fbf1" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-27T17:21:28.214969089" + } +} \ No newline at end of file diff --git a/subworkflows/local/vcf_chr_check/tests/nextflow.config 
b/subworkflows/local/vcf_chr_check/tests/nextflow.config new file mode 100644 index 00000000..ff02f295 --- /dev/null +++ b/subworkflows/local/vcf_chr_check/tests/nextflow.config @@ -0,0 +1,14 @@ +params { + max_memory = '7.GB' + rename_chr = false +} + +process { + withName: BCFTOOLS_ANNOTATE { + ext.args = [ + "-Oz", + "--no-version" + ].join(' ') + ext.prefix = { "${meta.id}_chrrename" } + } +} diff --git a/subworkflows/local/vcf_chr_check/tests/nextflow_rename.config b/subworkflows/local/vcf_chr_check/tests/nextflow_rename.config new file mode 100644 index 00000000..d048cbcb --- /dev/null +++ b/subworkflows/local/vcf_chr_check/tests/nextflow_rename.config @@ -0,0 +1,14 @@ +params { + max_memory = '7.GB' + rename_chr = true +} + +process { + withName: BCFTOOLS_ANNOTATE { + ext.args = [ + "-Oz", + "--no-version" + ].join(' ') + ext.prefix = { "${meta.id}_chrrename" } + } +} diff --git a/subworkflows/local/vcf_chr_check/tests/tags.yml b/subworkflows/local/vcf_chr_check/tests/tags.yml new file mode 100644 index 00000000..d090629e --- /dev/null +++ b/subworkflows/local/vcf_chr_check/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/vcf_chr_check: + - subworkflows/local/vcf_chr_check/** diff --git a/subworkflows/local/vcf_chr_rename/main.nf b/subworkflows/local/vcf_chr_rename/main.nf index 352f2bd1..84a3b8b9 100644 --- a/subworkflows/local/vcf_chr_rename/main.nf +++ b/subworkflows/local/vcf_chr_rename/main.nf @@ -1,28 +1,39 @@ include { BCFTOOLS_ANNOTATE } from '../../../modules/nf-core/bcftools/annotate/main.nf' -include { BCFTOOLS_INDEX as VCF_INDEX } from '../../../modules/nf-core/bcftools/index/main.nf' +include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main.nf' +include { GAWK as FAITOCHR } from '../../../modules/nf-core/gawk/main.nf' workflow VCF_CHR_RENAME { take: - ch_vcf // channel: [ [id, ref], vcf, csi ] - file_chr_rename // file + ch_vcf // channel: [ [id], vcf, index ] + ch_fasta // channel: [ [id], fasta, fai ] main: ch_versions = 
Channel.empty() + // Generate the chromosome renaming file + FAITOCHR( + ch_fasta.map{ metaG, fasta, fai -> [metaG, fai] }, + Channel.of( + 'BEGIN {FS="\\t"} NR==1 { if ($1 ~ /^chr/) { col1=""; col2="chr" } else { col1="chr"; col2="" } } { sub(/^chr/, "", $1); if ($1 ~ /^[0-9]+|[XYMT]$/) print col1$1, col2$1; else print $1, $1 }' + ).collectFile(name:"program.txt")) + ch_versions = ch_versions.mix(FAITOCHR.out.versions) + // Rename the chromosome without prefix - BCFTOOLS_ANNOTATE(ch_vcf - .combine(Channel.of([[], [], []])) - .combine(Channel.of(file_chr_rename)) + BCFTOOLS_ANNOTATE( + ch_vcf // channel: [ [id], vcf, index ] + .combine(Channel.of([[],[],[]])) + .combine(FAITOCHR.out.output.map{it[1]}) ) + ch_versions = ch_versions.mix(BCFTOOLS_ANNOTATE.out.versions.first()) - VCF_INDEX(BCFTOOLS_ANNOTATE.out.vcf) - ch_versions = ch_versions.mix(VCF_INDEX.out.versions.first()) + BCFTOOLS_INDEX(BCFTOOLS_ANNOTATE.out.vcf) + ch_versions = ch_versions.mix(BCFTOOLS_INDEX.out.versions.first()) - ch_vcf_rename = BCFTOOLS_ANNOTATE.out.vcf - .combine(VCF_INDEX.out.csi) + ch_vcf_renamed = BCFTOOLS_ANNOTATE.out.vcf + .combine(BCFTOOLS_INDEX.out.csi, by:0) emit: - vcf_rename = ch_vcf_rename // [ meta, vcf, csi ] + vcf_renamed = ch_vcf_renamed // [ meta, vcf, csi ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/vcf_chr_rename/tests/main.nf.test b/subworkflows/local/vcf_chr_rename/tests/main.nf.test index d045d36a..d8d9c4e4 100644 --- a/subworkflows/local/vcf_chr_rename/tests/main.nf.test +++ b/subworkflows/local/vcf_chr_rename/tests/main.nf.test @@ -2,6 +2,7 @@ nextflow_workflow { name "Test Subworkflow VCF_CHR_RENAME" script "../main.nf" + config "./nextflow.config" workflow "VCF_CHR_RENAME" @@ -14,18 +15,29 @@ nextflow_workflow { tag "bcftools" tag "bcftools/annotate" tag "bcftools/index" + tag "gawk" test("Should run without error") { - when { workflow { """ - input[0] = Channel.of([ - [id: "input", genome:"GRCh37"], - 
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz", - "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz.tbi" + input[0] = Channel.fromList([ + [ + [id: "multi"], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz",checkIfExists:true), + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz.tbi",checkIfExists:true) + ], + [ + [id: "chr21"], + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz",checkIfExists:true), + file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz.tbi",checkIfExists:true) + ] + ]) + input[1] = Channel.of([ + [id:"GRCh37"], + file("https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/reference_genome/21_22/hs38DH.chr21_22.fa",checkIfExists:true), + file("https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/reference_genome/21_22/hs38DH.chr21_22.fa.fai",checkIfExists:true) ]) - input[1] = file("../../../assets/chr_rename_add.txt", exist: true) """ } } @@ -33,7 +45,7 @@ nextflow_workflow { then { assertAll( { assert workflow.success }, - { assert snapshot(workflow.out.ch_region).match() } + { assert snapshot(workflow.out).match() } ) } } diff --git a/subworkflows/local/vcf_chr_rename/tests/main.nf.test.snap b/subworkflows/local/vcf_chr_rename/tests/main.nf.test.snap index 3aee8a9e..52c5f8fe 100644 --- a/subworkflows/local/vcf_chr_rename/tests/main.nf.test.snap +++ b/subworkflows/local/vcf_chr_rename/tests/main.nf.test.snap @@ -1,10 +1,55 @@ { "Should run without error": { - "content": null, + "content": [ + { + "0": [ + [ + { + "id": "chr21" + }, + 
"chr21_chrrename.vcf.gz:md5,39cd8e316cd9b9282b8289d69d81260b", + "chr21_chrrename.vcf.gz.csi:md5,3bbbb50b0dd3515d380eabe0013cde19" + ], + [ + { + "id": "multi" + }, + "multi_chrrename.vcf.gz:md5,5f6f1ca261270d55eec054368f3d9587", + "multi_chrrename.vcf.gz.csi:md5,5d175780d5611d962430bff3377f649f" + ] + ], + "1": [ + "versions.yml:md5,176431a832f84d4c329f6d1e9c74d203", + "versions.yml:md5,260c4004a4bb0936c43f932e50de9c19", + "versions.yml:md5,3698013e288e15d392e1cd3e22d2022a" + ], + "vcf_renamed": [ + [ + { + "id": "chr21" + }, + "chr21_chrrename.vcf.gz:md5,39cd8e316cd9b9282b8289d69d81260b", + "chr21_chrrename.vcf.gz.csi:md5,3bbbb50b0dd3515d380eabe0013cde19" + ], + [ + { + "id": "multi" + }, + "multi_chrrename.vcf.gz:md5,5f6f1ca261270d55eec054368f3d9587", + "multi_chrrename.vcf.gz.csi:md5,5d175780d5611d962430bff3377f649f" + ] + ], + "versions": [ + "versions.yml:md5,176431a832f84d4c329f6d1e9c74d203", + "versions.yml:md5,260c4004a4bb0936c43f932e50de9c19", + "versions.yml:md5,3698013e288e15d392e1cd3e22d2022a" + ] + } + ], "meta": { "nf-test": "0.8.4", "nextflow": "23.10.1" }, - "timestamp": "2024-03-13T12:47:49.775995" + "timestamp": "2024-03-27T17:18:53.771496074" } -} +} \ No newline at end of file diff --git a/subworkflows/local/vcf_chr_rename/tests/nextflow.config b/subworkflows/local/vcf_chr_rename/tests/nextflow.config index 227aed3d..cf2f7a63 100644 --- a/subworkflows/local/vcf_chr_rename/tests/nextflow.config +++ b/subworkflows/local/vcf_chr_rename/tests/nextflow.config @@ -1,3 +1,13 @@ params { max_memory = '7.GB' } + +process { + withName: BCFTOOLS_ANNOTATE { + ext.args = [ + "-Oz", + "--no-version" + ].join(' ') + ext.prefix = { "${meta.id}_chrrename" } + } +} diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 1cde520b..8535f860 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -22,7 +22,7 @@ include { BAM_DOWNSAMPLE } from '../../subworkflows/local/bam_downs include { COMPUTE_GL as GL_TRUTH } 
from '../../subworkflows/local/compute_gl' include { COMPUTE_GL as GL_INPUT } from '../../subworkflows/local/compute_gl' include { VCF_IMPUTE_GLIMPSE } from '../../subworkflows/nf-core/vcf_impute_glimpse' -include { VCF_CHR_RENAME } from '../../subworkflows/local/vcf_chr_rename' +include { VCF_CHR_CHECK } from '../../subworkflows/local/vcf_chr_check' include { GET_PANEL } from '../../subworkflows/local/get_panel' /* @@ -81,17 +81,11 @@ workflow PHASEIMPUTE { // if (params.step == 'impute' || params.step == 'panel_prep') { // Remove if necessary "chr" - if (params.panel_chr_rename != null) { - print("Need to rename the chromosome prefix of the panel") - VCF_CHR_RENAME(ch_panel, params.panel_chr_rename) - ch_panel = VCF_CHR_RENAME.out.vcf_rename - } - - if (ch_panel.map{it[3] == null}.any()) { - print("Need to compute the sites and tsv files for the panel") - GET_PANEL(ch_panel, ch_fasta) - } + VCF_CHR_CHECK(ch_panel, ch_fasta) + ch_versions = ch_versions.mix(VCF_CHR_CHECK.out.versions.first()) + // Prepare the panel + GET_PANEL(VCF_CHR_CHECK.out.vcf, ch_fasta) ch_versions = ch_versions.mix(GET_PANEL.out.versions.first()) // Output channel of input process @@ -138,12 +132,10 @@ workflow PHASEIMPUTE { ch_impute_output = ch_impute_output.mix(output_glimpse1) } if (params.tools.contains("glimpse2")) { - print("Impute with Glimpse2") error "Glimpse2 not yet implemented" // Glimpse2 subworkflow } if (params.tools.contains("quilt")) { - print("Impute with quilt") error "Quilt not yet implemented" // Quilt subworkflow }