From 01e0927a3ce19cc20338d1fab2a5f2e4262d19cc Mon Sep 17 00:00:00 2001 From: LouisLeNezet Date: Mon, 11 Nov 2024 21:38:24 +0100 Subject: [PATCH] Check for multiple sample before splitting --- modules.json | 5 + .../nf-core/bcftools/query/environment.yml | 5 + modules/nf-core/bcftools/query/main.nf | 56 ++++++++++ modules/nf-core/bcftools/query/meta.yml | 67 ++++++++++++ .../nf-core/bcftools/query/tests/main.nf.test | 101 ++++++++++++++++++ .../bcftools/query/tests/main.nf.test.snap | 55 ++++++++++ .../bcftools/query/tests/nextflow.config | 3 + modules/nf-core/bcftools/query/tests/tags.yml | 2 + subworkflows/local/vcf_split_bcftools/main.nf | 28 ++++- .../vcf_split_bcftools/tests/main.nf.test | 59 +++++++++- .../tests/main.nf.test.snap | 69 ++++++++++++ .../vcf_split_bcftools/tests/nextflow.config | 7 ++ 12 files changed, 449 insertions(+), 8 deletions(-) create mode 100644 modules/nf-core/bcftools/query/environment.yml create mode 100644 modules/nf-core/bcftools/query/main.nf create mode 100644 modules/nf-core/bcftools/query/meta.yml create mode 100644 modules/nf-core/bcftools/query/tests/main.nf.test create mode 100644 modules/nf-core/bcftools/query/tests/main.nf.test.snap create mode 100644 modules/nf-core/bcftools/query/tests/nextflow.config create mode 100644 modules/nf-core/bcftools/query/tests/tags.yml diff --git a/modules.json b/modules.json index 6db896db..664e530a 100644 --- a/modules.json +++ b/modules.json @@ -50,6 +50,11 @@ "installed_by": ["modules"], "patch": "modules/nf-core/bcftools/pluginsplit/bcftools-pluginsplit.diff" }, + "bcftools/query": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "bcftools/stats": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", diff --git a/modules/nf-core/bcftools/query/environment.yml b/modules/nf-core/bcftools/query/environment.yml new file mode 100644 index 00000000..5c00b116 --- /dev/null +++ 
b/modules/nf-core/bcftools/query/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::bcftools=1.20 diff --git a/modules/nf-core/bcftools/query/main.nf b/modules/nf-core/bcftools/query/main.nf new file mode 100644 index 00000000..58019f4d --- /dev/null +++ b/modules/nf-core/bcftools/query/main.nf @@ -0,0 +1,56 @@ +process BCFTOOLS_QUERY { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0': + 'biocontainers/bcftools:1.20--h8b25389_0' }" + + input: + tuple val(meta), path(vcf), path(tbi) + path regions + path targets + path samples + + output: + tuple val(meta), path("*.${suffix}"), emit: output + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "txt" + def regions_file = regions ? "--regions-file ${regions}" : "" + def targets_file = targets ? "--targets-file ${targets}" : "" + def samples_file = samples ? 
"--samples-file ${samples}" : "" + """ + bcftools query \\ + $regions_file \\ + $targets_file \\ + $samples_file \\ + $args \\ + $vcf \\ + > ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "txt" + """ + touch ${prefix}.${suffix} \\ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/query/meta.yml b/modules/nf-core/bcftools/query/meta.yml new file mode 100644 index 00000000..279b3205 --- /dev/null +++ b/modules/nf-core/bcftools/query/meta.yml @@ -0,0 +1,67 @@ +name: bcftools_query +description: Extracts fields from VCF or BCF files and outputs them in user-defined + format. +keywords: + - query + - variant calling + - bcftools + - VCF +tools: + - query: + description: | + Extracts fields from VCF or BCF files and outputs them in user-defined format. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] + identifier: biotools:bcftools +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: | + The vcf file to be qeuried. + pattern: "*.{vcf.gz, vcf}" + - tbi: + type: file + description: | + The tab index for the VCF file to be inspected. + pattern: "*.tbi" + - - regions: + type: file + description: | + Optionally, restrict the operation to regions listed in this file. 
+ - - targets: + type: file + description: | + Optionally, restrict the operation to regions listed in this file (doesn't rely upon index files) + - - samples: + type: file + description: | + Optional, file of sample names to be included or excluded. + e.g. 'file.tsv' +output: + - output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.${suffix}": + type: file + description: BCFTools query output file + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@abhi18av" + - "@drpatelh" +maintainers: + - "@abhi18av" + - "@drpatelh" diff --git a/modules/nf-core/bcftools/query/tests/main.nf.test b/modules/nf-core/bcftools/query/tests/main.nf.test new file mode 100644 index 00000000..39e67b35 --- /dev/null +++ b/modules/nf-core/bcftools/query/tests/main.nf.test @@ -0,0 +1,101 @@ +nextflow_process { + + name "Test Process BCFTOOLS_QUERY" + script "../main.nf" + process "BCFTOOLS_QUERY" + + tag "modules" + tag "modules_nfcore" + tag "bcftools" + tag "bcftools/query" + + config "./nextflow.config" + + test("sarscov2 - [vcf, tbi], [], [], []") { + + when { + process { + """ + input[0] = [ + [ id:'out' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.output, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [vcf, tbi], vcf, tsv, []") { + + when { + process { + """ + input[0] = [ + [ id:'out' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test3.vcf.gz', checkIfExists: true) + input[2] = file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test2.targets.tsv.gz', checkIfExists: true) + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.output, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [vcf, tbi], [], [], [] - stub") { + + when { + process { + """ + input[0] = [ + [ id:'out' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.output[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bcftools/query/tests/main.nf.test.snap b/modules/nf-core/bcftools/query/tests/main.nf.test.snap new file mode 100644 index 00000000..3ead1f2c --- /dev/null +++ b/modules/nf-core/bcftools/query/tests/main.nf.test.snap @@ -0,0 +1,55 @@ +{ + "sarscov2 - [vcf, tbi], vcf, tsv, []": { + "content": [ + [ + [ + { + "id": "out" + }, + "out.txt:md5,75a6bd0084e2e1838cf7baba11b99d19" + ] + ], + [ + "versions.yml:md5,3d93ea9cd5d314743254618b49e4bd16" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-31T15:15:44.916249758" + }, + "sarscov2 - [vcf, tbi], [], [], [] - stub": { + "content": [ + "out.txt", + [ + "versions.yml:md5,3d93ea9cd5d314743254618b49e4bd16" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-31T15:15:49.932359271" + }, + "sarscov2 - [vcf, tbi], [], [], []": { + "content": 
[ + [ + [ + { + "id": "out" + }, + "out.txt:md5,87a2ab194e1ee3219b44e58429ec3307" + ] + ], + [ + "versions.yml:md5,3d93ea9cd5d314743254618b49e4bd16" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-31T15:15:39.930697926" + } +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/query/tests/nextflow.config b/modules/nf-core/bcftools/query/tests/nextflow.config new file mode 100644 index 00000000..da81c2a0 --- /dev/null +++ b/modules/nf-core/bcftools/query/tests/nextflow.config @@ -0,0 +1,3 @@ +process { + ext.args = "-f '%CHROM %POS %REF %ALT[%SAMPLE=%GT]'" +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/query/tests/tags.yml b/modules/nf-core/bcftools/query/tests/tags.yml new file mode 100644 index 00000000..fb9455cb --- /dev/null +++ b/modules/nf-core/bcftools/query/tests/tags.yml @@ -0,0 +1,2 @@ +bcftools/query: + - "modules/nf-core/bcftools/query/**" diff --git a/subworkflows/local/vcf_split_bcftools/main.nf b/subworkflows/local/vcf_split_bcftools/main.nf index 524b3e88..6f8cec45 100644 --- a/subworkflows/local/vcf_split_bcftools/main.nf +++ b/subworkflows/local/vcf_split_bcftools/main.nf @@ -1,4 +1,5 @@ include { BCFTOOLS_PLUGINSPLIT } from '../../../modules/nf-core/bcftools/pluginsplit' +include { BCFTOOLS_QUERY } from '../../../modules/nf-core/bcftools/query/main' workflow VCF_SPLIT_BCFTOOLS { take: @@ -8,7 +9,25 @@ workflow VCF_SPLIT_BCFTOOLS { ch_versions = Channel.empty() - BCFTOOLS_PLUGINSPLIT(ch_vcf, [], [], [], []) + BCFTOOLS_QUERY(ch_vcf, [], [], []) // List samples + + // Pair each VCF with its sample names; remainder: true keeps VCFs without any sample (null in it[3]) so they reach the error branch + + ch_samples = ch_vcf + .join(BCFTOOLS_QUERY.out.output.splitText().groupTuple(), remainder: true) + .branch { + one : it[3]?.size() == 1 + multiple : it[3]?.size() > 1 + other : true + } + + ch_samples.other.map { + def file = it[1] + def id = it[0].id + error "File ${file} with id : ${id} does not have any samples information" + } + +
BCFTOOLS_PLUGINSPLIT(ch_samples.multiple.map{it[0..2]}, [], [], [], []) ch_versions = ch_versions.mix(BCFTOOLS_PLUGINSPLIT.out.versions.first()) ch_vcf_samples = BCFTOOLS_PLUGINSPLIT.out.vcf @@ -19,8 +38,11 @@ workflow VCF_SPLIT_BCFTOOLS { .transpose() .map{metaITC, tbi -> [metaITC + [id: tbi.getBaseName().tokenize(".")[0]], tbi]} - ch_vcf_tbi_samples = ch_vcf_samples - .join(ch_tbi_samples) + ch_vcf_tbi_samples = ch_samples.one + .map{ it[0..2] } + .mix(ch_vcf_samples + .join(ch_tbi_samples) + ) emit: vcf_tbi = ch_vcf_tbi_samples // channel: [ [id, chr, tools], vcf, index ] diff --git a/subworkflows/local/vcf_split_bcftools/tests/main.nf.test b/subworkflows/local/vcf_split_bcftools/tests/main.nf.test index 46c09660..0f441397 100644 --- a/subworkflows/local/vcf_split_bcftools/tests/main.nf.test +++ b/subworkflows/local/vcf_split_bcftools/tests/main.nf.test @@ -15,7 +15,7 @@ nextflow_workflow { tag "bcftools" tag "bcftools/split" - test("Split vcf file") { + test("Split multiple vcf file") { setup { run("BCFTOOLS_MERGE") { script "../../../../modules/nf-core/bcftools/merge/main.nf" @@ -43,10 +43,6 @@ nextflow_workflow { } } when { - params { - max_cpus = 2 - max_memory = '2.GB' - } workflow { """ input[0] = BCFTOOLS_MERGE.out.vcf.join(BCFTOOLS_MERGE.out.tbi) @@ -74,4 +70,57 @@ nextflow_workflow { ) } } + + test("Split one sample vcf file") { + when { + workflow { + """ + input[0] = Channel.of([ + [id: 'NA12878'], + file(params.pipelines_testdata_base_path + "hum_data/individuals/NA12878/NA12878.s_imputed.bcf", checkIfExists:true), + file(params.pipelines_testdata_base_path + "hum_data/individuals/NA12878/NA12878.s_imputed.bcf.csi", checkIfExists:true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.versions, + workflow.out.vcf_tbi.collect{[ + it[0], + path(it[1]).getFileName().toString(), + path(it[2]).getFileName().toString() + ] } + ).match() } + ) + } + } + + test("Split no sample vcf file") { + tag
"error" + when { + workflow { + """ + input[0] = Channel.of([ + [id: 'dbsnp_146.hg38'], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz", checkIfExists:true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi", checkIfExists:true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.failed }, + { assert snapshot( + workflow.out.stdout + ).match() + } + ) + } + } } diff --git a/subworkflows/local/vcf_split_bcftools/tests/main.nf.test.snap b/subworkflows/local/vcf_split_bcftools/tests/main.nf.test.snap index d3bd681e..bfcf0494 100644 --- a/subworkflows/local/vcf_split_bcftools/tests/main.nf.test.snap +++ b/subworkflows/local/vcf_split_bcftools/tests/main.nf.test.snap @@ -1,4 +1,73 @@ { + "Split one sample vcf file": { + "content": [ + [ + + ], + [ + [ + { + "id": "NA12878" + }, + "NA12878.s_imputed.bcf", + "NA12878.s_imputed.bcf.csi" + ] + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-11T16:06:03.822763802" + }, + "Split no sample vcf file": { + "content": null, + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-11T16:06:10.131104388" + }, + "Split multiple vcf file": { + "content": [ + [ + "versions.yml:md5,6c3351d97e3a99f7a7a3231fc49f92e2" + ], + [ + [ + { + "id": "NA12878" + }, + "NA12878.vcf.gz", + "NA12878.vcf.gz.tbi" + ], + [ + { + "id": "NA19401" + }, + "NA19401.vcf.gz", + "NA19401.vcf.gz.tbi" + ], + [ + { + "id": "NA20359" + }, + "NA20359.vcf.gz", + "NA20359.vcf.gz.tbi" + ] + ], + [ + "VcfFile [chromosomes=[chr21, chr22], sampleCount=1, variantCount=1739, phased=true, phasedAutodetect=true]", + "VcfFile [chromosomes=[chr21, chr22], sampleCount=1, variantCount=1739, phased=true, phasedAutodetect=true]", + "VcfFile [chromosomes=[chr21, chr22], sampleCount=1, variantCount=1739, phased=true, phasedAutodetect=true]" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow":
"24.10.0" + }, + "timestamp": "2024-11-11T14:42:48.076610625" + }, "Split vcf file": { "content": [ [ diff --git a/subworkflows/local/vcf_split_bcftools/tests/nextflow.config b/subworkflows/local/vcf_split_bcftools/tests/nextflow.config index 523678dc..350757c2 100644 --- a/subworkflows/local/vcf_split_bcftools/tests/nextflow.config +++ b/subworkflows/local/vcf_split_bcftools/tests/nextflow.config @@ -1,7 +1,14 @@ process { + resourceLimits = [cpus: 2, memory: '2.GB'] + withName: BCFTOOLS_MERGE { ext.args = ["--write-index=tbi", "--output-type z"].join(' ') } + + withName: BCFTOOLS_QUERY { + ext.args = "--list-samples" + } + withName: BCFTOOLS_PLUGINSPLIT { ext.args = ["--write-index=tbi", "--output-type z"].join(' ') }