From e196577fe596104d176a2c5e9995b229ab721155 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Wed, 24 Apr 2024 17:48:54 +0000 Subject: [PATCH 01/58] add stitch tool --- conf/test_stitch.config | 33 +++++ modules.json | 5 + modules/nf-core/stitch/environment.yml | 7 + modules/nf-core/stitch/main.nf | 86 +++++++++++++ modules/nf-core/stitch/meta.yml | 120 ++++++++++++++++++ nextflow.config | 9 +- nextflow_schema.json | 2 +- .../bam_impute_stitch/bam_impute_stitch.nf | 28 ++++ workflows/phaseimpute/main.nf | 27 +++- 9 files changed, 311 insertions(+), 6 deletions(-) create mode 100644 conf/test_stitch.config create mode 100644 modules/nf-core/stitch/environment.yml create mode 100644 modules/nf-core/stitch/main.nf create mode 100644 modules/nf-core/stitch/meta.yml create mode 100644 subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf diff --git a/conf/test_stitch.config b/conf/test_stitch.config new file mode 100644 index 00000000..a10f925e --- /dev/null +++ b/conf/test_stitch.config @@ -0,0 +1,33 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/phaseimpute -profile test_stitch,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Minimal Stitch Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function using the tool STITCH' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '2.GB' + max_time = '1.h' + + // Input data + input = "${projectDir}/tests/csv/sample_bam.csv" + input_region = "${projectDir}/tests/csv/region.csv" + + // Genome references + fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/reference_genome/21_22/hs38DH.chr21_22.fa" + phased = true + + // Impute parameters + step = "impute" + tools = "stitch" +} diff --git a/modules.json b/modules.json index 708fac05..1822b3a9 100644 --- a/modules.json +++ b/modules.json @@ -146,6 +146,11 @@ "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["vcf_phase_shapeit5"] }, + "stitch": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "tabix/bgzip": { "branch": "master", "git_sha": "09d3c8c29b31a2dfd610305b10550f0e1dbcd4a9", diff --git a/modules/nf-core/stitch/environment.yml b/modules/nf-core/stitch/environment.yml new file mode 100644 index 00000000..3facc1bc --- /dev/null +++ b/modules/nf-core/stitch/environment.yml @@ -0,0 +1,7 @@ +name: stitch +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::r-stitch=1.6.10 diff --git a/modules/nf-core/stitch/main.nf b/modules/nf-core/stitch/main.nf new file mode 100644 index 00000000..2f76987f --- /dev/null +++ b/modules/nf-core/stitch/main.nf @@ -0,0 +1,86 @@ +process STITCH { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/r-stitch:1.6.10--r43h06b5641_0': + 'biocontainers/r-stitch:1.6.10--r43h06b5641_0' }" + + input: + tuple val(meta) , path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen) + tuple val(meta2), path(collected_crams), path(collected_crais), path(cramlist) + tuple val(meta3), path(fasta), path(fasta_fai) + val seed + + output: + tuple val(meta), path("input", type: "dir") , emit: input + tuple val(meta), path("RData", type: "dir") , emit: rdata + tuple val(meta), path("plots", type: "dir") , emit: plots , optional: { generate_input_only } + tuple val(meta), path("*.vcf.gz") , emit: vcf , optional: { generate_input_only || bgen_output } + tuple val(meta), path("*.bgen") , emit: bgen , optional: { generate_input_only || !bgen_output } + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def args2 = task.ext.args2 ?: "" + def generate_input_only = args2.contains( "--generateInputOnly TRUE" ) + def bgen_output = args2.contains( "--output_format bgen" ) + def reads_ext = collected_crams ? collected_crams.extension.unique() : [] + def rsync_cmd = rdata ? "rsync -rL ${rdata}/ RData" : "" + def stitch_cmd = seed ? "Rscript <(cat \$(which STITCH.R) | tail -n +2 | cat <(echo 'set.seed(${seed})') -)" : "STITCH.R" + def cramlist_cmd = cramlist && reads_ext == ["cram"] ? "--cramlist ${cramlist}" : "" + def bamlist_cmd = cramlist && reads_ext == ["bam" ] ? "--bamlist ${cramlist}" : "" + def reference_cmd = fasta ? "--reference ${fasta}" : "" + def regenerate_input_cmd = input && rdata && !cramlist ? "--regenerateInput FALSE --originalRegionName ${chromosome_name}" : "" + def rsync_version_cmd = rdata ? 
"rsync: \$(rsync --version | head -n1 | sed 's/^rsync version //; s/ .*\$//')" : "" + """ + ${rsync_cmd} ${args} + + ${stitch_cmd} \\ + --chr ${chromosome_name} \\ + --posfile ${posfile} \\ + --outputdir . \\ + --nCores ${task.cpus} \\ + --K ${K} \\ + --nGen ${nGen} \\ + ${cramlist_cmd} \\ + ${bamlist_cmd} \\ + ${reference_cmd} \\ + ${regenerate_input_cmd} \\ + ${args2} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ${rsync_version_cmd} + r-base: \$(Rscript -e "cat(strsplit(R.version[['version.string']], ' ')[[1]][3])") + r-stitch: \$(Rscript -e "cat(as.character(utils::packageVersion('STITCH')))") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def args2 = task.ext.args2 ?: "" + def generate_input_only = args2.contains( "--generateInputOnly TRUE" ) + def generate_plots_cmd = !generate_input_only ? "mkdir plots" : "" + def generate_vcf_cmd = !generate_input_only ? "touch ${prefix}.vcf.gz" : "" + def rsync_version_cmd = rdata ? "rsync: \$(rsync --version | head -n1 | sed 's/^rsync version //; s/ .*\$//')" : "" + """ + touch input + touch RData + ${generate_plots_cmd} + ${generate_vcf_cmd} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ${rsync_version_cmd} + r-base: \$(Rscript -e "cat(strsplit(R.version[['version.string']], ' ')[[1]][3])") + r-stitch: \$(Rscript -e "cat(as.character(utils::packageVersion('STITCH')))") + END_VERSIONS + """ +} diff --git a/modules/nf-core/stitch/meta.yml b/modules/nf-core/stitch/meta.yml new file mode 100644 index 00000000..a36d61cd --- /dev/null +++ b/modules/nf-core/stitch/meta.yml @@ -0,0 +1,120 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "stitch" +description: "STITCH is an R program for reference panel free, read aware, low coverage sequencing genotype imputation. 
STITCH runs on a set of samples with sequencing reads in BAM format, as well as a list of positions to genotype, and outputs imputed genotypes in VCF format." +keywords: + - imputation + - genomics + - vcf + - bgen + - cram + - bam + - sam +tools: + - "stitch": + description: "STITCH - Sequencing To Imputation Through Constructing Haplotypes" + homepage: "https://github.com/rwdavies/stitch" + documentation: "https://github.com/rwdavies/stitch" + tool_dev_url: "https://github.com/rwdavies/stitch" + doi: "10.1038/ng.3594" + licence: "['GPL v3']" +input: + - meta: + type: map + description: | + Groovy Map containing information about the set of positions to run the imputation over + e.g. `[ id:'test' ]` + - posfile: + type: file + description: | + Tab-separated file describing the variable positions to be used for imputation. Refer to the documentation for the `--posfile` argument of STITCH for more information. + pattern: "*.tsv" + - input: + type: directory + description: | + Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag. + pattern: "input" + - rdata: + type: directory + description: | + Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag. + pattern: "RData" + - chromosome_name: + type: string + description: Name of the chromosome to impute. Should match a chromosome name in the reference genome. + - K: + type: integer + description: Number of ancestral haplotypes to use for imputation. Refer to the documentation for the `--K` argument of STITCH for more information. + - nGen: + type: integer + description: Number of generations since founding of the population to use for imputation. Refer to the documentation for the `--nGen` argument of STITCH for more information. 
+ - meta2: + type: map + description: | + Groovy Map containing information about the set of samples + e.g. `[ id:'test' ]` + - collected_crams: + type: file + description: List of sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - collected_crais: + type: file + description: List of BAM/CRAM/SAM index files + pattern: "*.{bai,crai,sai}" + - cramlist: + type: file + description: | + Text file with the path to the cram files to use in imputation, one per line. Since the cram files are staged to the working directory for the process, this file should just contain the file names without any pre-pending path. + pattern: "*.txt" + - meta3: + type: map + description: | + Groovy Map containing information about the reference genome used + e.g. `[ id:'test' ]` + - fasta: + type: file + description: FASTA reference genome file + pattern: "*.{fa,fasta}" + - fasta_fai: + type: file + description: FASTA index file + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - input: + type: directory + description: | + Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag. + pattern: "input" + - rdata: + type: directory + description: | + Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag. + pattern: "RData" + - plots: + type: directory + description: | + Folder containing plots produced by STITCH during imputation. Which plots are produced depends on the command-line arguments passed to STITCH. + pattern: "plots" + - vcf: + type: file + description: | + Imputed genotype calls for the positions in `posfile`, in vcf format. This is the default output. 
+ pattern: ".vcf.gz" + - bgen: + type: file + description: | + Imputed genotype calls for the positions in `posfile`, in bgen format. This is produced if `--output_format bgen` is specified. + pattern: ".bgen" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@saulpierotti" +maintainers: + - "@saulpierotti" diff --git a/nextflow.config b/nextflow.config index b459cd67..f75c7751 100644 --- a/nextflow.config +++ b/nextflow.config @@ -193,10 +193,11 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } - test_sim { includeConfig 'conf/test_sim.config' } - test_quilt { includeConfig 'conf/test_quilt.config' } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } + test_sim { includeConfig 'conf/test_sim.config' } + test_quilt { includeConfig 'conf/test_quilt.config' } + test_stitch { includeConfig 'conf/test_stitch.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile diff --git a/nextflow_schema.json b/nextflow_schema.json index afe40f62..584aaf64 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -113,7 +113,7 @@ "type": "string", "description": "Step to run.", "fa_icon": "fas fa-step-forward", - "enum": ["glimpse1", "glimpse2", "quilt"] + "enum": ["glimpse1", "glimpse2", "quilt", "stitch"] } } }, diff --git a/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf b/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf new file mode 100644 index 00000000..6069ed0b --- /dev/null +++ b/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf @@ -0,0 +1,28 @@ +include { BCFTOOLS_QUERY } from '../../../modules/nf-core/bcftools/query/main' +include { STITCH } from '../../../modules/nf-core/stitch/main' + +workflow BAM_IMPUTE_STITCH { + + take: + ch_input // channel: [ val(meta), bam, bai ] 
+ ch_panel_sites + + main: + + ch_versions = Channel.empty() + + // Convert position file to tab-separated file + BCFTOOLS_QUERY(ch_panel_sites) + ch_posfile = BCFTOOLS_QUERY.out.output + + + // Run STITCH + STITCH( stitch_input, GET_READS.out, reference, seed ) + + + + emit: + ch_vcf_tbi // channel: [ meta, vcf, tbi ] + versions = ch_versions // channel: [ versions.yml ] + +} diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index d07a1b8b..2f68edea 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -140,8 +140,33 @@ workflow PHASEIMPUTE { error "Glimpse2 not yet implemented" // Glimpse2 subworkflow } + + if (params.tools.contains("stitch")) { + print("Impute with STITCH") + // STITCH subworkflow + + // Make bamlist from bam input + ch_bamlist = ch_input + .map { it[1].tokenize('/').last() } + .collectFile( name: "bamlist.txt", newLine: true, sort: true ) + + // Get chromosomes + ch_chromosomes = ch_fasta.map{it -> it[2]} + .splitCsv(header: ["chr", "size", "offset", "lidebase", "linewidth", "qualoffset"], sep: "\t") + .map{it -> [chr:it.chr]} + + ch_chromosomes.dump(tag:"ch_chromosomes") + + // Prepare input for STITCH + + // Impute with STITCH + //BAM_IMPUTE_STITCH ( ch_input_stitch, GET_PANEL.out.ch_panel_sites ) + + + } + if (params.tools.contains("quilt")) { - print("Impute with quilt") + print("Impute with QUILT") // Quilt subworkflow From 87cce0ff46b44f040e165bf80b8bb5d016da3b1a Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 25 Apr 2024 22:49:53 +0000 Subject: [PATCH 02/58] simplify stitch subworkflows --- workflows/phaseimpute/main.nf | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 3a1dd342..47965904 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -34,6 +34,10 @@ include { VCF_CONCATENATE_BCFTOOLS 
as CONCAT_IMPUT } from '../../subworkflows/ include { VCF_CONCATENATE_BCFTOOLS as CONCAT_TRUTH } from '../../subworkflows/local/vcf_concatenate_bcftools' include { VCF_CONCATENATE_BCFTOOLS as CONCAT_PANEL } from '../../subworkflows/local/vcf_concatenate_bcftools' +include { PREPARE_INPUT_STITCH } from '../../subworkflows/local/prepare_input_stitch/prepare_input_stitch' +include { BAM_IMPUTE_STITCH } from '../../subworkflows/local/bam_impute_stitch/bam_impute_stitch' + + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -172,25 +176,17 @@ workflow PHASEIMPUTE { if (params.tools.contains("stitch")) { print("Impute with STITCH") - // STITCH subworkflow - - // Make bamlist from bam input - ch_bamlist = ch_input - .map { it[1].tokenize('/').last() } - .collectFile( name: "bamlist.txt", newLine: true, sort: true ) - - // Get chromosomes - ch_chromosomes = ch_fasta.map{it -> it[2]} - .splitCsv(header: ["chr", "size", "offset", "lidebase", "linewidth", "qualoffset"], sep: "\t") - .map{it -> [chr:it.chr]} - - ch_chromosomes.dump(tag:"ch_chromosomes") - // Prepare input for STITCH + // Prepare inputs + PREPARE_INPUT_STITCH(GET_PANEL.out.panel_sites, ch_fasta, ch_input_impute) // Impute with STITCH - //BAM_IMPUTE_STITCH ( ch_input_stitch, GET_PANEL.out.ch_panel_sites ) + BAM_IMPUTE_STITCH ( PREPARE_INPUT_STITCH.out.stitch_parameters, + PREPARE_INPUT_STITCH.out.stitch_samples, + ch_fasta ) + // Output channel to concat + ch_impute_output = ch_impute_output.mix(BAM_IMPUTE_STITCH.out.vcf_tbi) } From ebcea37b074e8fd7a03f777af7ba0bdb76a56fe9 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 25 Apr 2024 22:50:37 +0000 Subject: [PATCH 03/58] add stitch config --- conf/steps/imputation_stitch.config | 78 +++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 conf/steps/imputation_stitch.config diff --git a/conf/steps/imputation_stitch.config 
b/conf/steps/imputation_stitch.config new file mode 100644 index 00000000..8cdf1a2e --- /dev/null +++ b/conf/steps/imputation_stitch.config @@ -0,0 +1,78 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. +---------------------------------------------------------------------------------------- +*/ + +process { + + withName: CUSTOM_DUMPSOFTWAREVERSIONS { + publishDir = [ + path: { "${params.outdir}/pipeline_info" }, + mode: params.publish_dir_mode, + pattern: '*_versions.yml' + ] + } + + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:GAWK' { + ext.args = "'{ key = \$1 FS \$2 } !seen[key]++'" + ext.prefix = { "${meta.id}_${meta.chr}_no_multiallelic" } + ext.suffix = ".txt" + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_NORM' { + ext.args = '-m +any --output-type z' + ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } + cpus = 2 + memory = 400.MB + maxRetries = 2 + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_VIEW' { + ext.args = '-v snps -Oz' + ext.prefix = { "${meta.id}_${meta.chr}_biallelic" } + cpus = 2 + memory = 400.MB + maxRetries = 2 + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_INDEX' { + cpus = 2 + memory = 400.MB + maxRetries = 2 + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_INDEX_2' { + ext.args = '--tbi' + cpus = 2 + memory = 400.MB + maxRetries = 2 + } + + + + + 
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_QUERY' { + ext.args = [ + "-f'%CHROM\t%POS\t%REF\t%ALT\\n'", + ].join(' ') + ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_STITCH:BCFTOOLS_INDEX' { + ext.args = '--tbi' + cpus = 2 + memory = 400.MB + maxRetries = 2 + } + + + +} From 395f17fc29f8fbacf921ea8c455b4243945e3672 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 25 Apr 2024 22:50:53 +0000 Subject: [PATCH 04/58] add stitch parameters --- nextflow.config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nextflow.config b/nextflow.config index 6e4d49ee..782f484a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -51,6 +51,10 @@ params { ngen = 100 buffer = 10000 + // STITCH + k_val = 2 + seed = 1 + // Boilerplate options outdir = null publish_dir_mode = 'copy' @@ -290,6 +294,7 @@ includeConfig 'conf/steps/panel_prep.config' includeConfig 'conf/steps/imputation.config' includeConfig 'conf/steps/imputation_glimpse1.config' includeConfig 'conf/steps/imputation_quilt.config' +includeConfig 'conf/steps/imputation_stitch.config' // validation step includeConfig 'conf/steps/validation.config' From 2001c0831c9ceccbf93b7d1bd6df1ab971b69c9d Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 25 Apr 2024 22:51:10 +0000 Subject: [PATCH 05/58] re-add panel because it breaks otherwise --- conf/test_stitch.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/test_stitch.config b/conf/test_stitch.config index a10f925e..3541a74c 100644 --- a/conf/test_stitch.config +++ b/conf/test_stitch.config @@ -26,6 +26,7 @@ params { // Genome references fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/reference_genome/21_22/hs38DH.chr21_22.fa" phased = true + panel = "${projectDir}/tests/csv/panel.csv" // Impute parameters step = "impute" 
From 1dc8c49c52fc0984ad72a35e060ab8037cc89b04 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 25 Apr 2024 22:52:48 +0000 Subject: [PATCH 06/58] export vcf and tbi --- .../bam_impute_stitch/bam_impute_stitch.nf | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf b/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf index 6069ed0b..33a76b68 100644 --- a/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf +++ b/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf @@ -1,28 +1,31 @@ -include { BCFTOOLS_QUERY } from '../../../modules/nf-core/bcftools/query/main' include { STITCH } from '../../../modules/nf-core/stitch/main' +include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main' + workflow BAM_IMPUTE_STITCH { take: - ch_input // channel: [ val(meta), bam, bai ] - ch_panel_sites + ch_parameters // channel: [ val(meta), bam, bai ] + ch_samples + ch_fasta main: ch_versions = Channel.empty() - // Convert position file to tab-separated file - BCFTOOLS_QUERY(ch_panel_sites) - ch_posfile = BCFTOOLS_QUERY.out.output - - // Run STITCH - STITCH( stitch_input, GET_READS.out, reference, seed ) + seed = params.seed + STITCH( ch_parameters, ch_samples, ch_fasta, seed ) + + // Index imputed annotated VCF + BCFTOOLS_INDEX(STITCH.out.vcf) + // Join VCFs and TBIs + ch_vcf_tbi = STITCH.out.vcf.join(BCFTOOLS_INDEX.out.tbi) emit: - ch_vcf_tbi // channel: [ meta, vcf, tbi ] + vcf_tbi = ch_vcf_tbi // channel: [ meta, vcf, tbi ] versions = ch_versions // channel: [ versions.yml ] } From 4c94d76e2b35852bf0062a32e9127ad524d84b52 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 25 Apr 2024 22:53:43 +0000 Subject: [PATCH 07/58] align --- subworkflows/local/vcf_concatenate_bcftools/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/subworkflows/local/vcf_concatenate_bcftools/main.nf b/subworkflows/local/vcf_concatenate_bcftools/main.nf index bc85d146..6137679d 100644 --- a/subworkflows/local/vcf_concatenate_bcftools/main.nf +++ b/subworkflows/local/vcf_concatenate_bcftools/main.nf @@ -12,7 +12,7 @@ workflow VCF_CONCATENATE_BCFTOOLS { // Remove chromosome from meta ch_vcf_tbi_grouped = ch_vcf_tbi - .map{ meta, vcf, tbi -> [['id' : meta.id], vcf, tbi] } + .map{ meta, vcf, tbi -> [['id' : meta.id], vcf, tbi] } // Group by ID ch_vcf_tbi_grouped = ch_vcf_tbi_grouped.groupTuple( by:0 ) From 20c1999adbd432530d83c9ba14ccfd2fbd44cfe0 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 25 Apr 2024 22:54:14 +0000 Subject: [PATCH 08/58] export panel sites to be used in stitch --- subworkflows/local/get_panel/main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/get_panel/main.nf b/subworkflows/local/get_panel/main.nf index 2f40dfd6..d9178c12 100644 --- a/subworkflows/local/get_panel/main.nf +++ b/subworkflows/local/get_panel/main.nf @@ -84,5 +84,6 @@ workflow GET_PANEL { emit: panel = ch_panel // channel: [ [panel, chr], norm, n_index, sites, s_index, tsv, t_index, phased, p_index] + panel_sites = ch_panel_sites versions = ch_versions // channel: [ versions.yml ] } From f19f89226ddbf6bc573fa7362994b47e8f4b105f Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 25 Apr 2024 22:54:44 +0000 Subject: [PATCH 09/58] preprocess data for stitch --- .../prepare_input_stitch.nf | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf diff --git a/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf new file mode 100644 index 00000000..07b50380 --- /dev/null +++ b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf @@ 
-0,0 +1,64 @@ +include { BCFTOOLS_QUERY } from '../../../modules/nf-core/bcftools/query/main' +include { BCFTOOLS_NORM } from '../../../modules/nf-core/bcftools/norm/main' +include { GAWK } from '../../../modules/nf-core/gawk' + + +workflow PREPARE_INPUT_STITCH { + + take: + ch_panel_sites + ch_fasta + ch_input_impute + + main: + + ch_versions = Channel.empty() + + // Prepare posfile and parameters for STITCH + // Convert position file to tab-separated file + BCFTOOLS_QUERY(ch_panel_sites, [], [], []) + ch_posfile = BCFTOOLS_QUERY.out.output + + // Remove multiallelic positions from tsv + GAWK(ch_posfile, []) + + // Get chromosomes of posfile + ch_posfile = GAWK.out.output.map{meta, posfile -> return[['chr': meta.chr], posfile]} + + // Get chromosomes of fasta + ch_chromosomes = ch_fasta.map{it -> it[2]} + .splitCsv(header: ["chr", "size", "offset", "lidebase", "linewidth", "qualoffset"], sep: "\t") + .map{it -> return [[chr: it.chr], it.chr]} + + // Combine channels + def input_empty = [[]] + def rdata_empty = [[]] + k_val = params.k_val + ngen = params.ngen + + // Make final channel with parameters + stitch_parameters = ch_posfile.map { it + input_empty + rdata_empty} + .join(ch_chromosomes) + .map { it + k_val + ngen} + + // Prepare sample files for STITCH + // Group input by ID + ch_bam_bai = ch_input_impute.map {meta, bam, bai -> [[meta.id], bam, bai]}.unique() + + // Make bamlist from bam input + ch_bamlist = ch_bam_bai + .map {it[1].tokenize('/').last()} + .collectFile(name: "bamlist.txt", newLine: true, sort: true) + + // Collect all files + stitch_samples = ch_bam_bai.map {meta, bam, bai -> [["id": "all_samples"], bam, bai]} + .groupTuple() + .combine(ch_bamlist) + .collect() + + emit: + stitch_parameters + stitch_samples + versions = ch_versions // channel: [ versions.yml ] + +} From b22e6c4effa5fc84736fa5def897512c6b35a32b Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 25 Apr 2024 22:57:59 +0000 
Subject: [PATCH 10/58] change meta due to output --- modules.json | 147 +++++++++++++++++++++-------- modules/nf-core/stitch/main.nf | 4 +- modules/nf-core/stitch/stitch.diff | 16 ++++ 3 files changed, 128 insertions(+), 39 deletions(-) create mode 100644 modules/nf-core/stitch/stitch.diff diff --git a/modules.json b/modules.json index 2a4bbc99..05427c6f 100644 --- a/modules.json +++ b/modules.json @@ -8,19 +8,25 @@ "bcftools/annotate": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/bcftools/annotate/bcftools-annotate.diff" }, "bcftools/concat": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/bcftools/concat/bcftools-concat.diff" }, "bcftools/convert": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bcftools/index": { "branch": "master", @@ -35,141 +41,196 @@ "bcftools/mpileup": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/bcftools/mpileup/bcftools-mpileup.diff" }, "bcftools/norm": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bcftools/query": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bcftools/view": { "branch": "master", "git_sha": "1013101da4252623fd7acf19cc581bae91d4f839", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/bcftools/view/bcftools-view.diff" }, "bedtools/makewindows": { "branch": "master", "git_sha": 
"3b248b84694d1939ac4bb33df84bf6233a34d668", - "installed_by": ["vcf_phase_shapeit5"] + "installed_by": [ + "vcf_phase_shapeit5" + ] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "de45447d060b8c8b98575bc637a4a575fd0638e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gawk": { "branch": "master", "git_sha": "da4d05d04e65227d4307e87940842f1a14de62c7", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "glimpse/chunk": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": ["vcf_impute_glimpse"] + "installed_by": [ + "vcf_impute_glimpse" + ] }, "glimpse/ligate": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": ["vcf_impute_glimpse"] + "installed_by": [ + "vcf_impute_glimpse" + ] }, "glimpse/phase": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": ["vcf_impute_glimpse"] + "installed_by": [ + "vcf_impute_glimpse" + ] }, "glimpse2/chunk": { "branch": "master", "git_sha": "14ba46490cae3c78ed8e8f48d2c0f8f3be1e7c03", - "installed_by": ["multiple_impute_glimpse2"] + "installed_by": [ + "multiple_impute_glimpse2" + ] }, "glimpse2/concordance": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "glimpse2/ligate": { "branch": "master", "git_sha": "ee7fee68281944b002bd27a8ff3f19200b4d3fad", - "installed_by": ["multiple_impute_glimpse2"] + "installed_by": [ + "multiple_impute_glimpse2" + ] }, "glimpse2/phase": { "branch": "master", "git_sha": "9c71d32e372650e8bb3e1fb15339017aad5e3f7f", - "installed_by": ["multiple_impute_glimpse2"] + "installed_by": [ + "multiple_impute_glimpse2" + ] }, "glimpse2/splitreference": { "branch": "master", "git_sha": "fa12139827a18b324bd63fce654818586a8e9cc7", - "installed_by": ["multiple_impute_glimpse2"] + "installed_by": [ + "multiple_impute_glimpse2" + ] }, 
"gunzip": { "branch": "master", "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "quilt/quilt": { "branch": "master", "git_sha": "46265545d61e7f482adf40de941cc9a94e479bbe", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/coverage": { "branch": "master", "git_sha": "38afbe42f7db7f19c7a89607c0a71c68f3be3131", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/samtools/coverage/samtools-coverage.diff" }, "samtools/faidx": { "branch": "master", "git_sha": "f153f1f10e1083c49935565844cccb7453021682", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/index": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/view": { "branch": "master", "git_sha": "0bd7d2333a88483aa0476acea172e9f5f6dd83bb", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/samtools/view/samtools-view.diff" }, "shapeit5/ligate": { "branch": "master", "git_sha": "dcf17cc0ed8fd5ea57e61a13e0147cddb5c1ee30", - "installed_by": ["vcf_phase_shapeit5"] + "installed_by": [ + "vcf_phase_shapeit5" + ] }, "shapeit5/phasecommon": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["vcf_phase_shapeit5"] + "installed_by": [ + "vcf_phase_shapeit5" + ] }, "stitch": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ], + "patch": "modules/nf-core/stitch/stitch.diff" }, "tabix/bgzip": { "branch": "master", "git_sha": "09d3c8c29b31a2dfd610305b10550f0e1dbcd4a9", - "installed_by": ["modules"] + "installed_by": [ + 
"modules" + ] }, "tabix/tabix": { "branch": "master", "git_sha": "9502adb23c0b97ed8e616bbbdfa73b4585aec9a1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -178,35 +239,47 @@ "multiple_impute_glimpse2": { "branch": "master", "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "vcf_impute_glimpse": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "vcf_phase_shapeit5": { "branch": "master", "git_sha": "dcf17cc0ed8fd5ea57e61a13e0147cddb5c1ee30", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/stitch/main.nf b/modules/nf-core/stitch/main.nf index 2f76987f..761f5528 100644 --- a/modules/nf-core/stitch/main.nf +++ b/modules/nf-core/stitch/main.nf @@ -8,8 +8,8 @@ process STITCH { 'biocontainers/r-stitch:1.6.10--r43h06b5641_0' }" input: - tuple val(meta) , path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen) - tuple val(meta2), path(collected_crams), path(collected_crais), path(cramlist) + tuple val(meta2), path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen) + tuple val(meta), 
path(collected_crams), path(collected_crais), path(cramlist) tuple val(meta3), path(fasta), path(fasta_fai) val seed diff --git a/modules/nf-core/stitch/stitch.diff b/modules/nf-core/stitch/stitch.diff new file mode 100644 index 00000000..345e3bd4 --- /dev/null +++ b/modules/nf-core/stitch/stitch.diff @@ -0,0 +1,16 @@ +Changes in module 'nf-core/stitch' +--- modules/nf-core/stitch/main.nf ++++ modules/nf-core/stitch/main.nf +@@ -8,8 +8,8 @@ + 'biocontainers/r-stitch:1.6.10--r43h06b5641_0' }" + + input: +- tuple val(meta) , path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen) +- tuple val(meta2), path(collected_crams), path(collected_crais), path(cramlist) ++ tuple val(meta2), path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen) ++ tuple val(meta), path(collected_crams), path(collected_crais), path(cramlist) + tuple val(meta3), path(fasta), path(fasta_fai) + val seed + + +************************************************************ From 4774722e28fd4e6cacd93563ed66d8cb25dfae21 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 25 Apr 2024 22:58:48 +0000 Subject: [PATCH 11/58] add new params to schema --- nextflow_schema.json | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index edaa7a24..c560d905 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -73,13 +73,13 @@ }, "min_val_gl": { "type": "number", - "description": "Minimum genotype likelihood probability P(G|R) in validation data. Set to zero to have no filter of if using –gt-validation", + "description": "Minimum genotype likelihood probability P(G|R) in validation data. 
Set to zero to have no filter or if using \u2013gt-validation", "default": 0.9, "pattern": "^\\d+(\\.\\d+)?$" }, "min_val_dp": { "type": "integer", - "description": "Minimum coverage in validation data. If FORMAT/DP is missing and –min_val_dp > 0, the program exits with an error. Set to zero to have no filter of if using –gt-validation", + "description": "Minimum coverage in validation data. If FORMAT/DP is missing and \u2013min_val_dp > 0, the program exits with an error. Set to zero to have no filter or if using \u2013gt-validation", "default": 5, "pattern": "^\\d+$" } @@ -428,6 +428,14 @@ "buffer": { "type": "integer", "default": 10000 + }, + "k_val": { + "type": "integer", + "default": 2 + }, + "seed": { + "type": "integer", + "default": 1 } } } From bd9d306dbc77f144b723acd2492b05ef8bb3b877 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 25 Apr 2024 23:18:57 +0000 Subject: [PATCH 12/58] add docs --- docs/output.md | 43 +++++++++++++++++++++++++++++++------------ docs/usage.md | 25 +++++++++++++++---------- 2 files changed, 46 insertions(+), 22 deletions(-) diff --git a/docs/output.md b/docs/output.md index 7c589a4a..00a97c38 100644 --- a/docs/output.md +++ b/docs/output.md @@ -2,17 +2,14 @@ ## Introduction -## Introduction - This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. -The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
- -## Pipeline overview: QUILT imputation mode +## Pipeline overview +## QUILT imputation mode The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: @@ -21,20 +18,18 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Convert](#convert) - Convert reference panel to .hap and .legend files - [QUILT](#quilt) - Perform imputation - [Concatenate](#concatenate) - Concatenate all imputed chunks into a single VCF. -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution + ### Glimpse Chunk -- `quilt_impute/glimpse/` +- `imputation/glimpse_chunk/` - `*.txt`: TXT file containing the chunks obtained from running Glimpse chunks. [Glimpse chunk](https://odelaneau.github.io/GLIMPSE/) defines chunks where to run imputation. For further reading and documentation see the [Glimpse documentation](https://odelaneau.github.io/GLIMPSE/glimpse1/commands.html). Once that you have generated the chunks for your reference panel, you can skip the reference preparation step and directly submit this file for imputation. ### Convert -- `quilt_impute/bcftools/convert/` +- `imputation/bcftools/convert/` - `*.hap`: a .hap file for the reference panel. - `*.legend*`: a .legend file for the reference panel. @@ -42,7 +37,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d ### QUILT -- `quilt_impute/quilt/` +- `imputation/quilt/` - `quilt.*.vcf.gz`: Imputed VCF for a specific chunk. - `quilt.*.vcf.gz.tbi`: TBI for the Imputed VCF for a specific chunk. 
@@ -50,11 +45,35 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d ### Concat -- `quilt_impute/bcftools/concat` +- `imputation/bcftools/concat` - `.*.vcf.gz`: Imputed and ligated VCF for all the input samples. [bcftools concat](https://samtools.github.io/bcftools/bcftools.html) will produce a single VCF from a list of imputed VCFs in chunks. +## STITCH imputation mode + +The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: + +- [Remove Multiallelics](#multiallelics) - Remove multiallelic sites +- [STITCH](#quilt) - Perform imputation +- [Concatenate](#concatenate) - Concatenate all imputed chunks into a single VCF + +### Concat + +- `imputation/bcftools/concat` +- `.*.vcf.gz`: Imputed and concatenated VCF for all the input samples. + +[bcftools concat](https://samtools.github.io/bcftools/bcftools.html) will produce a single VCF from a list of imputed VCFs. + + +## Reports + +Reports contain useful metrics and pipeline information for the different modes. + +- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline +- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution + ### MultiQC
diff --git a/docs/usage.md b/docs/usage.md index 1fd90ac8..15006a19 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -4,17 +4,10 @@ > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ -## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/phaseimpute/usage](https://nf-co.re/phaseimpute/usage) - -> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ - ## Introduction -## Samplesheet input - - ## Samplesheet input @@ -135,14 +128,26 @@ You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-c ### Imputation modes -You can choose different software to perform the imputation. +You can choose different software to perform the imputation. In the following sections, the typical commands for running the pipeline with each software are included. + #### QUILT -The typical command for running the pipeline with this software is as follows: +```bash +nextflow run nf-core/phaseimpute --input samplesheet.csv --panel samplesheet_reference.csv --step impute --tools quilt --outdir results --genome GRCh37 -profile docker +``` + +#### STITCH + +```bash +nextflow run nf-core/phaseimpute --input samplesheet.csv --step impute --tools stitch --outdir results --genome GRCh37 -profile docker +``` +Notice that no reference panel is needed when running STITCH.
+ +#### GLIMPSE1 ```bash -nextflow run nf-core/phaseimpute --input ./samplesheet.csv --panel ./samplesheet_reference.csv --step impute --tool quilt --outdir ./results --genome GRCh37 -profile docker +nextflow run nf-core/phaseimpute --input samplesheet.csv --panel samplesheet_reference.csv --step impute --tools glimpse1 --outdir results --genome GRCh37 -profile docker ``` ### Updating the pipeline From eaca7f17ff3ee9bbec201987a2a0da81783cb597 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:52:35 +0000 Subject: [PATCH 13/58] separate get panel in normalization subworkflow --- .../vcf_normalize_bcftools.nf | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf diff --git a/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf b/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf new file mode 100644 index 00000000..4746be54 --- /dev/null +++ b/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf @@ -0,0 +1,40 @@ +include { BCFTOOLS_NORM } from '../../../modules/nf-core/bcftools/norm/main' +include { BCFTOOLS_VIEW } from '../../../modules/nf-core/bcftools/view/main' +include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2} from '../../../modules/nf-core/bcftools/index/main' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_3} from '../../../modules/nf-core/bcftools/index/main' + + +workflow VCF_NORMALIZE_BCFTOOLS { + take: + ch_vcf // channel: [ [id, chr], vcf, index ] + ch_fasta // channel: [ [genome], fasta, fai ] + + main: + + ch_versions = Channel.empty() + ch_fasta = ch_fasta.map { meta, fasta, fai -> [meta, fasta] } + + // Join duplicated biallelic sites into multiallelic records + BCFTOOLS_NORM(ch_vcf, ch_fasta) + + // Index multiallelic VCF + BCFTOOLS_INDEX(BCFTOOLS_NORM.out.vcf) + + // Join
multiallelic VCF and TBI + ch_multiallelic_vcf_tbi = BCFTOOLS_NORM.out.vcf.join(BCFTOOLS_INDEX.out.tbi) + + // Remove all multiallelic records: + BCFTOOLS_VIEW(ch_multiallelic_vcf_tbi, [], [], []) + + // Index biallelic VCF + BCFTOOLS_INDEX_2(BCFTOOLS_VIEW.out.vcf) + + // Join biallelic VCF and TBI + ch_biallelic_vcf_tbi = BCFTOOLS_VIEW.out.vcf.join(BCFTOOLS_INDEX_2.out.tbi) + ch_biallelic_vcf_tbi.dump(tag:"ch_biallelic_vcf_tbi") + + emit: + vcf_tbi = ch_biallelic_vcf_tbi + versions = ch_versions // channel: [ versions.yml ] +} From bb4af56771704e38aa2ce1d56741de7f04817175 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:52:53 +0000 Subject: [PATCH 14/58] separate get panel in channel prep subworkflow --- .../local/panel_prepare_channels/main.nf | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 subworkflows/local/panel_prepare_channels/main.nf diff --git a/subworkflows/local/panel_prepare_channels/main.nf b/subworkflows/local/panel_prepare_channels/main.nf new file mode 100644 index 00000000..95882870 --- /dev/null +++ b/subworkflows/local/panel_prepare_channels/main.nf @@ -0,0 +1,47 @@ +include { VCF_CONCATENATE_BCFTOOLS as CONCAT_PANEL } from '../../../subworkflows/local/vcf_concatenate_bcftools' + +workflow PANEL_PREPARE_CHANNELS { + take: + ch_panel_norm // channel: [ [id, chr], vcf, index ] + ch_panel_sites + ch_panel_tsv + ch_panel_phased + + main: + + ch_versions = Channel.empty() + + ch_panel = ch_panel_norm + .combine(ch_panel_sites, by: 0) + .combine(ch_panel_tsv, by: 0) + .combine(ch_panel_phased, by: 0) + .map{ metaIC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index + -> [[panel:metaIC.id, chr:metaIC.chr ], norm, n_index, sites, s_index, tsv, t_index, phased, p_index] + } + + + ch_panel_sites_tsv = ch_panel + .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index + -> [metaPC, sites, tsv] + } + CONCAT_PANEL(ch_panel + .map{ 
metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index + -> [[id:metaPC.panel], sites, s_index] + } + ) + ch_panel_sites = CONCAT_PANEL.out.vcf_tbi_join + + ch_panel_phased = ch_panel_phased + .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index + -> [metaPC, phased, p_index] + } + + + emit: + panel = ch_panel + ch_panel_sites + ch_panel_phased + ch_panel_sites_tsv + + versions = ch_versions // channel: [ versions.yml ] +} From f5a712f281548c4077f84dda64b36f05930963f2 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:53:08 +0000 Subject: [PATCH 15/58] separate get panel in phasing subworkflow --- subworkflows/local/vcf_phase_panel/main.nf | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 subworkflows/local/vcf_phase_panel/main.nf diff --git a/subworkflows/local/vcf_phase_panel/main.nf b/subworkflows/local/vcf_phase_panel/main.nf new file mode 100644 index 00000000..a54ca8e6 --- /dev/null +++ b/subworkflows/local/vcf_phase_panel/main.nf @@ -0,0 +1,40 @@ +include { VCF_PHASE_SHAPEIT5 } from '../../../subworkflows/nf-core/vcf_phase_shapeit5/main' + +workflow VCF_PHASE_PANEL { + take: + ch_vcf // channel: [ [id, chr], vcf, index ] + ch_panel_norm + ch_panel_sites + ch_panel_tsv + + main: + + ch_versions = Channel.empty() + + // Phase panel + if (params.phased == false) { + VCF_PHASE_SHAPEIT5(ch_vcf + .map { meta, vcf, csi -> [meta, vcf, csi, [], meta.region] }, + Channel.of([[],[],[]]).collect(), + Channel.of([[],[],[]]).collect(), + Channel.of([[],[]]).collect()) + ch_versions = ch_versions.mix(VCF_PHASE_SHAPEIT5.out.versions) + ch_panel_phased = VCF_PHASE_SHAPEIT5.out.variants_phased + .combine(VCF_PHASE_SHAPEIT5.out.variants_index, by: 0) + } else { + ch_panel_phased = ch_vcf + } + + ch_panel = ch_panel_norm + .combine(ch_panel_sites, by: 0) + .combine(ch_panel_tsv, by: 0) + .combine(ch_panel_phased, by: 0) + .map{ metaIC, norm, 
n_index, sites, s_index, tsv, t_index, phased, p_index + -> [[panel:metaIC.id, chr:metaIC.chr ], norm, n_index, sites, s_index, tsv, t_index, phased, p_index] + } + + emit: + vcf_tbi = ch_panel_phased + panel = ch_panel + versions = ch_versions // channel: [ versions.yml ] +} From 321248a711258f8c97149bef7e7d186d8f0b5b1b Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:53:21 +0000 Subject: [PATCH 16/58] separate get panel in sites extraction subworkflow --- .../local/vcf_sites_extract_bcftools/main.nf | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 subworkflows/local/vcf_sites_extract_bcftools/main.nf diff --git a/subworkflows/local/vcf_sites_extract_bcftools/main.nf b/subworkflows/local/vcf_sites_extract_bcftools/main.nf new file mode 100644 index 00000000..0d5396a5 --- /dev/null +++ b/subworkflows/local/vcf_sites_extract_bcftools/main.nf @@ -0,0 +1,71 @@ +include { BCFTOOLS_VIEW as VIEW_VCF_SNPS } from '../../../modules/nf-core/bcftools/view/main.nf' +include { BCFTOOLS_VIEW as VIEW_VCF_SITES } from '../../../modules/nf-core/bcftools/view/main.nf' +include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main.nf' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2 } from '../../../modules/nf-core/bcftools/index/main.nf' +include { BCFTOOLS_INDEX as VCF_INDEX5 } from '../../../modules/nf-core/bcftools/index/main.nf' +include { TABIX_BGZIP } from '../../../modules/nf-core/tabix/bgzip/main' +include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main' +include { BCFTOOLS_QUERY } from '../../../modules/nf-core/bcftools/query/main.nf' + + + +workflow VCF_SITES_EXTRACT_BCFTOOLS { + take: + ch_vcf // channel: [ [id, chr], vcf, index ] + + main: + + ch_versions = Channel.empty() + + // Extract only SNPs from VCF + VIEW_VCF_SNPS(ch_vcf, [], [], []) + ch_versions = ch_versions.mix(VIEW_VCF_SNPS.out.versions.first()) + + // Index SNPs + 
BCFTOOLS_INDEX(VIEW_VCF_SNPS.out.vcf) + ch_versions = ch_versions.mix(BCFTOOLS_INDEX.out.versions.first()) + + // Join VCF and Index + ch_panel_norm = VIEW_VCF_SNPS.out.vcf.combine(BCFTOOLS_INDEX.out.csi, by:0) + + // Extract sites positions + VIEW_VCF_SITES( ch_panel_norm,[], [], []) + ch_versions = ch_versions.mix(VIEW_VCF_SITES.out.versions.first()) + + // Index extracted sites + BCFTOOLS_INDEX_2(VIEW_VCF_SITES.out.vcf) + ch_versions = ch_versions.mix(BCFTOOLS_INDEX_2.out.versions.first()) + + // Join extracted sites and index + ch_panel_sites = VIEW_VCF_SITES.out.vcf.combine(BCFTOOLS_INDEX_2.out.csi, by:0) + + // Create empty channel + + ch_panel_tsv = [] + + // Create TSVs for different tools +// if (params.tools.contains("glimpse1")) { + + // Convert to TSV with structure for Glimpse + BCFTOOLS_QUERY(ch_panel_sites, [], [], []) + ch_versions = ch_versions.mix(BCFTOOLS_QUERY.out.versions.first()) + + // Compress TSV + TABIX_BGZIP(BCFTOOLS_QUERY.out.output) + ch_versions = ch_versions.mix(TABIX_BGZIP.out.versions.first()) + + // Index compressed TSV + TABIX_TABIX(TABIX_BGZIP.out.output) + ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first()) + + // Join compressed TSV and index + ch_panel_tsv = TABIX_BGZIP.out.output.combine(TABIX_TABIX.out.tbi, by: 0) + +// } + + emit: + panel_tsv = ch_panel_tsv + vcf_tbi = ch_panel_norm + panel_sites = ch_panel_sites + versions = ch_versions // channel: [ versions.yml ] +} From 392f948147f99eea27ec7324e50584279efebb01 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:53:34 +0000 Subject: [PATCH 17/58] add new subworkflows for get panel --- workflows/phaseimpute/main.nf | 46 +++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 47965904..7b219e26 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -27,6 
+27,11 @@ include { VCF_CONCORDANCE_GLIMPSE2 } from '../../subworkflows/ include { VCF_CHR_CHECK } from '../../subworkflows/local/vcf_chr_check' include { GET_PANEL } from '../../subworkflows/local/get_panel' +include { VCF_NORMALIZE_BCFTOOLS } from '../../subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools' +include { PANEL_PREPARE_CHANNELS } from '../../subworkflows/local/panel_prepare_channels' +include { VCF_SITES_EXTRACT_BCFTOOLS } from '../../subworkflows/local/vcf_sites_extract_bcftools' +include { VCF_PHASE_PANEL } from '../../subworkflows/local/vcf_phase_panel' + include { MAKE_CHUNKS } from '../../subworkflows/local/make_chunks/make_chunks' include { IMPUTE_QUILT } from '../../subworkflows/local/impute_quilt/impute_quilt' @@ -104,32 +109,41 @@ workflow PHASEIMPUTE { // Prepare panel // if (params.step == 'impute' || params.step == 'panel_prep' || params.step == 'validate' || params.step == 'all') { - // Remove if necessary "chr" + // Check chr prefix and remove if necessary VCF_CHR_CHECK(ch_panel, ch_fasta) ch_versions = ch_versions.mix(VCF_CHR_CHECK.out.versions) - // Prepare the panel - GET_PANEL(VCF_CHR_CHECK.out.vcf, ch_fasta) - ch_versions = ch_versions.mix(GET_PANEL.out.versions) - ch_panel_sites_tsv = GET_PANEL.out.panel - .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index - -> [metaPC, sites, tsv] - } - CONCAT_PANEL(GET_PANEL.out.panel - .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index - -> [[id:metaPC.panel], sites, s_index] - } + // Normalize indels in panel + VCF_NORMALIZE_BCFTOOLS(VCF_CHR_CHECK.out.vcf, ch_fasta) + + // Extract sites from normalized vcf + VCF_SITES_EXTRACT_BCFTOOLS(VCF_NORMALIZE_BCFTOOLS.out.vcf_tbi) + + // Phase panel + VCF_PHASE_PANEL(VCF_SITES_EXTRACT_BCFTOOLS.out.vcf_tbi, + VCF_SITES_EXTRACT_BCFTOOLS.out.vcf_tbi, + VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites, + VCF_SITES_EXTRACT_BCFTOOLS.out.panel_tsv) + VCF_PHASE_PANEL.out.panel.dump(tag:"VCF_PHASE_PANEL") + + 
// Generate channels (to be simplified) + ch_panel_sites_tsv = VCF_PHASE_PANEL.out.panel + .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index + -> [metaPC, sites, tsv] + } + CONCAT_PANEL(VCF_PHASE_PANEL.out.panel + .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index + -> [[id:metaPC.panel], sites, s_index] + } ) ch_panel_sites = CONCAT_PANEL.out.vcf_tbi_join ch_versions = ch_versions.mix(CONCAT_PANEL.out.versions) - ch_panel_phased = GET_PANEL.out.panel + ch_panel_phased = VCF_PHASE_PANEL.out.panel .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index -> [metaPC, phased, p_index] } - ch_versions = ch_versions.mix(GET_PANEL.out.versions) - if (params.step == 'impute' || params.step == 'all') { // Output channel of input process ch_impute_output = Channel.empty() @@ -178,7 +192,7 @@ workflow PHASEIMPUTE { print("Impute with STITCH") // Prepare inputs - PREPARE_INPUT_STITCH(GET_PANEL.out.panel_sites, ch_fasta, ch_input_impute) + PREPARE_INPUT_STITCH(VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites, ch_fasta, ch_input_impute) // Impute with STITCH BAM_IMPUTE_STITCH ( PREPARE_INPUT_STITCH.out.stitch_parameters, From 1617cdc4c373827d9e6ee0cc1d20bebf615b83a2 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:53:45 +0000 Subject: [PATCH 18/58] add configs for new subworkflows --- conf/steps/panel_prep.config | 90 ++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index 5eec78ce..c4d75ad2 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -115,3 +115,93 @@ process { ext.args = "--tbi" } } + +process { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHR_CHECK:.*' { + publishDir = [ + path: { "${params.outdir}/prep_panel/" }, + mode: params.publish_dir_mode, + enabled: false + ] + } + + withName: 
'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_NORM' { + ext.args = '-m +any --no-version --output-type z' + ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } + cpus = 2 + memory = 400.MB + maxRetries = 2 + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX' { + ext.args = "--tbi" + publishDir = [enabled: false] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_VIEW' { + ext.args = '-v snps -Oz' + ext.prefix = { "${meta.id}_${meta.chr}_biallelic" } + cpus = 2 + memory = 400.MB + maxRetries = 2 + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX_2' { + ext.args = '--tbi' + cpus = 2 + memory = 400.MB + maxRetries = 2 + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:VIEW_VCF_SNPS' { + ext.args = [ + "-m 2", + "-M 2", + "-v snps", + "--output-type z", + "--no-version" + ].join(' ') + ext.prefix = { "${meta.id}_SNPS" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:VIEW_VCF_SITES' { + ext.args = [ + "-G", + "-m 2", + "-M 2", + "-v snps", + "--output-type z", + "--no-version" + ].join(' ') + ext.prefix = { "${meta.id}_C${meta.chr}_SITES" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:BCFTOOLS_QUERY' { + ext.args = [ + "-f'%CHROM\t%POS\t%REF,%ALT\\n'", + ].join(' ') + ext.prefix = { "${meta.id}_glimpse_SITES_TSV" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:TABIX_TABIX' { + ext.args = [ + "-s1", + "-b2", + "-e2" + ].join(' ') + ext.prefix = { "${meta.id}_glimpse_SITES_TSV" } + } + + // Phasing + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_PANEL:VCF_PHASE_SHAPEIT5:BEDTOOLS_MAKEWINDOWS' { + ext.args = [ + '-w 60000', + '-s 40000' + ].join(' ') + ext.prefix = { "${meta.id}_chunks" } + publishDir = [ + enabled: false + ] + } + +} From e791828509f8ccf715a88c196a8d3a0a3924531d Mon Sep 17 00:00:00 2001 From: Anabella Trigila 
<18577080+atrigila@users.noreply.github.com> Date: Fri, 26 Apr 2024 17:02:43 +0000 Subject: [PATCH 19/58] fix linting --- modules.json | 146 ++++++++------------------------ modules/nf-core/stitch/meta.yml | 2 - 2 files changed, 37 insertions(+), 111 deletions(-) diff --git a/modules.json b/modules.json index 05427c6f..53279d66 100644 --- a/modules.json +++ b/modules.json @@ -8,25 +8,19 @@ "bcftools/annotate": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/bcftools/annotate/bcftools-annotate.diff" }, "bcftools/concat": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/bcftools/concat/bcftools-concat.diff" }, "bcftools/convert": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bcftools/index": { "branch": "master", @@ -41,196 +35,142 @@ "bcftools/mpileup": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/bcftools/mpileup/bcftools-mpileup.diff" }, "bcftools/norm": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bcftools/query": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bcftools/view": { "branch": "master", "git_sha": "1013101da4252623fd7acf19cc581bae91d4f839", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/bcftools/view/bcftools-view.diff" }, "bedtools/makewindows": { "branch": "master", "git_sha": "3b248b84694d1939ac4bb33df84bf6233a34d668", - "installed_by": [ - 
"vcf_phase_shapeit5" - ] + "installed_by": ["vcf_phase_shapeit5"] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "de45447d060b8c8b98575bc637a4a575fd0638e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "gawk": { "branch": "master", "git_sha": "da4d05d04e65227d4307e87940842f1a14de62c7", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "glimpse/chunk": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": [ - "vcf_impute_glimpse" - ] + "installed_by": ["vcf_impute_glimpse"] }, "glimpse/ligate": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": [ - "vcf_impute_glimpse" - ] + "installed_by": ["vcf_impute_glimpse"] }, "glimpse/phase": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": [ - "vcf_impute_glimpse" - ] + "installed_by": ["vcf_impute_glimpse"] }, "glimpse2/chunk": { "branch": "master", "git_sha": "14ba46490cae3c78ed8e8f48d2c0f8f3be1e7c03", - "installed_by": [ - "multiple_impute_glimpse2" - ] + "installed_by": ["multiple_impute_glimpse2"] }, "glimpse2/concordance": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "glimpse2/ligate": { "branch": "master", "git_sha": "ee7fee68281944b002bd27a8ff3f19200b4d3fad", - "installed_by": [ - "multiple_impute_glimpse2" - ] + "installed_by": ["multiple_impute_glimpse2"] }, "glimpse2/phase": { "branch": "master", "git_sha": "9c71d32e372650e8bb3e1fb15339017aad5e3f7f", - "installed_by": [ - "multiple_impute_glimpse2" - ] + "installed_by": ["multiple_impute_glimpse2"] }, "glimpse2/splitreference": { "branch": "master", "git_sha": "fa12139827a18b324bd63fce654818586a8e9cc7", - "installed_by": [ - "multiple_impute_glimpse2" - ] + "installed_by": ["multiple_impute_glimpse2"] }, "gunzip": { "branch": "master", "git_sha": 
"3a5fef109d113b4997c9822198664ca5f2716208", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "quilt/quilt": { "branch": "master", "git_sha": "46265545d61e7f482adf40de941cc9a94e479bbe", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/coverage": { "branch": "master", "git_sha": "38afbe42f7db7f19c7a89607c0a71c68f3be3131", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/samtools/coverage/samtools-coverage.diff" }, "samtools/faidx": { "branch": "master", "git_sha": "f153f1f10e1083c49935565844cccb7453021682", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/index": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/view": { "branch": "master", "git_sha": "0bd7d2333a88483aa0476acea172e9f5f6dd83bb", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/samtools/view/samtools-view.diff" }, "shapeit5/ligate": { "branch": "master", "git_sha": "dcf17cc0ed8fd5ea57e61a13e0147cddb5c1ee30", - "installed_by": [ - "vcf_phase_shapeit5" - ] + "installed_by": ["vcf_phase_shapeit5"] }, "shapeit5/phasecommon": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "vcf_phase_shapeit5" - ] + "installed_by": ["vcf_phase_shapeit5"] }, "stitch": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/stitch/stitch.diff" }, "tabix/bgzip": { "branch": "master", "git_sha": "09d3c8c29b31a2dfd610305b10550f0e1dbcd4a9", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tabix/tabix": { "branch": 
"master", "git_sha": "9502adb23c0b97ed8e616bbbdfa73b4585aec9a1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } }, @@ -239,47 +179,35 @@ "multiple_impute_glimpse2": { "branch": "master", "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "vcf_impute_glimpse": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "vcf_phase_shapeit5": { "branch": "master", "git_sha": "dcf17cc0ed8fd5ea57e61a13e0147cddb5c1ee30", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] } } } } } -} \ No newline at end of file +} diff --git a/modules/nf-core/stitch/meta.yml b/modules/nf-core/stitch/meta.yml index a36d61cd..cb214af8 100644 --- a/modules/nf-core/stitch/meta.yml +++ b/modules/nf-core/stitch/meta.yml @@ -1,5 +1,3 @@ ---- -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: "stitch" description: "STITCH is an R program for reference panel free, read aware, low coverage sequencing genotype imputation. STITCH runs on a set of samples with sequencing reads in BAM format, as well as a list of positions to genotype, and outputs imputed genotypes in VCF format." 
keywords: From 1b9d87312fdb0907f4d558d2b01cf33621cd45ef Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 26 Apr 2024 17:07:10 +0000 Subject: [PATCH 20/58] fix pre commit issue --- docs/output.md | 3 +-- docs/usage.md | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/output.md b/docs/output.md index 00a97c38..97d7d4d7 100644 --- a/docs/output.md +++ b/docs/output.md @@ -9,6 +9,7 @@ The directories listed below will be created in the results directory after the ## Pipeline overview + ## QUILT imputation mode The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: @@ -19,7 +20,6 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [QUILT](#quilt) - Perform imputation - [Concatenate](#concatenate) - Concatenate all imputed chunks into a single VCF. - ### Glimpse Chunk - `imputation/glimpse_chunk/` @@ -65,7 +65,6 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d [bcftools concat](https://samtools.github.io/bcftools/bcftools.html) will produce a single VCF from a list of imputed VCFs. - ## Reports Reports contain useful metrics and pipeline information for the different modes. diff --git a/docs/usage.md b/docs/usage.md index 15006a19..6232bc5d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,7 +8,6 @@ - ## Samplesheet input You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. @@ -130,7 +129,6 @@ You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-c You can choose different software to perform the imputation. In the following sections, the typical commands for running the pipeline with each software are included. 
- #### QUILT ```bash @@ -142,6 +140,7 @@ nextflow run nf-core/phaseimpute --input samplesheet.csv --panel samplesheet_ref ```bash nextflow run nf-core/phaseimpute --input samplesheet.csv --step impute --tool stitch --outdir results --genome GRCh37 -profile docker ``` + Notice that no reference panel is needed when running STITCH. #### GLIMPSE1 From 4781c1a7d5cabd9ca63fbb8fed401257689abe75 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 26 Apr 2024 17:28:31 +0000 Subject: [PATCH 21/58] fix linting --- modules.json | 146 +++++++++++++++++++++-------- modules/nf-core/stitch/stitch.diff | 9 ++ 2 files changed, 118 insertions(+), 37 deletions(-) diff --git a/modules.json b/modules.json index 53279d66..05427c6f 100644 --- a/modules.json +++ b/modules.json @@ -8,19 +8,25 @@ "bcftools/annotate": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/bcftools/annotate/bcftools-annotate.diff" }, "bcftools/concat": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/bcftools/concat/bcftools-concat.diff" }, "bcftools/convert": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bcftools/index": { "branch": "master", @@ -35,142 +41,196 @@ "bcftools/mpileup": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/bcftools/mpileup/bcftools-mpileup.diff" }, "bcftools/norm": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bcftools/query": { "branch": "master", "git_sha": 
"44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bcftools/view": { "branch": "master", "git_sha": "1013101da4252623fd7acf19cc581bae91d4f839", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/bcftools/view/bcftools-view.diff" }, "bedtools/makewindows": { "branch": "master", "git_sha": "3b248b84694d1939ac4bb33df84bf6233a34d668", - "installed_by": ["vcf_phase_shapeit5"] + "installed_by": [ + "vcf_phase_shapeit5" + ] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "de45447d060b8c8b98575bc637a4a575fd0638e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gawk": { "branch": "master", "git_sha": "da4d05d04e65227d4307e87940842f1a14de62c7", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "glimpse/chunk": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": ["vcf_impute_glimpse"] + "installed_by": [ + "vcf_impute_glimpse" + ] }, "glimpse/ligate": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": ["vcf_impute_glimpse"] + "installed_by": [ + "vcf_impute_glimpse" + ] }, "glimpse/phase": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": ["vcf_impute_glimpse"] + "installed_by": [ + "vcf_impute_glimpse" + ] }, "glimpse2/chunk": { "branch": "master", "git_sha": "14ba46490cae3c78ed8e8f48d2c0f8f3be1e7c03", - "installed_by": ["multiple_impute_glimpse2"] + "installed_by": [ + "multiple_impute_glimpse2" + ] }, "glimpse2/concordance": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "glimpse2/ligate": { "branch": "master", "git_sha": "ee7fee68281944b002bd27a8ff3f19200b4d3fad", - "installed_by": ["multiple_impute_glimpse2"] + "installed_by": [ + "multiple_impute_glimpse2" + ] }, 
"glimpse2/phase": { "branch": "master", "git_sha": "9c71d32e372650e8bb3e1fb15339017aad5e3f7f", - "installed_by": ["multiple_impute_glimpse2"] + "installed_by": [ + "multiple_impute_glimpse2" + ] }, "glimpse2/splitreference": { "branch": "master", "git_sha": "fa12139827a18b324bd63fce654818586a8e9cc7", - "installed_by": ["multiple_impute_glimpse2"] + "installed_by": [ + "multiple_impute_glimpse2" + ] }, "gunzip": { "branch": "master", "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "quilt/quilt": { "branch": "master", "git_sha": "46265545d61e7f482adf40de941cc9a94e479bbe", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/coverage": { "branch": "master", "git_sha": "38afbe42f7db7f19c7a89607c0a71c68f3be3131", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/samtools/coverage/samtools-coverage.diff" }, "samtools/faidx": { "branch": "master", "git_sha": "f153f1f10e1083c49935565844cccb7453021682", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/index": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/view": { "branch": "master", "git_sha": "0bd7d2333a88483aa0476acea172e9f5f6dd83bb", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/samtools/view/samtools-view.diff" }, "shapeit5/ligate": { "branch": "master", "git_sha": "dcf17cc0ed8fd5ea57e61a13e0147cddb5c1ee30", - "installed_by": ["vcf_phase_shapeit5"] + "installed_by": [ + "vcf_phase_shapeit5" + ] }, "shapeit5/phasecommon": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["vcf_phase_shapeit5"] + 
"installed_by": [ + "vcf_phase_shapeit5" + ] }, "stitch": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/stitch/stitch.diff" }, "tabix/bgzip": { "branch": "master", "git_sha": "09d3c8c29b31a2dfd610305b10550f0e1dbcd4a9", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "tabix/tabix": { "branch": "master", "git_sha": "9502adb23c0b97ed8e616bbbdfa73b4585aec9a1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -179,35 +239,47 @@ "multiple_impute_glimpse2": { "branch": "master", "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "vcf_impute_glimpse": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "vcf_phase_shapeit5": { "branch": "master", "git_sha": "dcf17cc0ed8fd5ea57e61a13e0147cddb5c1ee30", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/stitch/stitch.diff b/modules/nf-core/stitch/stitch.diff index 345e3bd4..eb00d43d 100644 --- a/modules/nf-core/stitch/stitch.diff +++ b/modules/nf-core/stitch/stitch.diff @@ -13,4 +13,13 @@ Changes in module 'nf-core/stitch' val seed +--- 
modules/nf-core/stitch/meta.yml ++++ modules/nf-core/stitch/meta.yml +@@ -1,5 +1,3 @@ +---- +-# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json + name: "stitch" + description: "STITCH is an R program for reference panel free, read aware, low coverage sequencing genotype imputation. STITCH runs on a set of samples with sequencing reads in BAM format, as well as a list of positions to genotype, and outputs imputed genotypes in VCF format." + keywords: + ************************************************************ From 4a2a0c30f76cc1980aff5f54e09b24a0fa911589 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:07:03 +0000 Subject: [PATCH 22/58] add versions and options with posfile for stitch --- workflows/phaseimpute/main.nf | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 7b219e26..29cb53c7 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -147,7 +147,7 @@ workflow PHASEIMPUTE { if (params.step == 'impute' || params.step == 'all') { // Output channel of input process ch_impute_output = Channel.empty() - if (params.tools.contains("glimpse1")) { + if (params.tools.split(',').contains("glimpse1")) { println "Impute with Glimpse1" // Glimpse1 subworkflow GL_INPUT( // Compute GL for input data once per panel @@ -183,28 +183,40 @@ workflow PHASEIMPUTE { // Add to output channel ch_impute_output = ch_impute_output.mix(output_glimpse1) } - if (params.tools.contains("glimpse2")) { + if (params.tools.split(',').contains("glimpse2")) { error "Glimpse2 not yet implemented" // Glimpse2 subworkflow } - if (params.tools.contains("stitch")) { + if (params.tools.split(',').contains("stitch")) { print("Impute with STITCH") + ch_panel_sites = [] + // Obtain the user's posfile if provided or calculate it from ref panel 
file + if (params.posfile) { + ch_panel_sites = params.posfile + } else if (params.panel) { + // It should do all the panelprep functions if a panel is provided + ch_panel_sites = VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites + } else { + error "No posfile or reference panel was included" + } // Prepare inputs - PREPARE_INPUT_STITCH(VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites, ch_fasta, ch_input_impute) + PREPARE_INPUT_STITCH(ch_panel_sites, ch_fasta, ch_input_impute) + ch_versions = ch_versions.mix(PREPARE_INPUT_STITCH.out.versions) // Impute with STITCH BAM_IMPUTE_STITCH ( PREPARE_INPUT_STITCH.out.stitch_parameters, PREPARE_INPUT_STITCH.out.stitch_samples, ch_fasta ) + ch_versions = ch_versions.mix(BAM_IMPUTE_STITCH.out.versions) // Output channel to concat ch_impute_output = ch_impute_output.mix(BAM_IMPUTE_STITCH.out.vcf_tbi) } - if (params.tools.contains("quilt")) { + if (params.tools.split(',').contains("quilt")) { print("Impute with QUILT") // Quilt subworkflow From 52d05637cba9e06ad13fa1184d167c76c3f052e3 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:07:18 +0000 Subject: [PATCH 23/58] explain stitch --- docs/usage.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 6232bc5d..d82bfc06 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -125,7 +125,7 @@ genome: 'GRCh37' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). -### Imputation modes +### Imputation tools `--mode impute --tools` You can choose different software to perform the imputation. In the following sections, the typical commands for running the pipeline with each software are included. 
@@ -137,11 +137,19 @@ nextflow run nf-core/phaseimpute --input samplesheet.csv --panel samplesheet_ref #### STITCH +[STITCH](https://github.com/rwdavies/STITCH) is an R program for low coverage sequencing genotype imputation without using a reference panel. The required inputs for this program are bam samples provided in the input samplesheet (`--input`) and a tsv file with the list of positions to genotype (`--posfile`). + +If you do not have a list of positions to genotype, you can provide a reference panel and run `--step panelprep`, which produces a tsv with this list. + ```bash -nextflow run nf-core/phaseimpute --input samplesheet.csv --step impute --tool stitch --outdir results --genome GRCh37 -profile docker +nextflow run nf-core/phaseimpute --input samplesheet.csv --step panelprep --panel samplesheet_reference.csv --outdir results --genome GRCh37 -profile docker ``` -Notice that no reference panel is needed when running STITCH. +Otherwise, you can provide your own position file in `--step impute` with STITCH using the `--posfile` parameter.
+ +```bash +nextflow run nf-core/phaseimpute --input samplesheet.csv --step impute --posfile posfile.txt --tool stitch --outdir results --genome GRCh37 -profile docker +``` #### GLIMPSE1 From b6f0c6336b2867967d0ad3f885cea93684ce4709 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:33:38 +0000 Subject: [PATCH 24/58] order subworkflows and remove dumps --- workflows/phaseimpute/main.nf | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 29cb53c7..e57c1671 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -18,29 +18,37 @@ include { getAllFilesExtension } from '../../subworkflows/local/utils_nfc // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { VCF_IMPUTE_GLIMPSE as VCF_IMPUTE_GLIMPSE1 } from '../../subworkflows/nf-core/vcf_impute_glimpse' +// Simulate subworkflows include { BAM_REGION } from '../../subworkflows/local/bam_region' include { BAM_DOWNSAMPLE } from '../../subworkflows/local/bam_downsample' -include { COMPUTE_GL as GL_TRUTH } from '../../subworkflows/local/compute_gl' -include { COMPUTE_GL as GL_INPUT } from '../../subworkflows/local/compute_gl' -include { VCF_CONCORDANCE_GLIMPSE2 } from '../../subworkflows/local/vcf_concordance_glimpse2' -include { VCF_CHR_CHECK } from '../../subworkflows/local/vcf_chr_check' -include { GET_PANEL } from '../../subworkflows/local/get_panel' +// Panelprep subworkflows +include { VCF_CHR_CHECK } from '../../subworkflows/local/vcf_chr_check' include { VCF_NORMALIZE_BCFTOOLS } from '../../subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools' include { PANEL_PREPARE_CHANNELS } from '../../subworkflows/local/panel_prepare_channels' include { VCF_SITES_EXTRACT_BCFTOOLS } from '../../subworkflows/local/vcf_sites_extract_bcftools' include { VCF_PHASE_PANEL } from 
'../../subworkflows/local/vcf_phase_panel' +// GLIMPSE subworkflows +include { VCF_IMPUTE_GLIMPSE as VCF_IMPUTE_GLIMPSE1 } from '../../subworkflows/nf-core/vcf_impute_glimpse' +include { COMPUTE_GL as GL_TRUTH } from '../../subworkflows/local/compute_gl' +include { COMPUTE_GL as GL_INPUT } from '../../subworkflows/local/compute_gl' +// QUILT subworkflows include { MAKE_CHUNKS } from '../../subworkflows/local/make_chunks/make_chunks' include { IMPUTE_QUILT } from '../../subworkflows/local/impute_quilt/impute_quilt' + +// STITCH subworkflows +include { PREPARE_INPUT_STITCH } from '../../subworkflows/local/prepare_input_stitch/prepare_input_stitch' +include { BAM_IMPUTE_STITCH } from '../../subworkflows/local/bam_impute_stitch/bam_impute_stitch' + +// CONCAT subworkflows include { VCF_CONCATENATE_BCFTOOLS as CONCAT_IMPUT } from '../../subworkflows/local/vcf_concatenate_bcftools' include { VCF_CONCATENATE_BCFTOOLS as CONCAT_TRUTH } from '../../subworkflows/local/vcf_concatenate_bcftools' include { VCF_CONCATENATE_BCFTOOLS as CONCAT_PANEL } from '../../subworkflows/local/vcf_concatenate_bcftools' -include { PREPARE_INPUT_STITCH } from '../../subworkflows/local/prepare_input_stitch/prepare_input_stitch' -include { BAM_IMPUTE_STITCH } from '../../subworkflows/local/bam_impute_stitch/bam_impute_stitch' +// Concordance subworkflows +include { VCF_CONCORDANCE_GLIMPSE2 } from '../../subworkflows/local/vcf_concordance_glimpse2' /* @@ -124,7 +132,6 @@ workflow PHASEIMPUTE { VCF_SITES_EXTRACT_BCFTOOLS.out.vcf_tbi, VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites, VCF_SITES_EXTRACT_BCFTOOLS.out.panel_tsv) - VCF_PHASE_PANEL.out.panel.dump(tag:"VCF_PHASE_PANEL") // Generate channels (to be simplified) ch_panel_sites_tsv = VCF_PHASE_PANEL.out.panel From 589c874ddacbc49010639667795dd7367a8f66f3 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:39:52 +0000 Subject: [PATCH 25/58] add test stitch --- .github/workflows/ci.yml 
| 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4636b9f1..8780f435 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,6 +31,7 @@ jobs: - "test" - "test_sim" - "test_quilt" + - "test_stitch" steps: - name: Check out pipeline code uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 From 14b5a996fe71b6ab2f16dd57b6092b32d152d352 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:40:29 +0000 Subject: [PATCH 26/58] remove dump --- .../local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf b/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf index 4746be54..f75ae347 100644 --- a/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf +++ b/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf @@ -32,7 +32,6 @@ workflow VCF_NORMALIZE_BCFTOOLS { // Join biallelic VCF and TBI ch_biallelic_vcf_tbi = BCFTOOLS_VIEW.out.vcf.join(BCFTOOLS_INDEX_2.out.tbi) - ch_biallelic_vcf_tbi.dump(tag:"ch_biallelic_vcf_tbi") emit: vcf_tbi = ch_biallelic_vcf_tbi From 19ba8dcf79a3df0ddfcc8edde45b4566666bbb26 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:40:51 +0000 Subject: [PATCH 27/58] add versions --- workflows/phaseimpute/main.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index e57c1671..227bb202 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -123,15 +123,18 @@ workflow PHASEIMPUTE { // Normalize indels in panel VCF_NORMALIZE_BCFTOOLS(VCF_CHR_CHECK.out.vcf, ch_fasta) + ch_versions = ch_versions.mix(VCF_NORMALIZE_BCFTOOLS.out.versions) // Extract sites from normalized vcf 
VCF_SITES_EXTRACT_BCFTOOLS(VCF_NORMALIZE_BCFTOOLS.out.vcf_tbi) + ch_versions = ch_versions.mix(VCF_SITES_EXTRACT_BCFTOOLS.out.versions) // Phase panel VCF_PHASE_PANEL(VCF_SITES_EXTRACT_BCFTOOLS.out.vcf_tbi, VCF_SITES_EXTRACT_BCFTOOLS.out.vcf_tbi, VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites, VCF_SITES_EXTRACT_BCFTOOLS.out.panel_tsv) + ch_versions = ch_versions.mix(VCF_PHASE_PANEL.out.versions) // Generate channels (to be simplified) ch_panel_sites_tsv = VCF_PHASE_PANEL.out.panel From f8716cf847083562f0eebaa080b8824a72b5da0b Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:58:35 +0000 Subject: [PATCH 28/58] remove unused index module --- subworkflows/local/vcf_sites_extract_bcftools/main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/subworkflows/local/vcf_sites_extract_bcftools/main.nf b/subworkflows/local/vcf_sites_extract_bcftools/main.nf index 0d5396a5..8b99a9a8 100644 --- a/subworkflows/local/vcf_sites_extract_bcftools/main.nf +++ b/subworkflows/local/vcf_sites_extract_bcftools/main.nf @@ -2,7 +2,6 @@ include { BCFTOOLS_VIEW as VIEW_VCF_SNPS } from '../../../modules/nf-cor include { BCFTOOLS_VIEW as VIEW_VCF_SITES } from '../../../modules/nf-core/bcftools/view/main.nf' include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main.nf' include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2 } from '../../../modules/nf-core/bcftools/index/main.nf' -include { BCFTOOLS_INDEX as VCF_INDEX5 } from '../../../modules/nf-core/bcftools/index/main.nf' include { TABIX_BGZIP } from '../../../modules/nf-core/tabix/bgzip/main' include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main' include { BCFTOOLS_QUERY } from '../../../modules/nf-core/bcftools/query/main.nf' From 64b99f483b140871ed10ffcfb0656625bfd82c7e Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Sat, 27 Apr 2024 21:01:52 +0000 Subject: [PATCH 29/58] remove unicode --- 
nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index c560d905..fec27056 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -73,7 +73,7 @@ }, "min_val_gl": { "type": "number", - "description": "Minimum genotype likelihood probability P(G|R) in validation data. Set to zero to have no filter of if using \u2013gt-validation", + "description": "Minimum genotype likelihood probability P(G|R) in validation data. Set to zero to have no filter of if using gt-validation", "default": 0.9, "pattern": "^\\d+(\\.\\d+)?$" }, From 83bbf71deffd02c270e8ba6457931a02f2b2c8d1 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Sun, 28 Apr 2024 00:11:26 +0000 Subject: [PATCH 30/58] reorder stitch inputs --- modules/nf-core/stitch/main.nf | 2 +- subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/nf-core/stitch/main.nf b/modules/nf-core/stitch/main.nf index 761f5528..0f8d8109 100644 --- a/modules/nf-core/stitch/main.nf +++ b/modules/nf-core/stitch/main.nf @@ -8,8 +8,8 @@ process STITCH { 'biocontainers/r-stitch:1.6.10--r43h06b5641_0' }" input: - tuple val(meta2), path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen) tuple val(meta), path(collected_crams), path(collected_crais), path(cramlist) + tuple val(meta2), path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen) tuple val(meta3), path(fasta), path(fasta_fai) val seed diff --git a/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf b/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf index 33a76b68..ea162fd0 100644 --- a/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf +++ b/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf @@ -5,7 +5,7 @@ include { 
BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/mai workflow BAM_IMPUTE_STITCH { take: - ch_parameters // channel: [ val(meta), bam, bai ] + ch_parameters ch_samples ch_fasta @@ -15,7 +15,7 @@ workflow BAM_IMPUTE_STITCH { // Run STITCH seed = params.seed - STITCH( ch_parameters, ch_samples, ch_fasta, seed ) + STITCH( ch_samples, ch_parameters, ch_fasta, seed ) // Index imputed annotated VCF BCFTOOLS_INDEX(STITCH.out.vcf) From dc6e9a418d4020a3e549c63e82abe4d02ee50738 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Sun, 28 Apr 2024 00:13:22 +0000 Subject: [PATCH 31/58] remove extra space --- subworkflows/local/vcf_concatenate_bcftools/main.nf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/subworkflows/local/vcf_concatenate_bcftools/main.nf b/subworkflows/local/vcf_concatenate_bcftools/main.nf index 6137679d..583b6070 100644 --- a/subworkflows/local/vcf_concatenate_bcftools/main.nf +++ b/subworkflows/local/vcf_concatenate_bcftools/main.nf @@ -11,8 +11,7 @@ workflow VCF_CONCATENATE_BCFTOOLS { ch_versions = Channel.empty() // Remove chromosome from meta - ch_vcf_tbi_grouped = ch_vcf_tbi - .map{ meta, vcf, tbi -> [['id' : meta.id], vcf, tbi] } + ch_vcf_tbi_grouped = ch_vcf_tbi.map{ meta, vcf, tbi -> [['id' : meta.id], vcf, tbi] } // Group by ID ch_vcf_tbi_grouped = ch_vcf_tbi_grouped.groupTuple( by:0 ) From 30d9b41a9e0aab2c7493337dfa14353d18e7c45c Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Sun, 28 Apr 2024 00:52:16 +0000 Subject: [PATCH 32/58] change posfile generation to panelprep --- .../prepare_input_stitch/prepare_input_stitch.nf | 12 ++---------- .../local/vcf_sites_extract_bcftools/main.nf | 12 ++++++++++-- workflows/phaseimpute/main.nf | 8 ++++---- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf 
b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf index 07b50380..4ffd661c 100644 --- a/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf +++ b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf @@ -6,7 +6,7 @@ include { GAWK } from '../../../modules/nf-core/gaw workflow PREPARE_INPUT_STITCH { take: - ch_panel_sites + ch_posfile ch_fasta ch_input_impute @@ -14,16 +14,8 @@ workflow PREPARE_INPUT_STITCH { ch_versions = Channel.empty() - // Prepare posfile and parameters for STITCH - // Convert position file to tab-separated file - BCFTOOLS_QUERY(ch_panel_sites, [], [], []) - ch_posfile = BCFTOOLS_QUERY.out.output - - // Remove multiallelic positions from tsv - GAWK(ch_posfile, []) - // Get chromosomes of posfile - ch_posfile = GAWK.out.output.map{meta, posfile -> return[['chr': meta.chr], posfile]} + ch_posfile = ch_posfile.map{meta, posfile -> return[['chr': meta.chr], posfile]} // Get chromosomes of fasta ch_chromosomes = ch_fasta.map{it -> it[2]} diff --git a/subworkflows/local/vcf_sites_extract_bcftools/main.nf b/subworkflows/local/vcf_sites_extract_bcftools/main.nf index 8b99a9a8..bd59fc21 100644 --- a/subworkflows/local/vcf_sites_extract_bcftools/main.nf +++ b/subworkflows/local/vcf_sites_extract_bcftools/main.nf @@ -5,6 +5,8 @@ include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2 } from '../../../modules/nf-cor include { TABIX_BGZIP } from '../../../modules/nf-core/tabix/bgzip/main' include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main' include { BCFTOOLS_QUERY } from '../../../modules/nf-core/bcftools/query/main.nf' +include { BCFTOOLS_QUERY as BCFTOOLS_QUERY_STITCH} from '../../../modules/nf-core/bcftools/query/main.nf' +include { GAWK as GAWK_STITCH } from '../../../modules/nf-core/gawk' @@ -43,7 +45,6 @@ workflow VCF_SITES_EXTRACT_BCFTOOLS { ch_panel_tsv = [] // Create TSVs for different tools -// if (params.tools.contains("glimpse1")) { // Convert to TSV with structure for Glimpse 
BCFTOOLS_QUERY(ch_panel_sites, [], [], []) @@ -60,11 +61,18 @@ workflow VCF_SITES_EXTRACT_BCFTOOLS { // Join compressed TSV and index ch_panel_tsv = TABIX_BGZIP.out.output.combine(TABIX_TABIX.out.tbi, by: 0) -// } + // TSV for STITCH + // Convert position file to tab-separated file + BCFTOOLS_QUERY_STITCH(ch_panel_sites, [], [], []) + ch_posfile = BCFTOOLS_QUERY_STITCH.out.output + + // Remove multiallelic positions from tsv + GAWK_STITCH(ch_posfile, []) emit: panel_tsv = ch_panel_tsv vcf_tbi = ch_panel_norm panel_sites = ch_panel_sites + posfile = GAWK_STITCH.out.output versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 227bb202..07686983 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -201,18 +201,18 @@ workflow PHASEIMPUTE { if (params.tools.split(',').contains("stitch")) { print("Impute with STITCH") - ch_panel_sites = [] + ch_posfile = [] // Obtain the user's posfile if provided or calculate it from ref panel file if (params.posfile) { - ch_panel_sites = params.posfile + ch_posfile = params.posfile } else if (params.panel) { // It should do all the panelprep functions if a panel is provided - ch_panel_sites = VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites + ch_posfile = VCF_SITES_EXTRACT_BCFTOOLS.out.posfile } else { error "No posfile or reference panel was included" } // Prepare inputs - PREPARE_INPUT_STITCH(ch_panel_sites, ch_fasta, ch_input_impute) + PREPARE_INPUT_STITCH(ch_posfile, ch_fasta, ch_input_impute) ch_versions = ch_versions.mix(PREPARE_INPUT_STITCH.out.versions) // Impute with STITCH From 22ef3d4c99e29002ff797bb54b8ff847a6a8ee4f Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Sun, 28 Apr 2024 00:53:01 +0000 Subject: [PATCH 33/58] change tsv and posfile config to panelprep --- conf/steps/imputation_stitch.config | 16 ---------------- conf/steps/panel_prep.config | 16 ++++++++++++++++ 
nextflow.config | 1 + 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/conf/steps/imputation_stitch.config b/conf/steps/imputation_stitch.config index 8cdf1a2e..48c805cb 100644 --- a/conf/steps/imputation_stitch.config +++ b/conf/steps/imputation_stitch.config @@ -21,12 +21,6 @@ process { } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:GAWK' { - ext.args = "'{ key = \$1 FS \$2 } !seen[key]++'" - ext.prefix = { "${meta.id}_${meta.chr}_no_multiallelic" } - ext.suffix = ".txt" - } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_NORM' { ext.args = '-m +any --output-type z' ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } @@ -56,16 +50,6 @@ process { maxRetries = 2 } - - - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_QUERY' { - ext.args = [ - "-f'%CHROM\t%POS\t%REF\t%ALT\\n'", - ].join(' ') - ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" } - } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_STITCH:BCFTOOLS_INDEX' { ext.args = '--tbi' cpus = 2 diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index c4d75ad2..bb5996ab 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -204,4 +204,20 @@ process { ] } + // TSV + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:BCFTOOLS_QUERY_STITCH' { + ext.args = [ + "-f'%CHROM\t%POS\t%REF\t%ALT\\n'", + ].join(' ') + ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:GAWK_STITCH' { + ext.args = "'{ key = \$1 FS \$2 } !seen[key]++'" + ext.prefix = { "${meta.id}_${meta.chr}_posfile" } + ext.suffix = ".txt" + } + + } diff --git a/nextflow.config b/nextflow.config index 782f484a..83c45c28 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,6 +54,7 @@ params { // STITCH k_val = 2 seed = 1 + posfile = null // Boilerplate options outdir = null From 
03dbb7ffc7333502e88c88458bb64ad37a2a7748 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Sun, 28 Apr 2024 01:23:52 +0000 Subject: [PATCH 34/58] remove unused config --- conf/steps/panel_prep.config | 133 ++++-------------- .../local/vcf_sites_extract_bcftools/main.nf | 38 +++-- 2 files changed, 46 insertions(+), 125 deletions(-) diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index bb5996ab..b20548df 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -25,104 +25,7 @@ process { "--no-version" ].join(' ') ext.prefix = { "${meta.id}_chrrename" } - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:.*' { - publishDir = [ - path: { "${params.outdir}/prep_panel/" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:BCFTOOLS_NORM' { - ext.args = [ - "-m", - "-any", - "--no-version" - ].join(' ') - ext.prefix = { "${meta.id}_norm" } - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:VIEW_VCF_SNPS' { - ext.args = [ - "-m 2", - "-M 2", - "-v snps", - "--output-type z", - "--no-version" - ].join(' ') - ext.prefix = { "${meta.id}_SNPS" } - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:VIEW_VCF_SITES' { - ext.args = [ - "-G", - "-m 2", - "-M 2", - "-v snps", - "--output-type z", - "--no-version" - ].join(' ') - ext.prefix = { "${meta.id}_C${meta.chr}_SITES" } - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:BCFTOOLS_QUERY' { - ext.args = [ - "-f'%CHROM\t%POS\t%REF,%ALT\\n'", - ].join(' ') - ext.prefix = { "${meta.id}_SITES_TSV" } - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:TABIX_TABIX' { - ext.args = [ - "-s1", - "-b2", - "-e2" - ].join(' ') - ext.prefix = { "${meta.id}_SITES_TSV" } - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:VCF_PHASE_SHAPEIT5:BEDTOOLS_MAKEWINDOWS' { 
- ext.args = [ - '-w 60000', - '-s 40000' - ].join(' ') - ext.prefix = { "${meta.id}_chunks" } - publishDir = [ - enabled: false - ] - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_PANEL:.*' { - publishDir = [ - path: { "${params.outdir}/prep_panel/concat" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - ext.prefix = { "${meta.id}_sites_concat" } - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_PANEL:BCFTOOLS_CONCAT' { - ext.args = {[ - "--ligate", - "--output-type z", - ].join(" ").trim()} - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_PANEL:BCFTOOLS_INDEX' { - ext.args = "--tbi" - } -} - -process { - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHR_CHECK:.*' { - publishDir = [ - path: { "${params.outdir}/prep_panel/" }, - mode: params.publish_dir_mode, - enabled: false - ] + publishDir = [ enabled: false ] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_NORM' { @@ -131,6 +34,7 @@ process { cpus = 2 memory = 400.MB maxRetries = 2 + publishDir = [ enabled: false ] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX' { @@ -144,6 +48,7 @@ process { cpus = 2 memory = 400.MB maxRetries = 2 + publishDir = [ enabled: false ] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX_2' { @@ -151,6 +56,7 @@ process { cpus = 2 memory = 400.MB maxRetries = 2 + publishDir = [ enabled: false ] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:VIEW_VCF_SNPS' { @@ -162,6 +68,7 @@ process { "--no-version" ].join(' ') ext.prefix = { "${meta.id}_SNPS" } + publishDir = [ enabled: false ] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:VIEW_VCF_SITES' { @@ -174,6 +81,11 @@ process { "--no-version" ].join(' ') ext.prefix = { "${meta.id}_C${meta.chr}_SITES" } + publishDir = [ + path: { "${params.outdir}/prep_panel/sites/vcf/" }, + mode: params.publish_dir_mode, + 
enabled: true + ] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:BCFTOOLS_QUERY' { @@ -181,6 +93,11 @@ process { "-f'%CHROM\t%POS\t%REF,%ALT\\n'", ].join(' ') ext.prefix = { "${meta.id}_glimpse_SITES_TSV" } + publishDir = [ + path: { "${params.outdir}/prep_panel/sites/tsv/" }, + mode: params.publish_dir_mode, + enabled: true + ] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:TABIX_TABIX' { @@ -190,6 +107,11 @@ process { "-e2" ].join(' ') ext.prefix = { "${meta.id}_glimpse_SITES_TSV" } + publishDir = [ + path: { "${params.outdir}/prep_panel/sites/tsv/" }, + mode: params.publish_dir_mode, + enabled: true + ] } // Phasing @@ -199,24 +121,27 @@ process { '-s 40000' ].join(' ') ext.prefix = { "${meta.id}_chunks" } - publishDir = [ - enabled: false - ] + publishDir = [ enabled: false ] } // TSV - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:BCFTOOLS_QUERY_STITCH' { ext.args = [ "-f'%CHROM\t%POS\t%REF\t%ALT\\n'", ].join(' ') ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" } + publishDir = [ enabled: false ] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:GAWK_STITCH' { ext.args = "'{ key = \$1 FS \$2 } !seen[key]++'" - ext.prefix = { "${meta.id}_${meta.chr}_posfile" } - ext.suffix = ".txt" + ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" } + ext.suffix = "txt" + publishDir = [ + path: { "${params.outdir}/prep_panel/sites/tsv/" }, + mode: params.publish_dir_mode, + enabled: true + ] } diff --git a/subworkflows/local/vcf_sites_extract_bcftools/main.nf b/subworkflows/local/vcf_sites_extract_bcftools/main.nf index bd59fc21..d3d30674 100644 --- a/subworkflows/local/vcf_sites_extract_bcftools/main.nf +++ b/subworkflows/local/vcf_sites_extract_bcftools/main.nf @@ -40,34 +40,30 @@ workflow VCF_SITES_EXTRACT_BCFTOOLS { // Join extracted sites and index ch_panel_sites = VIEW_VCF_SITES.out.vcf.combine(BCFTOOLS_INDEX_2.out.csi, by:0) - // Create empty channel - - ch_panel_tsv 
= [] - // Create TSVs for different tools - // Convert to TSV with structure for Glimpse - BCFTOOLS_QUERY(ch_panel_sites, [], [], []) - ch_versions = ch_versions.mix(BCFTOOLS_QUERY.out.versions.first()) + // Convert to TSV with structure for Glimpse + BCFTOOLS_QUERY(ch_panel_sites, [], [], []) + ch_versions = ch_versions.mix(BCFTOOLS_QUERY.out.versions.first()) - // Compress TSV - TABIX_BGZIP(BCFTOOLS_QUERY.out.output) - ch_versions = ch_versions.mix(TABIX_BGZIP.out.versions.first()) + // Compress TSV + TABIX_BGZIP(BCFTOOLS_QUERY.out.output) + ch_versions = ch_versions.mix(TABIX_BGZIP.out.versions.first()) - // Index compressed TSV - TABIX_TABIX(TABIX_BGZIP.out.output) - ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first()) + // Index compressed TSV + TABIX_TABIX(TABIX_BGZIP.out.output) + ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first()) - // Join compressed TSV and index - ch_panel_tsv = TABIX_BGZIP.out.output.combine(TABIX_TABIX.out.tbi, by: 0) + // Join compressed TSV and index + ch_panel_tsv = TABIX_BGZIP.out.output.combine(TABIX_TABIX.out.tbi, by: 0) - // TSV for STITCH - // Convert position file to tab-separated file - BCFTOOLS_QUERY_STITCH(ch_panel_sites, [], [], []) - ch_posfile = BCFTOOLS_QUERY_STITCH.out.output + // TSV for STITCH + // Convert position file to tab-separated file + BCFTOOLS_QUERY_STITCH(ch_panel_sites, [], [], []) + ch_posfile = BCFTOOLS_QUERY_STITCH.out.output - // Remove multiallelic positions from tsv - GAWK_STITCH(ch_posfile, []) + // Remove multiallelic positions from tsv + GAWK_STITCH(ch_posfile, []) emit: panel_tsv = ch_panel_tsv From 816366acca16e7811bb6f014319500af29a13197 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:15:32 +0000 Subject: [PATCH 35/58] back to working configs for stitch --- conf/steps/imputation_stitch.config | 16 ++++++++++++++++ conf/test_stitch.config | 1 + .../prepare_input_stitch/prepare_input_stitch.nf | 13 
+++++++++++-- workflows/phaseimpute/main.nf | 7 ++++--- 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/conf/steps/imputation_stitch.config b/conf/steps/imputation_stitch.config index 48c805cb..8cdf1a2e 100644 --- a/conf/steps/imputation_stitch.config +++ b/conf/steps/imputation_stitch.config @@ -21,6 +21,12 @@ process { } + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:GAWK' { + ext.args = "'{ key = \$1 FS \$2 } !seen[key]++'" + ext.prefix = { "${meta.id}_${meta.chr}_no_multiallelic" } + ext.suffix = ".txt" + } + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_NORM' { ext.args = '-m +any --output-type z' ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } @@ -50,6 +56,16 @@ process { maxRetries = 2 } + + + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_QUERY' { + ext.args = [ + "-f'%CHROM\t%POS\t%REF\t%ALT\\n'", + ].join(' ') + ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" } + } + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_STITCH:BCFTOOLS_INDEX' { ext.args = '--tbi' cpus = 2 diff --git a/conf/test_stitch.config b/conf/test_stitch.config index 3541a74c..6f36c76e 100644 --- a/conf/test_stitch.config +++ b/conf/test_stitch.config @@ -27,6 +27,7 @@ params { fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/reference_genome/21_22/hs38DH.chr21_22.fa" phased = true panel = "${projectDir}/tests/csv/panel.csv" + //posfile = "https://github.com/nf-core/test-datasets/raw/phaseimpute/data/panel/22/chr22_posfile_stitch.txt" // Impute parameters step = "impute" diff --git a/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf index 4ffd661c..c182e531 100644 --- a/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf +++ b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf @@ -6,7 +6,7 @@ include { GAWK } from '../../../modules/nf-core/gaw 
workflow PREPARE_INPUT_STITCH { take: - ch_posfile + ch_panel_sites ch_fasta ch_input_impute @@ -14,8 +14,17 @@ workflow PREPARE_INPUT_STITCH { ch_versions = Channel.empty() + // Prepare posfile and parameters for STITCH + // Convert position file to tab-separated file + BCFTOOLS_QUERY(ch_panel_sites, [], [], []) + ch_posfile = BCFTOOLS_QUERY.out.output + + // Remove multiallelic positions from tsv + GAWK(ch_posfile, []) + GAWK.out.output.dump(tag:"GAWK.out.output") + // Get chromosomes of posfile - ch_posfile = ch_posfile.map{meta, posfile -> return[['chr': meta.chr], posfile]} + ch_posfile = GAWK.out.output.map{meta, posfile -> return[['chr': meta.chr], posfile]} // Get chromosomes of fasta ch_chromosomes = ch_fasta.map{it -> it[2]} diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 07686983..69eb4685 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -203,11 +203,12 @@ workflow PHASEIMPUTE { ch_posfile = [] // Obtain the user's posfile if provided or calculate it from ref panel file - if (params.posfile) { - ch_posfile = params.posfile + if (params.posfile) { // Untested + ch_posfile = Channel.of([id:'posfile'], file(params.posfile), checkIfExists:true) } else if (params.panel) { // It should do all the panelprep functions if a panel is provided - ch_posfile = VCF_SITES_EXTRACT_BCFTOOLS.out.posfile + // Currently: the panelprep functions are run by default + ch_posfile = VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites } else { error "No posfile or reference panel was included" } From 1819ffe43cc09860186ec728b34afc391e2db3cf Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:47:36 +0000 Subject: [PATCH 36/58] allow several tools and steps to be run --- nextflow_schema.json | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index fec27056..b4163b34 100644 --- 
a/nextflow_schema.json +++ b/nextflow_schema.json @@ -138,13 +138,14 @@ "type": "string", "description": "Step to run.", "fa_icon": "fas fa-step-forward", - "enum": ["all", "simulate", "panelprep", "impute", "validate"] + "pattern": "^((all|simulate|panelprep|impute|validate)?,?)*(? Date: Mon, 29 Apr 2024 17:28:04 +0000 Subject: [PATCH 37/58] remove unused subworkflow --- workflows/phaseimpute/main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 69eb4685..78c5650d 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -25,7 +25,6 @@ include { BAM_DOWNSAMPLE } from '../../subworkflows/ // Panelprep subworkflows include { VCF_CHR_CHECK } from '../../subworkflows/local/vcf_chr_check' include { VCF_NORMALIZE_BCFTOOLS } from '../../subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools' -include { PANEL_PREPARE_CHANNELS } from '../../subworkflows/local/panel_prepare_channels' include { VCF_SITES_EXTRACT_BCFTOOLS } from '../../subworkflows/local/vcf_sites_extract_bcftools' include { VCF_PHASE_PANEL } from '../../subworkflows/local/vcf_phase_panel' From de52d05365376923999f7ab8c0079f1c488e5f43 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Mon, 29 Apr 2024 17:30:23 +0000 Subject: [PATCH 38/58] fix pre-commit --- modules.json | 146 +++++++++++++-------------------------------------- 1 file changed, 37 insertions(+), 109 deletions(-) diff --git a/modules.json b/modules.json index 05427c6f..53279d66 100644 --- a/modules.json +++ b/modules.json @@ -8,25 +8,19 @@ "bcftools/annotate": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/bcftools/annotate/bcftools-annotate.diff" }, "bcftools/concat": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": [ - 
"modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/bcftools/concat/bcftools-concat.diff" }, "bcftools/convert": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bcftools/index": { "branch": "master", @@ -41,196 +35,142 @@ "bcftools/mpileup": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/bcftools/mpileup/bcftools-mpileup.diff" }, "bcftools/norm": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bcftools/query": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bcftools/view": { "branch": "master", "git_sha": "1013101da4252623fd7acf19cc581bae91d4f839", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/bcftools/view/bcftools-view.diff" }, "bedtools/makewindows": { "branch": "master", "git_sha": "3b248b84694d1939ac4bb33df84bf6233a34d668", - "installed_by": [ - "vcf_phase_shapeit5" - ] + "installed_by": ["vcf_phase_shapeit5"] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "de45447d060b8c8b98575bc637a4a575fd0638e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "gawk": { "branch": "master", "git_sha": "da4d05d04e65227d4307e87940842f1a14de62c7", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "glimpse/chunk": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": [ - "vcf_impute_glimpse" - ] + "installed_by": ["vcf_impute_glimpse"] }, "glimpse/ligate": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": [ - "vcf_impute_glimpse" - ] + "installed_by": 
["vcf_impute_glimpse"] }, "glimpse/phase": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": [ - "vcf_impute_glimpse" - ] + "installed_by": ["vcf_impute_glimpse"] }, "glimpse2/chunk": { "branch": "master", "git_sha": "14ba46490cae3c78ed8e8f48d2c0f8f3be1e7c03", - "installed_by": [ - "multiple_impute_glimpse2" - ] + "installed_by": ["multiple_impute_glimpse2"] }, "glimpse2/concordance": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "glimpse2/ligate": { "branch": "master", "git_sha": "ee7fee68281944b002bd27a8ff3f19200b4d3fad", - "installed_by": [ - "multiple_impute_glimpse2" - ] + "installed_by": ["multiple_impute_glimpse2"] }, "glimpse2/phase": { "branch": "master", "git_sha": "9c71d32e372650e8bb3e1fb15339017aad5e3f7f", - "installed_by": [ - "multiple_impute_glimpse2" - ] + "installed_by": ["multiple_impute_glimpse2"] }, "glimpse2/splitreference": { "branch": "master", "git_sha": "fa12139827a18b324bd63fce654818586a8e9cc7", - "installed_by": [ - "multiple_impute_glimpse2" - ] + "installed_by": ["multiple_impute_glimpse2"] }, "gunzip": { "branch": "master", "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "quilt/quilt": { "branch": "master", "git_sha": "46265545d61e7f482adf40de941cc9a94e479bbe", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/coverage": { "branch": "master", "git_sha": "38afbe42f7db7f19c7a89607c0a71c68f3be3131", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/samtools/coverage/samtools-coverage.diff" }, "samtools/faidx": { "branch": "master", "git_sha": "f153f1f10e1083c49935565844cccb7453021682", - 
"installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/index": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/view": { "branch": "master", "git_sha": "0bd7d2333a88483aa0476acea172e9f5f6dd83bb", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/samtools/view/samtools-view.diff" }, "shapeit5/ligate": { "branch": "master", "git_sha": "dcf17cc0ed8fd5ea57e61a13e0147cddb5c1ee30", - "installed_by": [ - "vcf_phase_shapeit5" - ] + "installed_by": ["vcf_phase_shapeit5"] }, "shapeit5/phasecommon": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "vcf_phase_shapeit5" - ] + "installed_by": ["vcf_phase_shapeit5"] }, "stitch": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/stitch/stitch.diff" }, "tabix/bgzip": { "branch": "master", "git_sha": "09d3c8c29b31a2dfd610305b10550f0e1dbcd4a9", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tabix/tabix": { "branch": "master", "git_sha": "9502adb23c0b97ed8e616bbbdfa73b4585aec9a1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } }, @@ -239,47 +179,35 @@ "multiple_impute_glimpse2": { "branch": "master", "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": 
"5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "vcf_impute_glimpse": { "branch": "master", "git_sha": "7e56daae390ff896b292ddc70823447683a79936", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "vcf_phase_shapeit5": { "branch": "master", "git_sha": "dcf17cc0ed8fd5ea57e61a13e0147cddb5c1ee30", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] } } } } } -} \ No newline at end of file +} From 6fbaa91866b19221be41106839c2ad0a3ca9ac48 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Mon, 29 Apr 2024 17:46:12 +0000 Subject: [PATCH 39/58] fix linting --- modules/nf-core/stitch/meta.yml | 4 +- modules/nf-core/stitch/stitch.diff | 15 ++- nextflow_schema.json | 195 ++++++++++++++++------------- 3 files changed, 121 insertions(+), 93 deletions(-) diff --git a/modules/nf-core/stitch/meta.yml b/modules/nf-core/stitch/meta.yml index cb214af8..7df60130 100644 --- a/modules/nf-core/stitch/meta.yml +++ b/modules/nf-core/stitch/meta.yml @@ -1,3 +1,5 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: "stitch" description: "STITCH is an R program for reference panel free, read aware, low coverage sequencing genotype imputation. STITCH runs on a set of samples with sequencing reads in BAM format, as well as a list of positions to genotype, and outputs imputed genotypes in VCF format." 
keywords: @@ -115,4 +117,4 @@ output: authors: - "@saulpierotti" maintainers: - - "@saulpierotti" + - "@saulpierotti" \ No newline at end of file diff --git a/modules/nf-core/stitch/stitch.diff b/modules/nf-core/stitch/stitch.diff index eb00d43d..0a987c1b 100644 --- a/modules/nf-core/stitch/stitch.diff +++ b/modules/nf-core/stitch/stitch.diff @@ -7,19 +7,18 @@ Changes in module 'nf-core/stitch' input: - tuple val(meta) , path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen) - tuple val(meta2), path(collected_crams), path(collected_crais), path(cramlist) -+ tuple val(meta2), path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen) + tuple val(meta), path(collected_crams), path(collected_crais), path(cramlist) ++ tuple val(meta2), path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen) tuple val(meta3), path(fasta), path(fasta_fai) val seed --- modules/nf-core/stitch/meta.yml +++ modules/nf-core/stitch/meta.yml -@@ -1,5 +1,3 @@ ----- --# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json - name: "stitch" - description: "STITCH is an R program for reference panel free, read aware, low coverage sequencing genotype imputation. STITCH runs on a set of samples with sequencing reads in BAM format, as well as a list of positions to genotype, and outputs imputed genotypes in VCF format." 
- keywords: - +@@ -117,4 +117,4 @@ + authors: + - "@saulpierotti" + maintainers: +- - "@saulpierotti" ++ - "@saulpierotti" ************************************************************ diff --git a/nextflow_schema.json b/nextflow_schema.json index b4163b34..c11172a9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -5,6 +5,69 @@ "description": "Phasing and imputation pipeline", "type": "object", "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": ["outdir"], + "properties": { + "input": { + "type": "string", + "format": "file-path", + "exists": true, + "schema": "assets/schema_input.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$", + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/phaseimpute/usage#samplesheet-input).", + "fa_icon": "fas fa-file-csv" + }, + "input_region": { + "type": "string", + "description": "Region of the genome to use (optional: if no file given, the whole genome will be used). The file should be a comma-separated file with 3 columns, and a header row.", + "schema": "assets/schema_input_region.json", + "format": "file-path", + "pattern": "^\\S+\\.csv$" + }, + "outdir": { + "type": "string", + "format": "directory-path", + "description": "The output directory where the results will be saved. 
You have to use absolute paths to storage on Cloud infrastructure.", + "fa_icon": "fas fa-folder-open" + }, + "rename_chr": { + "type": "boolean", + "description": "Should the panel vcf files be renamed to match the reference genome (e.g. 'chr1' -> '1')", + "pattern": "true|false" + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, + "multiqc_title": { + "type": "string", + "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", + "fa_icon": "fas fa-file-signature" + }, + "step": { + "type": "string", + "description": "Step to run.", + "fa_icon": "fas fa-step-forward", + "pattern": "^((all|simulate|panelprep|impute|validate)?,?)*(? '1')", - "pattern": "true|false" - }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - }, - "multiqc_title": { - "type": "string", - "description": "MultiQC report title. 
Printed as page header, used for filename if not otherwise specified.", - "fa_icon": "fas fa-file-signature" - }, - "step": { - "type": "string", - "description": "Step to run.", - "fa_icon": "fas fa-step-forward", - "pattern": "^((all|simulate|panelprep|impute|validate)?,?)*(? Date: Mon, 29 Apr 2024 18:01:11 +0000 Subject: [PATCH 40/58] fix pre commit --- modules/nf-core/stitch/meta.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/stitch/meta.yml b/modules/nf-core/stitch/meta.yml index 7df60130..a36d61cd 100644 --- a/modules/nf-core/stitch/meta.yml +++ b/modules/nf-core/stitch/meta.yml @@ -117,4 +117,4 @@ output: authors: - "@saulpierotti" maintainers: - - "@saulpierotti" \ No newline at end of file + - "@saulpierotti" From c5e000d6d0afc08a8baec4b154753d0361ed1ba4 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Wed, 1 May 2024 20:48:54 +0000 Subject: [PATCH 41/58] allow several steps to be selected --- conf/test.config | 2 +- conf/test_quilt.config | 2 +- conf/test_stitch.config | 2 +- main.nf | 6 +++--- .../local/utils_nfcore_phaseimpute_pipeline/main.nf | 2 +- workflows/phaseimpute/main.nf | 10 +++++----- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/conf/test.config b/conf/test.config index 525d6fe0..c754b84e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,6 +29,6 @@ params { phased = true // Impute parameters - step = "impute" + step = "panelprep,impute" tools = "glimpse1" } diff --git a/conf/test_quilt.config b/conf/test_quilt.config index a7d04a00..27d31445 100644 --- a/conf/test_quilt.config +++ b/conf/test_quilt.config @@ -29,6 +29,6 @@ params { phased = true // Impute parameters - step = "impute" + step = "panelprep,impute" tools = "quilt" } diff --git a/conf/test_stitch.config b/conf/test_stitch.config index 6f36c76e..16850791 100644 --- a/conf/test_stitch.config +++ b/conf/test_stitch.config @@ -30,6 +30,6 @@ params { //posfile = 
"https://github.com/nf-core/test-datasets/raw/phaseimpute/data/panel/22/chr22_posfile_stitch.txt" // Impute parameters - step = "impute" + step = "panelprep,impute" tools = "stitch" } diff --git a/main.nf b/main.nf index 319c2f5c..92f1c378 100644 --- a/main.nf +++ b/main.nf @@ -52,15 +52,15 @@ workflow NFCORE_PHASEIMPUTE { input_simulate = Channel.empty() input_validate = Channel.empty() - if (params.step == "impute") { + if (params.step.split(',').contains("impute")) { input_impute = ch_input .combine(ch_regions) .map { metaI, file, index, metaCR, region -> [ metaI+metaCR, file, index ] } - } else if (params.step == "simulate" || params.step == "all") { + } else if (params.step.split(',').contains("simulate") || params.step.split(',').contains("all")) { input_simulate = ch_input - } else if (params.step == "validate") { + } else if (params.step.split(',').contains("validate")) { input_validate = ch_input .combine(ch_regions) .map { metaI, file, index, metaCR, region -> diff --git a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf index 77f7fe42..3cfae016 100644 --- a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf @@ -274,7 +274,7 @@ def validateInputParameters() { assert params.step, "A step must be provided" // Check that at least one tool is provided - if (params.step == "impute" || params.step == "panel_prep") { + if (params.step.split(',').contains("impute") || params.step.split(',').contains("panelprep")) { assert params.tools, "No tools provided" } } diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 78c5650d..3db859fe 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -77,7 +77,7 @@ workflow PHASEIMPUTE { // // Simulate data if asked // - if (params.step == 'simulate' || params.step == 'all') { + if (params.step.split(',').contains("simulate") 
|| params.step.split(',').contains("all")) { // Output channel of simulate process ch_sim_output = Channel.empty() @@ -115,7 +115,7 @@ workflow PHASEIMPUTE { // // Prepare panel // - if (params.step == 'impute' || params.step == 'panel_prep' || params.step == 'validate' || params.step == 'all') { + if (params.step.split(',').contains("panelprep") || params.step.split(',').contains("validate") || params.step.split(',').contains("all")) { // Check chr prefix and remove if necessary VCF_CHR_CHECK(ch_panel, ch_fasta) ch_versions = ch_versions.mix(VCF_CHR_CHECK.out.versions) @@ -153,7 +153,7 @@ workflow PHASEIMPUTE { -> [metaPC, phased, p_index] } - if (params.step == 'impute' || params.step == 'all') { + if (params.step.split(',').contains("impute") || params.step.split(',').contains("all")) { // Output channel of input process ch_impute_output = Channel.empty() if (params.tools.split(',').contains("glimpse1")) { @@ -249,7 +249,7 @@ workflow PHASEIMPUTE { } - if (params.step == 'validate' || params.step == 'all') { + if (params.step.split(',').contains("validate") || params.step.split(',').contains("all")) { ch_truth_vcf = Channel.empty() // Get extension of input files truth_ext = getAllFilesExtension(ch_input_validate_truth) @@ -290,7 +290,7 @@ workflow PHASEIMPUTE { ch_versions = ch_versions.mix(VCF_CONCORDANCE_GLIMPSE2.out.versions) } - if (params.step == 'refine') { + if (params.step.split(',').contains("refine")) { error "refine step not yet implemented" } From 4456a588808a3f8487ef21aa55c309e31b169e54 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Wed, 1 May 2024 21:06:17 +0000 Subject: [PATCH 42/58] move stitch posfile preparation to panelprep section --- conf/steps/imputation_stitch.config | 4 +-- .../prepare_input_stitch.nf | 18 ++----------- .../prepare_posfile_tsv.nf | 26 +++++++++++++++++++ workflows/phaseimpute/main.nf | 12 +++++---- 4 files changed, 37 insertions(+), 23 deletions(-) create mode 100644 
subworkflows/local/prepare_input_stitch/prepare_posfile_tsv.nf diff --git a/conf/steps/imputation_stitch.config b/conf/steps/imputation_stitch.config index 8cdf1a2e..49deab4c 100644 --- a/conf/steps/imputation_stitch.config +++ b/conf/steps/imputation_stitch.config @@ -21,7 +21,7 @@ process { } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:GAWK' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_POSFILE_TSV:GAWK' { ext.args = "'{ key = \$1 FS \$2 } !seen[key]++'" ext.prefix = { "${meta.id}_${meta.chr}_no_multiallelic" } ext.suffix = ".txt" @@ -59,7 +59,7 @@ process { - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_QUERY' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_POSFILE_TSV:BCFTOOLS_QUERY' { ext.args = [ "-f'%CHROM\t%POS\t%REF\t%ALT\\n'", ].join(' ') diff --git a/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf index c182e531..87cc489e 100644 --- a/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf +++ b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf @@ -1,12 +1,7 @@ -include { BCFTOOLS_QUERY } from '../../../modules/nf-core/bcftools/query/main' -include { BCFTOOLS_NORM } from '../../../modules/nf-core/bcftools/norm/main' -include { GAWK } from '../../../modules/nf-core/gawk' - - workflow PREPARE_INPUT_STITCH { take: - ch_panel_sites + ch_posfile ch_fasta ch_input_impute @@ -14,17 +9,8 @@ workflow PREPARE_INPUT_STITCH { ch_versions = Channel.empty() - // Prepare posfile and parameters for STITCH - // Convert position file to tab-separated file - BCFTOOLS_QUERY(ch_panel_sites, [], [], []) - ch_posfile = BCFTOOLS_QUERY.out.output - - // Remove multiallelic positions from tsv - GAWK(ch_posfile, []) - GAWK.out.output.dump(tag:"GAWK.out.output") - // Get chromosomes of posfile - ch_posfile = GAWK.out.output.map{meta, posfile -> return[['chr': meta.chr], posfile]} + ch_posfile = ch_posfile.map{meta, 
posfile -> return[['chr': meta.chr], posfile]} // Get chromosomes of fasta ch_chromosomes = ch_fasta.map{it -> it[2]} diff --git a/subworkflows/local/prepare_input_stitch/prepare_posfile_tsv.nf b/subworkflows/local/prepare_input_stitch/prepare_posfile_tsv.nf new file mode 100644 index 00000000..0612d9bc --- /dev/null +++ b/subworkflows/local/prepare_input_stitch/prepare_posfile_tsv.nf @@ -0,0 +1,26 @@ +include { BCFTOOLS_QUERY } from '../../../modules/nf-core/bcftools/query/main' +include { GAWK } from '../../../modules/nf-core/gawk' + + +workflow PREPARE_POSFILE_TSV { + + take: + ch_panel_sites + ch_fasta + + main: + + ch_versions = Channel.empty() + + // Convert position file to tab-separated file + BCFTOOLS_QUERY(ch_panel_sites, [], [], []) + ch_posfile = BCFTOOLS_QUERY.out.output + + // Remove multiallelic positions from tsv + GAWK(ch_posfile, []) + + emit: + posfile = GAWK.out.output // channel: [ [id, chr], txt ] + versions = ch_versions // channel: [ versions.yml ] + +} diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 3db859fe..72216de8 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -27,6 +27,7 @@ include { VCF_CHR_CHECK } from '../../subworkflows/ include { VCF_NORMALIZE_BCFTOOLS } from '../../subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools' include { VCF_SITES_EXTRACT_BCFTOOLS } from '../../subworkflows/local/vcf_sites_extract_bcftools' include { VCF_PHASE_PANEL } from '../../subworkflows/local/vcf_phase_panel' +include { PREPARE_POSFILE_TSV } from '../../subworkflows/local/prepare_input_stitch/prepare_posfile_tsv' // GLIMPSE subworkflows include { VCF_IMPUTE_GLIMPSE as VCF_IMPUTE_GLIMPSE1 } from '../../subworkflows/nf-core/vcf_impute_glimpse' @@ -152,6 +153,9 @@ workflow PHASEIMPUTE { .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index -> [metaPC, phased, p_index] } + // Prepare posfile stitch + 
PREPARE_POSFILE_TSV(VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites, ch_fasta) + ch_versions = ch_versions.mix(PREPARE_POSFILE_TSV.out.versions) if (params.step.split(',').contains("impute") || params.step.split(',').contains("all")) { // Output channel of input process @@ -204,12 +208,10 @@ workflow PHASEIMPUTE { // Obtain the user's posfile if provided or calculate it from ref panel file if (params.posfile) { // Untested ch_posfile = Channel.of([id:'posfile'], file(params.posfile), checkIfExists:true) - } else if (params.panel) { - // It should do all the panelprep functions if a panel is provided - // Currently: the panelprep functions are run by default - ch_posfile = VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites + } else if (params.panel && params.step.split(',').contains("panelprep")) { + ch_posfile = PREPARE_POSFILE_TSV.out.posfile } else { - error "No posfile or reference panel was included" + error "No posfile or reference panel preparation was included" } // Prepare inputs PREPARE_INPUT_STITCH(ch_posfile, ch_fasta, ch_input_impute) From 4741e6f8a25c2775dfc33e8970a80a115ed0fef6 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Wed, 1 May 2024 21:13:14 +0000 Subject: [PATCH 43/58] remove concat to avoid input filename collision --- workflows/phaseimpute/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 72216de8..d81056bc 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -278,13 +278,13 @@ workflow PHASEIMPUTE { .mix(GL_TRUTH.out.vcf) // Concatenate by chromosomes - CONCAT_TRUTH(ch_truth_vcf) - ch_versions = ch_versions.mix(CONCAT_TRUTH.out.versions) + // CONCAT_TRUTH(ch_truth_vcf) + // ch_versions = ch_versions.mix(CONCAT_TRUTH.out.versions) // Compute concordance analysis VCF_CONCORDANCE_GLIMPSE2( ch_input_validate, - CONCAT_TRUTH.out.vcf_tbi_join, + ch_truth_vcf, ch_panel_sites, 
ch_region ) From b907cbeda6b944e88f5ee6c3c0e7a915305f2863 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Wed, 1 May 2024 23:08:01 +0000 Subject: [PATCH 44/58] modify code to accept user supplied posfile --- conf/test_stitch.config | 5 +-- .../prepare_input_stitch.nf | 40 +++++++++++++------ workflows/phaseimpute/main.nf | 11 +++-- 3 files changed, 34 insertions(+), 22 deletions(-) diff --git a/conf/test_stitch.config b/conf/test_stitch.config index 16850791..228a3447 100644 --- a/conf/test_stitch.config +++ b/conf/test_stitch.config @@ -26,10 +26,9 @@ params { // Genome references fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/reference_genome/21_22/hs38DH.chr21_22.fa" phased = true - panel = "${projectDir}/tests/csv/panel.csv" - //posfile = "https://github.com/nf-core/test-datasets/raw/phaseimpute/data/panel/22/chr22_posfile_stitch.txt" + posfile = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/panel/22/chr22_posfile_stitch.txt" // Impute parameters - step = "panelprep,impute" + step = "impute" tools = "stitch" } diff --git a/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf index 87cc489e..952a53ba 100644 --- a/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf +++ b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf @@ -9,24 +9,38 @@ workflow PREPARE_INPUT_STITCH { ch_versions = Channel.empty() - // Get chromosomes of posfile - ch_posfile = ch_posfile.map{meta, posfile -> return[['chr': meta.chr], posfile]} - - // Get chromosomes of fasta - ch_chromosomes = ch_fasta.map{it -> it[2]} - .splitCsv(header: ["chr", "size", "offset", "lidebase", "linewidth", "qualoffset"], sep: "\t") - .map{it -> return [[chr: it.chr], it.chr]} - - // Combine channels + // Value channels def input_empty = [[]] def rdata_empty = [[]] k_val = params.k_val ngen = 
params.ngen - // Make final channel with parameters - stitch_parameters = ch_posfile.map { it + input_empty + rdata_empty} - .join(ch_chromosomes) - .map { it + k_val + ngen} + if (params.panel && params.step.split(',').contains("panelprep")) { + // Get chromosomes of posfile + ch_posfile = ch_posfile.map{meta, posfile -> return[['chr': meta.chr], posfile]} + + // Get chromosomes of fasta + ch_chromosomes = ch_fasta.map{it -> it[2]} + .splitCsv(header: ["chr", "size", "offset", "lidebase", "linewidth", "qualoffset"], sep: "\t") + .map{it -> return [[chr: it.chr], it.chr]} + + // Make final channel with parameters + stitch_parameters = ch_posfile.map { it + input_empty + rdata_empty} + .join(ch_chromosomes) + .map { it + k_val + ngen} + } else if (params.posfile){ + // Get unique chromosomes in posfile + ch_chromosomes_posfile = ch_posfile.map{it -> it[1]} + .splitCsv(header: ["chr", "pos", "ref", "alt"], sep: "\t") + .map{it -> return [it.chr]} + + ch_chromosomes_posfile = ch_chromosomes_posfile.unique() + + // Make final channel with parameters + stitch_parameters = ch_posfile.map { it + input_empty + rdata_empty} + .combine(ch_chromosomes_posfile) + .map { it + k_val + ngen} + } // Prepare sample files for STITCH // Group input by ID diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index d81056bc..6bb716a1 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -156,8 +156,9 @@ workflow PHASEIMPUTE { // Prepare posfile stitch PREPARE_POSFILE_TSV(VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites, ch_fasta) ch_versions = ch_versions.mix(PREPARE_POSFILE_TSV.out.versions) + } - if (params.step.split(',').contains("impute") || params.step.split(',').contains("all")) { + if (params.step.split(',').contains("impute") || params.step.split(',').contains("all")) { // Output channel of input process ch_impute_output = Channel.empty() if (params.tools.split(',').contains("glimpse1")) { @@ -206,9 +207,9 @@ workflow PHASEIMPUTE { 
ch_posfile = [] // Obtain the user's posfile if provided or calculate it from ref panel file - if (params.posfile) { // Untested - ch_posfile = Channel.of([id:'posfile'], file(params.posfile), checkIfExists:true) - } else if (params.panel && params.step.split(',').contains("panelprep")) { + if (params.posfile) { // User supplied posfile + ch_posfile = Channel.of([['id':'posfile'], file(params.posfile, checkIfExists:true)]) + } else if (params.panel && params.step.split(',').contains("panelprep")) { // Panelprep posfile ch_posfile = PREPARE_POSFILE_TSV.out.posfile } else { error "No posfile or reference panel preparation was included" @@ -249,8 +250,6 @@ workflow PHASEIMPUTE { ch_input_validate = ch_input_validate.mix(CONCAT_IMPUT.out.vcf_tbi_join) } - } - if (params.step.split(',').contains("validate") || params.step.split(',').contains("all")) { ch_truth_vcf = Channel.empty() // Get extension of input files From 177fa2bedbc04351dce8333f4bb84ec874167f35 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Wed, 1 May 2024 23:21:20 +0000 Subject: [PATCH 45/58] add posfile documentation --- docs/usage.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index d82bfc06..e542f603 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -150,6 +150,16 @@ Otherwise, you can provide your own position file in the `--mode impute` with ST ```bash nextflow run nf-core/phaseimpute --input samplesheet.csv --step impute --posfile posfile.txt --tool stitch --outdir results --genome GRCh37 -profile docker ``` +The tsv with the list of positions provided in `--posfile` should have the following structure, from STITCH documentation: "File is tab separated with no header, one row per SNP, with col 1 = chromosome, col 2 = physical position (sorted from smallest to largest), col 3 = reference base, col 4 = alternate base. Bases are capitalized. 
STITCH only handles bi-allelic SNPs" [STITCH](https://github.com/rwdavies/STITCH/blob/master/Options.md) + +As an example: + +```console +chr22 16570065 A G +chr22 16570067 A C +chr22 16570176 C A +chr22 16570211 T C +``` #### GLIMPSE1 From 73ff1c02030bd2cc836df83b78bd17c8f0e93fd6 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 2 May 2024 12:40:29 +0000 Subject: [PATCH 46/58] correct docs --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index e542f603..602eba27 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -125,7 +125,7 @@ genome: 'GRCh37' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). -### Imputation tools `--mode impute --tools` +### Imputation tools `--step impute --tools [glimpse1, quilt, stitch]` You can choose different software to perform the imputation. In the following sections, the typical commands for running the pipeline with each software are included. 
From 4fd69a54b76228cf3726402c18524ae70e2cc871 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 2 May 2024 12:40:40 +0000 Subject: [PATCH 47/58] remove phased params --- conf/test_stitch.config | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conf/test_stitch.config b/conf/test_stitch.config index 228a3447..b7d2d805 100644 --- a/conf/test_stitch.config +++ b/conf/test_stitch.config @@ -24,8 +24,7 @@ params { input_region = "${projectDir}/tests/csv/region.csv" // Genome references - fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/reference_genome/21_22/hs38DH.chr21_22.fa" - phased = true + fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/reference_genome/21_22/hs38DH.chr21_22.fa" posfile = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/panel/22/chr22_posfile_stitch.txt" // Impute parameters From 94dd953ed168d0d72aeac5db00d6a178e084ea50 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 2 May 2024 12:41:19 +0000 Subject: [PATCH 48/58] remove cpu and memory configs --- conf/steps/imputation_quilt.config | 12 ------------ conf/steps/imputation_stitch.config | 10 ---------- conf/steps/panel_prep.config | 6 ------ 3 files changed, 28 deletions(-) diff --git a/conf/steps/imputation_quilt.config b/conf/steps/imputation_quilt.config index f75c777e..32c64857 100644 --- a/conf/steps/imputation_quilt.config +++ b/conf/steps/imputation_quilt.config @@ -40,46 +40,34 @@ process { } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_INDEX' { - cpus = 2 - memory = 400.MB maxRetries = 2 } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_INDEX_2' { ext.args = '--tbi' - cpus = 2 - memory = 400.MB maxRetries = 2 } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_INDEX_3' { ext.args = '--tbi' - cpus = 2 - memory = 400.MB maxRetries 
= 2 } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_VIEW' { ext.args = '-v snps -Oz' ext.prefix = { "${meta.id}_${meta.chr}_biallelic" } - cpus = 2 - memory = 400.MB maxRetries = 2 } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_NORM' { ext.args = '-m +any --output-type z' ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } - cpus = 2 - memory = 400.MB maxRetries = 2 } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_CONVERT' { ext.args = '--haplegendsample test' ext.prefix = { "${meta.id}_${meta.chr}_convert" } - cpus = 2 - memory = 400.MB maxRetries = 2 } diff --git a/conf/steps/imputation_stitch.config b/conf/steps/imputation_stitch.config index 49deab4c..5962e5e7 100644 --- a/conf/steps/imputation_stitch.config +++ b/conf/steps/imputation_stitch.config @@ -30,29 +30,21 @@ process { withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_NORM' { ext.args = '-m +any --output-type z' ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } - cpus = 2 - memory = 400.MB maxRetries = 2 } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_VIEW' { ext.args = '-v snps -Oz' ext.prefix = { "${meta.id}_${meta.chr}_biallelic" } - cpus = 2 - memory = 400.MB maxRetries = 2 } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_INDEX' { - cpus = 2 - memory = 400.MB maxRetries = 2 } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_INDEX_2' { ext.args = '--tbi' - cpus = 2 - memory = 400.MB maxRetries = 2 } @@ -68,8 +60,6 @@ process { withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_STITCH:BCFTOOLS_INDEX' { ext.args = '--tbi' - cpus = 2 - memory = 400.MB maxRetries = 2 } diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index b20548df..af8d62c5 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -31,8 +31,6 @@ process { withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_NORM' { 
ext.args = '-m +any --no-version --output-type z' ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } - cpus = 2 - memory = 400.MB maxRetries = 2 publishDir = [ enabled: false ] } @@ -45,16 +43,12 @@ process { withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_VIEW' { ext.args = '-v snps -Oz' ext.prefix = { "${meta.id}_${meta.chr}_biallelic" } - cpus = 2 - memory = 400.MB maxRetries = 2 publishDir = [ enabled: false ] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX_2' { ext.args = '--tbi' - cpus = 2 - memory = 400.MB maxRetries = 2 publishDir = [ enabled: false ] } From 05f226e29f9a7dd7b0b1b417fdb7f8806c804922 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 3 May 2024 13:05:57 +0000 Subject: [PATCH 49/58] add posfile as csv --- assets/schema_posfile.json | 23 +++++++++++++++++++++++ conf/test_stitch.config | 2 +- nextflow_schema.json | 14 +++++++++----- tests/csv/posfile.csv | 2 ++ 4 files changed, 35 insertions(+), 6 deletions(-) create mode 100644 assets/schema_posfile.json create mode 100644 tests/csv/posfile.csv diff --git a/assets/schema_posfile.json b/assets/schema_posfile.json new file mode 100644 index 00000000..07a23ff1 --- /dev/null +++ b/assets/schema_posfile.json @@ -0,0 +1,23 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/assets/schema_posfile.json", + "title": "nf-core/phaseimpute pipeline - params.posfile schema", + "description": "Schema for the file provided with params.posfile", + "type": "array", + "items": { + "type": "object", + "properties": { + "chr": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Chromosome name must be provided as a string and cannot contain spaces" + }, + "file": { + "type": "integer", + "pattern": "^\\S+\\.txt$", + "errorMessage": "Posfile per chromosome must be provided. 
Must have .txt extension" + } + }, + "required": ["chr", "file"] + } +} diff --git a/conf/test_stitch.config b/conf/test_stitch.config index b7d2d805..11508421 100644 --- a/conf/test_stitch.config +++ b/conf/test_stitch.config @@ -25,7 +25,7 @@ params { // Genome references fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/reference_genome/21_22/hs38DH.chr21_22.fa" - posfile = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/panel/22/chr22_posfile_stitch.txt" + posfile = "${projectDir}/tests/csv/posfile.csv" // Impute parameters step = "impute" diff --git a/nextflow_schema.json b/nextflow_schema.json index c11172a9..cbf73d2a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -396,7 +396,7 @@ "quilt_parameters": { "title": "QUILT parameters", "type": "object", - "description": "", + "description": "Arguments to customize QUILT run", "default": "", "properties": { "buffer": { @@ -407,15 +407,14 @@ "ngen": { "type": "integer", "default": 100, - "description": "Number of generations since founding of the population to use for imputation.", - "help_text": "" + "description": "Number of generations since founding of the population to use for imputation." } } }, "stitch_parameters": { "title": "STITCH parameters", "type": "object", - "description": "", + "description": "Arguments to customize STITCH run", "default": "", "properties": { "seed": { @@ -424,7 +423,12 @@ }, "posfile": { "type": "string", - "description": "Tab-separated file describing the variable positions to be used for imputation. Refer to the documentation for the `--posfile` argument of STITCH for more information." + "description": "Path to comma-separated file containing tab-separated files describing the variable positions to be used for imputation. 
Refer to the documentation for the `--posfile` argument of STITCH for more information.", + "format": "file-path", + "schema": "assets/schema_posfile.json", + "pattern": "^\\S+\\.(csv|tsv|txt)$", + "mimetype": "text/csv", + "help_text": "" }, "k_val": { "type": "integer", diff --git a/tests/csv/posfile.csv b/tests/csv/posfile.csv new file mode 100644 index 00000000..d5a92024 --- /dev/null +++ b/tests/csv/posfile.csv @@ -0,0 +1,2 @@ +chr,file +chr22,"https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/panel/22/chr22_posfile_stitch.txt" From e4a2215718bec2520c168695c8388bfd20a0f666 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 3 May 2024 13:44:09 +0000 Subject: [PATCH 50/58] add posfile from csv --- assets/schema_posfile.json | 2 +- docs/usage.md | 15 ++++++++++++--- main.nf | 3 +++ .../utils_nfcore_phaseimpute_pipeline/main.nf | 13 +++++++++++++ workflows/phaseimpute/main.nf | 3 ++- 5 files changed, 31 insertions(+), 5 deletions(-) diff --git a/assets/schema_posfile.json b/assets/schema_posfile.json index 07a23ff1..5ed91deb 100644 --- a/assets/schema_posfile.json +++ b/assets/schema_posfile.json @@ -13,7 +13,7 @@ "errorMessage": "Chromosome name must be provided as a string and cannot contain spaces" }, "file": { - "type": "integer", + "type": "string", "pattern": "^\\S+\\.txt$", "errorMessage": "Posfile per chromosome must be provided. Must have .txt extension" } diff --git a/docs/usage.md b/docs/usage.md index 602eba27..95bf802f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -148,11 +148,20 @@ nextflow run nf-core/phaseimpute --input samplesheet.csv --step panelprep --pane Otherwise, you can provide your own position file in the `--mode impute` with STITCH using the the `--posfile` parameter. 
```bash -nextflow run nf-core/phaseimpute --input samplesheet.csv --step impute --posfile posfile.txt --tool stitch --outdir results --genome GRCh37 -profile docker +nextflow run nf-core/phaseimpute --input samplesheet.csv --step impute --posfile samplesheet_posfile.csv --tool stitch --outdir results --genome GRCh37 -profile docker ``` -The tsv with the list of positions provided in `--posfile` should have the following structure, from STITCH documentation: "File is tab separated with no header, one row per SNP, with col 1 = chromosome, col 2 = physical position (sorted from smallest to largest), col 3 = reference base, col 4 = alternate base. Bases are capitalized. STITCH only handles bi-allelic SNPs" [STITCH](https://github.com/rwdavies/STITCH/blob/master/Options.md) +The csv provided in `--posfile` must contain two columns [chr, file]. The first column is the chromosome and the file column are tsvs with the list of positions, unique to each chromosome. -As an example: +```console +chr,file +chr1,posfile_chr1.txt +chr2,posfile_chr2.txt +chr3,posfile_chr3.txt +``` + +The file column should contain a TSV with the following structure, from STITCH documentation: "File is tab separated with no header, one row per SNP, with col 1 = chromosome, col 2 = physical position (sorted from smallest to largest), col 3 = reference base, col 4 = alternate base. Bases are capitalized. STITCH only handles bi-allelic SNPs" [STITCH](https://github.com/rwdavies/STITCH/blob/master/Options.md). 
+ +As an example, chr22 tsv file: ```console chr22 16570065 A G diff --git a/main.nf b/main.nf index 92f1c378..1e1c3dd7 100644 --- a/main.nf +++ b/main.nf @@ -40,6 +40,7 @@ workflow NFCORE_PHASEIMPUTE { ch_regions // channel: regions to use [[chr, region], region] ch_depth // channel: depth of coverage file [[depth], depth] ch_map // channel: map file for imputation + ch_posfile // channel: samplesheet read in from --posfile ch_versions // channel: versions of software used main: @@ -86,6 +87,7 @@ workflow NFCORE_PHASEIMPUTE { ch_regions, ch_depth, ch_map, + ch_posfile, ch_versions ) @@ -128,6 +130,7 @@ workflow { PIPELINE_INITIALISATION.out.regions, PIPELINE_INITIALISATION.out.depth, PIPELINE_INITIALISATION.out.map, + PIPELINE_INITIALISATION.out.posfile, PIPELINE_INITIALISATION.out.versions ) diff --git a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf index 3cfae016..90dbd14e 100644 --- a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf @@ -207,6 +207,18 @@ workflow PIPELINE_INITIALISATION { ch_genotype = Channel.of([[],[]]) } + // + // Create posfile channel + // + + ch_posfile = Channel + .fromSamplesheet("posfile") + .map { + meta, file -> + [ meta, file ] + } + ch_posfile.dump(tag:"ch_posfile_initialisation") + emit: input = ch_input // [ [meta], file, index ] @@ -216,6 +228,7 @@ workflow PIPELINE_INITIALISATION { depth = ch_depth // [ [depth], depth ] regions = ch_regions // [ [chr, region], region ] map = ch_map // [ [map], map ] + posfile = ch_posfile // [ [chr], txt ] versions = ch_versions } diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 6bb716a1..fd07be06 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -69,6 +69,7 @@ workflow PHASEIMPUTE { ch_region // channel: region to use [ [chr, region], region] ch_depth // channel: depth select [ 
[depth], depth ] ch_map // channel: genetic map [ [chr], map] + ch_posfile // channel: posfile [ [chr], txt] ch_versions // channel: versions of software used main: @@ -205,7 +206,7 @@ workflow PHASEIMPUTE { if (params.tools.split(',').contains("stitch")) { print("Impute with STITCH") - ch_posfile = [] + ch_posfile.dump(tag:"ch_posfile_stitch") // Obtain the user's posfile if provided or calculate it from ref panel file if (params.posfile) { // User supplied posfile ch_posfile = Channel.of([['id':'posfile'], file(params.posfile, checkIfExists:true)]) From 002cc126b5611e93ecc1819aed90b09b4252204f Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 3 May 2024 14:03:21 +0000 Subject: [PATCH 51/58] adapt code to posfile csv input --- assets/schema_posfile.json | 3 +- .../prepare_input_stitch.nf | 38 ++++++------------- .../utils_nfcore_phaseimpute_pipeline/main.nf | 5 ++- workflows/phaseimpute/main.nf | 3 +- 4 files changed, 18 insertions(+), 31 deletions(-) diff --git a/assets/schema_posfile.json b/assets/schema_posfile.json index 5ed91deb..5a3c9d5a 100644 --- a/assets/schema_posfile.json +++ b/assets/schema_posfile.json @@ -10,7 +10,8 @@ "chr": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Chromosome name must be provided as a string and cannot contain spaces" + "errorMessage": "Chromosome name must be provided as a string and cannot contain spaces", + "meta": ["chr"] }, "file": { "type": "string", diff --git a/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf index 952a53ba..6b2e7a37 100644 --- a/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf +++ b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf @@ -15,32 +15,18 @@ workflow PREPARE_INPUT_STITCH { k_val = params.k_val ngen = params.ngen - if (params.panel && params.step.split(',').contains("panelprep")) { - // Get chromosomes of posfile - 
ch_posfile = ch_posfile.map{meta, posfile -> return[['chr': meta.chr], posfile]} - - // Get chromosomes of fasta - ch_chromosomes = ch_fasta.map{it -> it[2]} - .splitCsv(header: ["chr", "size", "offset", "lidebase", "linewidth", "qualoffset"], sep: "\t") - .map{it -> return [[chr: it.chr], it.chr]} - - // Make final channel with parameters - stitch_parameters = ch_posfile.map { it + input_empty + rdata_empty} - .join(ch_chromosomes) - .map { it + k_val + ngen} - } else if (params.posfile){ - // Get unique chromosomes in posfile - ch_chromosomes_posfile = ch_posfile.map{it -> it[1]} - .splitCsv(header: ["chr", "pos", "ref", "alt"], sep: "\t") - .map{it -> return [it.chr]} - - ch_chromosomes_posfile = ch_chromosomes_posfile.unique() - - // Make final channel with parameters - stitch_parameters = ch_posfile.map { it + input_empty + rdata_empty} - .combine(ch_chromosomes_posfile) - .map { it + k_val + ngen} - } + // Get chromosomes of posfile + ch_posfile = ch_posfile.map{meta, posfile -> return[['chr': meta.chr], posfile]} + + // Get chromosomes of fasta + ch_chromosomes = ch_fasta.map{it -> it[2]} + .splitCsv(header: ["chr", "size", "offset", "lidebase", "linewidth", "qualoffset"], sep: "\t") + .map{it -> return [[chr: it.chr], it.chr]} + + // Make final channel with parameters + stitch_parameters = ch_posfile.map { it + input_empty + rdata_empty} + .join(ch_chromosomes) + .map { it + k_val + ngen} // Prepare sample files for STITCH // Group input by ID diff --git a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf index 90dbd14e..28fada14 100644 --- a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf @@ -211,14 +211,15 @@ workflow PIPELINE_INITIALISATION { // Create posfile channel // + if (params.posfile) { ch_posfile = Channel .fromSamplesheet("posfile") .map { meta, file -> [ meta, file ] + }} else { + ch_posfile 
= [[]] } - ch_posfile.dump(tag:"ch_posfile_initialisation") - emit: input = ch_input // [ [meta], file, index ] diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index fd07be06..2186b72b 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -206,10 +206,9 @@ workflow PHASEIMPUTE { if (params.tools.split(',').contains("stitch")) { print("Impute with STITCH") - ch_posfile.dump(tag:"ch_posfile_stitch") // Obtain the user's posfile if provided or calculate it from ref panel file if (params.posfile) { // User supplied posfile - ch_posfile = Channel.of([['id':'posfile'], file(params.posfile, checkIfExists:true)]) + ch_posfile = ch_posfile } else if (params.panel && params.step.split(',').contains("panelprep")) { // Panelprep posfile ch_posfile = PREPARE_POSFILE_TSV.out.posfile } else { From 04f572c04f26fc37c7d2331c1726cb03fa71369c Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 3 May 2024 14:36:15 +0000 Subject: [PATCH 52/58] move hap and legend generation to panelprep --- conf/steps/imputation_quilt.config | 32 --------------- conf/steps/panel_prep.config | 5 +++ subworkflows/local/make_chunks/make_chunks.nf | 39 ------------------- .../vcf_normalize_bcftools.nf | 13 ++++++- workflows/phaseimpute/main.nf | 4 +- 5 files changed, 18 insertions(+), 75 deletions(-) diff --git a/conf/steps/imputation_quilt.config b/conf/steps/imputation_quilt.config index 32c64857..5b834c1a 100644 --- a/conf/steps/imputation_quilt.config +++ b/conf/steps/imputation_quilt.config @@ -39,38 +39,6 @@ process { ext.prefix = { "${meta.id}_${meta.chr}" } } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_INDEX' { - maxRetries = 2 - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_INDEX_2' { - ext.args = '--tbi' - maxRetries = 2 - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_INDEX_3' { - ext.args = '--tbi' - maxRetries = 2 - } - - 
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_VIEW' { - ext.args = '-v snps -Oz' - ext.prefix = { "${meta.id}_${meta.chr}_biallelic" } - maxRetries = 2 - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_NORM' { - ext.args = '-m +any --output-type z' - ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } - maxRetries = 2 - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_CONVERT' { - ext.args = '--haplegendsample test' - ext.prefix = { "${meta.id}_${meta.chr}_convert" } - maxRetries = 2 - } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:.*' { publishDir = [ [ diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index af8d62c5..3c08653f 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -108,6 +108,11 @@ process { ] } + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_CONVERT' { + ext.args = {"--haplegendsample ${meta.id}_${meta.chr}"} + maxRetries = 2 + } + // Phasing withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_PANEL:VCF_PHASE_SHAPEIT5:BEDTOOLS_MAKEWINDOWS' { ext.args = [ diff --git a/subworkflows/local/make_chunks/make_chunks.nf b/subworkflows/local/make_chunks/make_chunks.nf index c0fe7924..64f7d93a 100644 --- a/subworkflows/local/make_chunks/make_chunks.nf +++ b/subworkflows/local/make_chunks/make_chunks.nf @@ -1,17 +1,9 @@ -include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main' -include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2} from '../../../modules/nf-core/bcftools/index/main' -include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_3} from '../../../modules/nf-core/bcftools/index/main' include { GLIMPSE_CHUNK } from '../../../modules/nf-core/glimpse/chunk/main' -include { BCFTOOLS_CONVERT } from '../../../modules/nf-core/bcftools/convert/main' -include { BCFTOOLS_NORM } from '../../../modules/nf-core/bcftools/norm/main' -include { BCFTOOLS_VIEW } from '../../../modules/nf-core/bcftools/view/main' - 
workflow MAKE_CHUNKS { take: ch_reference // channel: [ val(meta),vcf ] - ch_fasta_fai // channel: [meta, fasta, fai] main: @@ -30,38 +22,7 @@ workflow MAKE_CHUNKS { [metamap, metamap.chr, startEnd[0], startEnd[1]] } - ch_fasta = ch_fasta_fai.map { meta, fasta, fai -> [meta, fasta] } - - // Join duplicated biallelic sites into multiallelic records - BCFTOOLS_NORM(ch_reference, ch_fasta) - - // Index multiallelic VCF - BCFTOOLS_INDEX_2(BCFTOOLS_NORM.out.vcf) - - // Join multiallelic VCF and TBI - ch_multiallelic_vcf_tbi = BCFTOOLS_NORM.out.vcf.join(BCFTOOLS_INDEX_2.out.tbi) - - // Remove all multiallelic records: - BCFTOOLS_VIEW(ch_multiallelic_vcf_tbi, [], [], []) - - // Index biallelic VCF - BCFTOOLS_INDEX_3(BCFTOOLS_VIEW.out.vcf) - - // Join biallelic VCF and TBI - ch_biallelic_vcf_tbi = BCFTOOLS_VIEW.out.vcf.join(BCFTOOLS_INDEX_3.out.tbi) - - // Convert VCF to Hap and Legend files - BCFTOOLS_CONVERT(ch_biallelic_vcf_tbi, ch_fasta, []) - - // Output hap and legend files - ch_hap_legend = BCFTOOLS_CONVERT.out.hap.join(BCFTOOLS_CONVERT.out.legend) - - - - - emit: ch_chunks = ch_chunks // channel: [ chr, val(meta), start, end, number ] - ch_hap_legend = ch_hap_legend // channel: [ chr, val(meta), hap, legend ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf b/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf index f75ae347..18ff01ad 100644 --- a/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf +++ b/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf @@ -3,6 +3,7 @@ include { BCFTOOLS_VIEW } from '../../../modules/nf-core/bcf include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main' include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2} from '../../../modules/nf-core/bcftools/index/main' include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_3} from '../../../modules/nf-core/bcftools/index/main' +include { BCFTOOLS_CONVERT } 
from '../../../modules/nf-core/bcftools/convert/main' workflow VCF_NORMALIZE_BCFTOOLS { @@ -33,7 +34,15 @@ workflow VCF_NORMALIZE_BCFTOOLS { // Join biallelic VCF and TBI ch_biallelic_vcf_tbi = BCFTOOLS_VIEW.out.vcf.join(BCFTOOLS_INDEX_2.out.tbi) + // Convert VCF to Hap and Legend files + BCFTOOLS_CONVERT(ch_biallelic_vcf_tbi, ch_fasta, []) + + // Output hap and legend files + ch_hap_legend = BCFTOOLS_CONVERT.out.hap.join(BCFTOOLS_CONVERT.out.legend) + ch_hap_legend.dump(tag:"ch_hap_legend_vcfnormalize") + emit: - vcf_tbi = ch_biallelic_vcf_tbi - versions = ch_versions // channel: [ versions.yml ] + vcf_tbi = ch_biallelic_vcf_tbi // channel: [ [id, chr], vcf, tbi ] + hap_legend = ch_hap_legend // channel: [ [id, chr] '.hap', '.legend' ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 2186b72b..196892aa 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -235,10 +235,10 @@ workflow PHASEIMPUTE { // Quilt subworkflow // Create chunks from reference VCF - MAKE_CHUNKS(ch_panel, ch_fasta) + MAKE_CHUNKS(ch_panel) // Impute BAMs with QUILT - IMPUTE_QUILT(MAKE_CHUNKS.out.ch_hap_legend, ch_input_impute, MAKE_CHUNKS.out.ch_chunks) + IMPUTE_QUILT(VCF_NORMALIZE_BCFTOOLS.out.hap_legend, ch_input_impute, MAKE_CHUNKS.out.ch_chunks) ch_versions = ch_versions.mix(IMPUTE_QUILT.out.versions) // Add to output channel From b26bc83e0a24895cda8d41e4d9dc81733c89b797 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 3 May 2024 14:43:15 +0000 Subject: [PATCH 53/58] move chunk generation to panelprep --- subworkflows/local/make_chunks/make_chunks.nf | 5 +++-- workflows/phaseimpute/main.nf | 19 +++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/subworkflows/local/make_chunks/make_chunks.nf b/subworkflows/local/make_chunks/make_chunks.nf index 64f7d93a..5c4319dc 100644 --- 
a/subworkflows/local/make_chunks/make_chunks.nf +++ b/subworkflows/local/make_chunks/make_chunks.nf @@ -12,8 +12,9 @@ workflow MAKE_CHUNKS { // Make chunks ch_vcf_csi_chr = ch_reference.map{meta, vcf, csi -> [meta, vcf, csi, meta.chr]} GLIMPSE_CHUNK(ch_vcf_csi_chr) + ch_versions = ch_versions.mix(GLIMPSE_CHUNK.out.versions) - // Rearrange chunks into channel + // Rearrange chunks into channel for QUILT ch_chunks = GLIMPSE_CHUNK.out.chunk_chr .splitText() .map { metamap, line -> @@ -23,6 +24,6 @@ workflow MAKE_CHUNKS { } emit: - ch_chunks = ch_chunks // channel: [ chr, val(meta), start, end, number ] + chunks = ch_chunks // channel: [ chr, val(meta), start, end, number ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 196892aa..5ec1953b 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -157,6 +157,10 @@ workflow PHASEIMPUTE { // Prepare posfile stitch PREPARE_POSFILE_TSV(VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites, ch_fasta) ch_versions = ch_versions.mix(PREPARE_POSFILE_TSV.out.versions) + + // Create chunks from reference VCF + MAKE_CHUNKS(ch_panel) + ch_versions = ch_versions.mix(MAKE_CHUNKS.out.versions) } if (params.step.split(',').contains("impute") || params.step.split(',').contains("all")) { @@ -232,17 +236,12 @@ workflow PHASEIMPUTE { if (params.tools.split(',').contains("quilt")) { print("Impute with QUILT") - // Quilt subworkflow - - // Create chunks from reference VCF - MAKE_CHUNKS(ch_panel) + // Impute BAMs with QUILT + IMPUTE_QUILT(VCF_NORMALIZE_BCFTOOLS.out.hap_legend, ch_input_impute, MAKE_CHUNKS.out.chunks) + ch_versions = ch_versions.mix(IMPUTE_QUILT.out.versions) - // Impute BAMs with QUILT - IMPUTE_QUILT(VCF_NORMALIZE_BCFTOOLS.out.hap_legend, ch_input_impute, MAKE_CHUNKS.out.ch_chunks) - ch_versions = ch_versions.mix(IMPUTE_QUILT.out.versions) - - // Add to output channel - ch_impute_output = 
ch_impute_output.mix(IMPUTE_QUILT.out.vcf_tbi) + // Add to output channel + ch_impute_output = ch_impute_output.mix(IMPUTE_QUILT.out.vcf_tbi) } // Concatenate by chromosomes CONCAT_IMPUT(ch_impute_output) From 4eae4469f54b7c9d9fa88e4622bd96906425afcd Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 3 May 2024 14:50:55 +0000 Subject: [PATCH 54/58] remove unused subworkflows --- subworkflows/local/get_panel/main.nf | 89 ------------------- .../local/panel_prepare_channels/main.nf | 47 ---------- 2 files changed, 136 deletions(-) delete mode 100644 subworkflows/local/get_panel/main.nf delete mode 100644 subworkflows/local/panel_prepare_channels/main.nf diff --git a/subworkflows/local/get_panel/main.nf b/subworkflows/local/get_panel/main.nf deleted file mode 100644 index d9178c12..00000000 --- a/subworkflows/local/get_panel/main.nf +++ /dev/null @@ -1,89 +0,0 @@ -include { BCFTOOLS_VIEW as VIEW_VCF_SNPS } from '../../../modules/nf-core/bcftools/view/main.nf' -include { BCFTOOLS_VIEW as VIEW_VCF_SITES } from '../../../modules/nf-core/bcftools/view/main.nf' -include { BCFTOOLS_INDEX as VCF_INDEX1 } from '../../../modules/nf-core/bcftools/index/main.nf' -include { BCFTOOLS_INDEX as VCF_INDEX3 } from '../../../modules/nf-core/bcftools/index/main.nf' -include { BCFTOOLS_INDEX as VCF_INDEX4 } from '../../../modules/nf-core/bcftools/index/main.nf' -include { BCFTOOLS_INDEX as VCF_INDEX5 } from '../../../modules/nf-core/bcftools/index/main.nf' -include { BCFTOOLS_NORM } from '../../../modules/nf-core/bcftools/norm/main.nf' -include { BCFTOOLS_QUERY } from '../../../modules/nf-core/bcftools/query/main.nf' -include { TABIX_BGZIP } from '../../../modules/nf-core/tabix/bgzip/main' -include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main' -include { VCF_PHASE_SHAPEIT5 } from '../../../subworkflows/nf-core/vcf_phase_shapeit5/main' - - -workflow GET_PANEL { - take: - ch_vcf // channel: [ [id, chr], vcf, index ] - 
ch_fasta // channel: [ [genome], fasta, fai ] - - main: - - ch_versions = Channel.empty() - - BCFTOOLS_NORM(ch_vcf, ch_fasta.map{ genome, fasta, fai -> [genome, fasta] }) - ch_versions = ch_versions.mix(BCFTOOLS_NORM.out.versions.first()) - - // Extract only the SNP - VIEW_VCF_SNPS(BCFTOOLS_NORM.out.vcf // [ meta, vcf ] - .combine(Channel.of([[]])), [], [], []) - ch_versions = ch_versions.mix(VIEW_VCF_SNPS.out.versions.first()) - - VCF_INDEX3(VIEW_VCF_SNPS.out.vcf) - ch_versions = ch_versions.mix(VCF_INDEX3.out.versions.first()) - - ch_panel_norm = VIEW_VCF_SNPS.out.vcf - .combine(VCF_INDEX3.out.csi, by:0) - - // Extract sites positions - vcf_region = VIEW_VCF_SNPS.out.vcf - .combine(VCF_INDEX3.out.csi, by:0) - VIEW_VCF_SITES( ch_panel_norm, - [], [], []) - ch_versions = ch_versions.mix(VIEW_VCF_SITES.out.versions.first()) - - VCF_INDEX4(VIEW_VCF_SITES.out.vcf) - ch_versions = ch_versions.mix(VCF_INDEX4.out.versions.first()) - - ch_panel_sites = VIEW_VCF_SITES.out.vcf - .combine(VCF_INDEX4.out.csi, by:0) - - // Convert to TSV - BCFTOOLS_QUERY(ch_panel_sites, [], [], []) - ch_versions = ch_versions.mix(BCFTOOLS_QUERY.out.versions.first()) - - TABIX_BGZIP(BCFTOOLS_QUERY.out.output) - ch_versions = ch_versions.mix(TABIX_BGZIP.out.versions.first()) - - TABIX_TABIX(TABIX_BGZIP.out.output) - ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first()) - - ch_panel_tsv = TABIX_BGZIP.out.output - .combine(TABIX_TABIX.out.tbi, by: 0) - - // Phase panel - if (params.phased == false) { - VCF_PHASE_SHAPEIT5(vcf_region - .map { meta, vcf, csi -> [meta, vcf, csi, [], meta.region] }, - Channel.of([[],[],[]]).collect(), - Channel.of([[],[],[]]).collect(), - Channel.of([[],[]]).collect()) - ch_versions = ch_versions.mix(VCF_PHASE_SHAPEIT5.out.versions) - ch_panel_phased = VCF_PHASE_SHAPEIT5.out.variants_phased - .combine(VCF_PHASE_SHAPEIT5.out.variants_index, by: 0) - } else { - ch_panel_phased = vcf_region - } - - ch_panel = ch_panel_norm - .combine(ch_panel_sites, by: 0) - 
.combine(ch_panel_tsv, by: 0) - .combine(ch_panel_phased, by: 0) - .map{ metaIC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index - -> [[panel:metaIC.id, chr:metaIC.chr ], norm, n_index, sites, s_index, tsv, t_index, phased, p_index] - } - - emit: - panel = ch_panel // channel: [ [panel, chr], norm, n_index, sites, s_index, tsv, t_index, phased, p_index] - panel_sites = ch_panel_sites - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/subworkflows/local/panel_prepare_channels/main.nf b/subworkflows/local/panel_prepare_channels/main.nf deleted file mode 100644 index 95882870..00000000 --- a/subworkflows/local/panel_prepare_channels/main.nf +++ /dev/null @@ -1,47 +0,0 @@ -include { VCF_CONCATENATE_BCFTOOLS as CONCAT_PANEL } from '../../../subworkflows/local/vcf_concatenate_bcftools' - -workflow PANEL_PREPARE_CHANNELS { - take: - ch_panel_norm // channel: [ [id, chr], vcf, index ] - ch_panel_sites - ch_panel_tsv - ch_panel_phased - - main: - - ch_versions = Channel.empty() - - ch_panel = ch_panel_norm - .combine(ch_panel_sites, by: 0) - .combine(ch_panel_tsv, by: 0) - .combine(ch_panel_phased, by: 0) - .map{ metaIC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index - -> [[panel:metaIC.id, chr:metaIC.chr ], norm, n_index, sites, s_index, tsv, t_index, phased, p_index] - } - - - ch_panel_sites_tsv = ch_panel - .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index - -> [metaPC, sites, tsv] - } - CONCAT_PANEL(ch_panel - .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index - -> [[id:metaPC.panel], sites, s_index] - } - ) - ch_panel_sites = CONCAT_PANEL.out.vcf_tbi_join - - ch_panel_phased = ch_panel_phased - .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index - -> [metaPC, phased, p_index] - } - - - emit: - panel = ch_panel - ch_panel_sites - ch_panel_phased - ch_panel_sites_tsv - - versions = ch_versions // channel: [ versions.yml ] -} From 
c066fc7472fd7e66adf53e35f42341718d69354f Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 3 May 2024 15:39:27 +0000 Subject: [PATCH 55/58] arrange directories and make uniform module names --- conf/steps/imputation_quilt.config | 4 +-- conf/steps/imputation_stitch.config | 29 ++++++++++++++++--- conf/steps/panel_prep.config | 5 ++++ .../local/impute_quilt/impute_quilt.nf | 16 +++++----- .../vcf_normalize_bcftools.nf | 1 - .../nf-core/vcf_phase_shapeit5/main.nf | 16 +++++----- 6 files changed, 48 insertions(+), 23 deletions(-) diff --git a/conf/steps/imputation_quilt.config b/conf/steps/imputation_quilt.config index 5b834c1a..7ff6a4ad 100644 --- a/conf/steps/imputation_quilt.config +++ b/conf/steps/imputation_quilt.config @@ -53,7 +53,7 @@ process { publishDir = [enabled: false] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:INDEX1' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:BCFTOOLS_INDEX_1' { ext.args = "--tbi" publishDir = [enabled: false] } @@ -63,7 +63,7 @@ process { ext.prefix = { "${meta.id}_R${meta.region.replace(':','_')}.impute.annotate" } } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:INDEX2' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:BCFTOOLS_INDEX_2' { ext.args = "--tbi" } diff --git a/conf/steps/imputation_stitch.config b/conf/steps/imputation_stitch.config index 5962e5e7..aa44c687 100644 --- a/conf/steps/imputation_stitch.config +++ b/conf/steps/imputation_stitch.config @@ -20,10 +20,17 @@ process { ] } + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_POSFILE_TSV:.*' { + publishDir = [ + path: { "${params.outdir}/prep_panel/posfile/" }, + mode: params.publish_dir_mode, + enabled: true + ] + } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_POSFILE_TSV:GAWK' { ext.args = "'{ key = \$1 FS \$2 } !seen[key]++'" - ext.prefix = { "${meta.id}_${meta.chr}_no_multiallelic" } + ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" } ext.suffix = 
".txt" } @@ -31,36 +38,50 @@ process { ext.args = '-m +any --output-type z' ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } maxRetries = 2 + publishDir = [enabled: false] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_VIEW' { ext.args = '-v snps -Oz' ext.prefix = { "${meta.id}_${meta.chr}_biallelic" } maxRetries = 2 + publishDir = [enabled: false] + } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_INDEX' { maxRetries = 2 + publishDir = [enabled: false] + } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_INDEX_2' { ext.args = '--tbi' maxRetries = 2 - } - - + publishDir = [enabled: false] + } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_POSFILE_TSV:BCFTOOLS_QUERY' { ext.args = [ "-f'%CHROM\t%POS\t%REF\t%ALT\\n'", ].join(' ') ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" } + publishDir = [enabled: false] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_STITCH:.*' { + publishDir = [ + path: { "${params.outdir}/imputation/stitch/" }, + mode: params.publish_dir_mode, + enabled: true + ] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_STITCH:BCFTOOLS_INDEX' { ext.args = '--tbi' maxRetries = 2 + publishDir = [enabled: false] } diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index 3c08653f..7497e3f0 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -111,6 +111,11 @@ process { withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_CONVERT' { ext.args = {"--haplegendsample ${meta.id}_${meta.chr}"} maxRetries = 2 + publishDir = [ + path: { "${params.outdir}/prep_panel/haplegend/" }, + mode: params.publish_dir_mode, + enabled: true + ] } // Phasing diff --git a/subworkflows/local/impute_quilt/impute_quilt.nf b/subworkflows/local/impute_quilt/impute_quilt.nf index d8231733..5bba1604 100644 --- a/subworkflows/local/impute_quilt/impute_quilt.nf +++ 
b/subworkflows/local/impute_quilt/impute_quilt.nf @@ -1,7 +1,7 @@ include { QUILT_QUILT } from '../../../modules/nf-core/quilt/quilt' include { BCFTOOLS_ANNOTATE } from '../../../modules/nf-core/bcftools/annotate' -include { BCFTOOLS_INDEX as INDEX1 } from '../../../modules/nf-core/bcftools/index' -include { BCFTOOLS_INDEX as INDEX2 } from '../../../modules/nf-core/bcftools/index' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_1 } from '../../../modules/nf-core/bcftools/index' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2 } from '../../../modules/nf-core/bcftools/index' workflow IMPUTE_QUILT { @@ -50,22 +50,22 @@ workflow IMPUTE_QUILT { ch_versions = ch_versions.mix(QUILT_QUILT.out.versions.first()) // Index imputed VCF - INDEX1(QUILT_QUILT.out.vcf) - ch_versions = ch_versions.mix(INDEX1.out.versions.first()) + BCFTOOLS_INDEX_1(QUILT_QUILT.out.vcf) + ch_versions = ch_versions.mix(BCFTOOLS_INDEX_1.out.versions.first()) // Annotate the variants BCFTOOLS_ANNOTATE(QUILT_QUILT.out.vcf - .join(INDEX1.out.tbi) + .join(BCFTOOLS_INDEX_1.out.tbi) .combine(Channel.of([[], [], [], []])) ) ch_versions = ch_versions.mix(BCFTOOLS_ANNOTATE.out.versions.first()) // Index imputed annotated VCF - INDEX2(BCFTOOLS_ANNOTATE.out.vcf) - ch_versions = ch_versions.mix(INDEX2.out.versions.first()) + BCFTOOLS_INDEX_2(BCFTOOLS_ANNOTATE.out.vcf) + ch_versions = ch_versions.mix(BCFTOOLS_INDEX_2.out.versions.first()) // Join VCFs and TBIs - ch_vcf_tbi = BCFTOOLS_ANNOTATE.out.vcf.join(INDEX2.out.tbi) + ch_vcf_tbi = BCFTOOLS_ANNOTATE.out.vcf.join(BCFTOOLS_INDEX_2.out.tbi) emit: vcf_tbi = ch_vcf_tbi // channel: [ meta, vcf, tbi ] diff --git a/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf b/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf index 18ff01ad..e96e4a02 100644 --- a/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf +++ b/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf @@ -39,7 +39,6 @@ workflow 
VCF_NORMALIZE_BCFTOOLS { // Output hap and legend files ch_hap_legend = BCFTOOLS_CONVERT.out.hap.join(BCFTOOLS_CONVERT.out.legend) - ch_hap_legend.dump(tag:"ch_hap_legend_vcfnormalize") emit: vcf_tbi = ch_biallelic_vcf_tbi // channel: [ [id, chr], vcf, tbi ] diff --git a/subworkflows/nf-core/vcf_phase_shapeit5/main.nf b/subworkflows/nf-core/vcf_phase_shapeit5/main.nf index 966f9019..51061373 100644 --- a/subworkflows/nf-core/vcf_phase_shapeit5/main.nf +++ b/subworkflows/nf-core/vcf_phase_shapeit5/main.nf @@ -1,8 +1,8 @@ include { BEDTOOLS_MAKEWINDOWS } from '../../../modules/nf-core/bedtools/makewindows/main.nf' include { SHAPEIT5_PHASECOMMON } from '../../../modules/nf-core/shapeit5/phasecommon/main' include { SHAPEIT5_LIGATE } from '../../../modules/nf-core/shapeit5/ligate/main' -include { BCFTOOLS_INDEX as VCF_INDEX1 } from '../../../modules/nf-core/bcftools/index/main.nf' -include { BCFTOOLS_INDEX as VCF_INDEX2 } from '../../../modules/nf-core/bcftools/index/main.nf' +include { BCFTOOLS_INDEX as VCF_BCFTOOLS_INDEX_1 } from '../../../modules/nf-core/bcftools/index/main.nf' +include { BCFTOOLS_INDEX as VCF_BCFTOOLS_INDEX_2 } from '../../../modules/nf-core/bcftools/index/main.nf' workflow VCF_PHASE_SHAPEIT5 { @@ -61,11 +61,11 @@ workflow VCF_PHASE_SHAPEIT5 { ch_map ) ch_versions = ch_versions.mix(SHAPEIT5_PHASECOMMON.out.versions.first()) - VCF_INDEX1(SHAPEIT5_PHASECOMMON.out.phased_variant) - ch_versions = ch_versions.mix(VCF_INDEX1.out.versions.first()) + VCF_BCFTOOLS_INDEX_1(SHAPEIT5_PHASECOMMON.out.phased_variant) + ch_versions = ch_versions.mix(VCF_BCFTOOLS_INDEX_1.out.versions.first()) ch_ligate_input = SHAPEIT5_PHASECOMMON.out.phased_variant - .join(VCF_INDEX1.out.csi, failOnMismatch:true, failOnDuplicate:true) + .join(VCF_BCFTOOLS_INDEX_1.out.csi, failOnMismatch:true, failOnDuplicate:true) .map{ meta, vcf, csi -> newmeta = meta + [id: meta.id.split("_")[0..-2].join("_")] [newmeta, vcf, csi]} @@ -84,13 +84,13 @@ workflow VCF_PHASE_SHAPEIT5 { 
SHAPEIT5_LIGATE(ch_ligate_input) ch_versions = ch_versions.mix(SHAPEIT5_LIGATE.out.versions.first()) - VCF_INDEX2(SHAPEIT5_LIGATE.out.merged_variants) - ch_versions = ch_versions.mix(VCF_INDEX2.out.versions.first()) + VCF_BCFTOOLS_INDEX_2(SHAPEIT5_LIGATE.out.merged_variants) + ch_versions = ch_versions.mix(VCF_BCFTOOLS_INDEX_2.out.versions.first()) emit: bed = BEDTOOLS_MAKEWINDOWS.out.bed // channel: [ val(meta), bed ] variants_phased = SHAPEIT5_LIGATE.out.merged_variants // channel: [ val(meta), vcf ] - variants_index = VCF_INDEX2.out.csi // channel: [ val(meta), csi ] + variants_index = VCF_BCFTOOLS_INDEX_2.out.csi // channel: [ val(meta), csi ] versions = ch_versions // channel: [ versions.yml ] } From 44d6633ef6551a892c5e4fef04df9e961ba0510a Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 3 May 2024 16:16:42 +0000 Subject: [PATCH 56/58] separate concatenation per tool --- conf/steps/imputation_glimpse1.config | 22 ++++++++++++++++++++ conf/steps/imputation_quilt.config | 22 ++++++++++++++++++++ conf/steps/imputation_stitch.config | 22 ++++++++++++++++++++ workflows/phaseimpute/main.nf | 30 ++++++++++++++++++++++----- 4 files changed, 91 insertions(+), 5 deletions(-) diff --git a/conf/steps/imputation_glimpse1.config b/conf/steps/imputation_glimpse1.config index 5b3d89cd..521c8beb 100644 --- a/conf/steps/imputation_glimpse1.config +++ b/conf/steps/imputation_glimpse1.config @@ -81,4 +81,26 @@ process { path: { "${params.outdir}/imputation/glimpse1" } ] } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_GLIMPSE1:.*' { + publishDir = [ + [ + path: { "${params.outdir}/imputation/glimpse1/concat" }, + mode: params.publish_dir_mode, + ], + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_GLIMPSE1:BCFTOOLS_CONCAT' { + ext.args = {[ + "--ligate", + "--output-type z", + ].join(" ").trim()} + ext.prefix = { "${meta.id}_glimpse1" } + } + + withName: 
'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_GLIMPSE1:BCFTOOLS_INDEX' { + ext.args = "--tbi" + ext.prefix = { "${meta.id}_glimpse1" } + } } diff --git a/conf/steps/imputation_quilt.config b/conf/steps/imputation_quilt.config index 7ff6a4ad..fbb8dcc4 100644 --- a/conf/steps/imputation_quilt.config +++ b/conf/steps/imputation_quilt.config @@ -67,4 +67,26 @@ process { ext.args = "--tbi" } + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_QUILT:.*' { + publishDir = [ + [ + path: { "${params.outdir}/imputation/quilt/concat" }, + mode: params.publish_dir_mode, + ], + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_QUILT:BCFTOOLS_CONCAT' { + ext.args = {[ + "--ligate", + "--output-type z", + ].join(" ").trim()} + ext.prefix = { "${meta.id}_quilt" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_QUILT:BCFTOOLS_INDEX' { + ext.args = "--tbi" + ext.prefix = { "${meta.id}_quilt" } + } + } diff --git a/conf/steps/imputation_stitch.config b/conf/steps/imputation_stitch.config index aa44c687..920255e0 100644 --- a/conf/steps/imputation_stitch.config +++ b/conf/steps/imputation_stitch.config @@ -84,6 +84,28 @@ process { publishDir = [enabled: false] } + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_STITCH:.*' { + publishDir = [ + [ + path: { "${params.outdir}/imputation/stitch/concat" }, + mode: params.publish_dir_mode, + ], + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_STITCH:BCFTOOLS_CONCAT' { + ext.args = {[ + "--ligate", + "--output-type z", + ].join(" ").trim()} + ext.prefix = { "${meta.id}_stitch" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_STITCH:BCFTOOLS_INDEX' { + ext.args = "--tbi" + ext.prefix = { "${meta.id}_stitch" } + } + } diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 5ec1953b..ab9cc73d 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -33,17 +33,19 @@ include { PREPARE_POSFILE_TSV } from '../../subworkflows/ include { VCF_IMPUTE_GLIMPSE as 
VCF_IMPUTE_GLIMPSE1 } from '../../subworkflows/nf-core/vcf_impute_glimpse' include { COMPUTE_GL as GL_TRUTH } from '../../subworkflows/local/compute_gl' include { COMPUTE_GL as GL_INPUT } from '../../subworkflows/local/compute_gl' +include { VCF_CONCATENATE_BCFTOOLS as CONCAT_GLIMPSE1} from '../../subworkflows/local/vcf_concatenate_bcftools' // QUILT subworkflows include { MAKE_CHUNKS } from '../../subworkflows/local/make_chunks/make_chunks' include { IMPUTE_QUILT } from '../../subworkflows/local/impute_quilt/impute_quilt' +include { VCF_CONCATENATE_BCFTOOLS as CONCAT_QUILT } from '../../subworkflows/local/vcf_concatenate_bcftools' // STITCH subworkflows include { PREPARE_INPUT_STITCH } from '../../subworkflows/local/prepare_input_stitch/prepare_input_stitch' include { BAM_IMPUTE_STITCH } from '../../subworkflows/local/bam_impute_stitch/bam_impute_stitch' +include { VCF_CONCATENATE_BCFTOOLS as CONCAT_STITCH } from '../../subworkflows/local/vcf_concatenate_bcftools' // CONCAT subworkflows -include { VCF_CONCATENATE_BCFTOOLS as CONCAT_IMPUT } from '../../subworkflows/local/vcf_concatenate_bcftools' include { VCF_CONCATENATE_BCFTOOLS as CONCAT_TRUTH } from '../../subworkflows/local/vcf_concatenate_bcftools' include { VCF_CONCATENATE_BCFTOOLS as CONCAT_PANEL } from '../../subworkflows/local/vcf_concatenate_bcftools' @@ -201,6 +203,14 @@ workflow PHASEIMPUTE { // Add to output channel ch_impute_output = ch_impute_output.mix(output_glimpse1) + + // Concatenate by chromosomes + CONCAT_GLIMPSE1(output_glimpse1) + ch_versions = ch_versions.mix(CONCAT_GLIMPSE1.out.versions) + + // Add results to input validate + ch_input_validate = ch_input_validate.mix(CONCAT_GLIMPSE1.out.vcf_tbi_join) + } if (params.tools.split(',').contains("glimpse2")) { error "Glimpse2 not yet implemented" @@ -231,6 +241,13 @@ workflow PHASEIMPUTE { // Output channel to concat ch_impute_output = ch_impute_output.mix(BAM_IMPUTE_STITCH.out.vcf_tbi) + // Concatenate by chromosomes + 
CONCAT_STITCH(BAM_IMPUTE_STITCH.out.vcf_tbi) + ch_versions = ch_versions.mix(CONCAT_STITCH.out.versions) + + // Add results to input validate + ch_input_validate = ch_input_validate.mix(CONCAT_STITCH.out.vcf_tbi_join) + } if (params.tools.split(',').contains("quilt")) { @@ -242,11 +259,14 @@ workflow PHASEIMPUTE { // Add to output channel ch_impute_output = ch_impute_output.mix(IMPUTE_QUILT.out.vcf_tbi) + + // Concatenate by chromosomes + CONCAT_QUILT(IMPUTE_QUILT.out.vcf_tbi) + ch_versions = ch_versions.mix(CONCAT_QUILT.out.versions) + + // Add results to input validate + ch_input_validate = ch_input_validate.mix(CONCAT_QUILT.out.vcf_tbi_join) } - // Concatenate by chromosomes - CONCAT_IMPUT(ch_impute_output) - ch_versions = ch_versions.mix(CONCAT_IMPUT.out.versions) - ch_input_validate = ch_input_validate.mix(CONCAT_IMPUT.out.vcf_tbi_join) } if (params.step.split(',').contains("validate") || params.step.split(',').contains("all")) { From b1e224dfffd333d46c418ef84a54afa869ed6c1f Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 3 May 2024 16:27:27 +0000 Subject: [PATCH 57/58] unicode issue --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index cbf73d2a..f10ee0c4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -142,7 +142,7 @@ }, "min_val_dp": { "type": "integer", - "description": "Minimum coverage in validation data. If FORMAT/DP is missing and \u2013min_val_dp > 0, the program exits with an error. Set to zero to have no filter of if using \u2013gt-validation", + "description": "Minimum coverage in validation data. If FORMAT/DP is missing and -min_val_dp > 0, the program exits with an error. 
Set to zero to have no filter or if using \u2013gt-validation", "default": 5, "pattern": "^\\d+$" } From 7a6c61dbb61d0b509c9f7872c4f31412ebe9ad2c Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Mon, 6 May 2024 02:02:47 +0000 Subject: [PATCH 58/58] fix linting and pre-commit --- .nf-core.yml | 2 ++ docs/usage.md | 1 + 2 files changed, 3 insertions(+) diff --git a/.nf-core.yml b/.nf-core.yml index 3805dc81..88bcff36 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1 +1,3 @@ repository_type: pipeline +lint: + subworkflow_changes: false diff --git a/docs/usage.md b/docs/usage.md index 95bf802f..f5edb39f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -150,6 +150,7 @@ Otherwise, you can provide your own position file in the `--mode impute` with ST ```bash nextflow run nf-core/phaseimpute --input samplesheet.csv --step impute --posfile samplesheet_posfile.csv --tool stitch --outdir results --genome GRCh37 -profile docker ``` + The csv provided in `--posfile` must contain two columns [chr, file]. The first column is the chromosome and the file column are tsvs with the list of positions, unique to each chromosome. ```console