diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4636b9f1..8780f435 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,6 +31,7 @@ jobs: - "test" - "test_sim" - "test_quilt" + - "test_stitch" steps: - name: Check out pipeline code uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 diff --git a/.nf-core.yml b/.nf-core.yml index 3805dc81..88bcff36 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1 +1,3 @@ repository_type: pipeline +lint: + subworkflow_changes: false diff --git a/assets/schema_posfile.json b/assets/schema_posfile.json new file mode 100644 index 00000000..5a3c9d5a --- /dev/null +++ b/assets/schema_posfile.json @@ -0,0 +1,24 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/assets/schema_posfile.json", + "title": "nf-core/phaseimpute pipeline - params.posfile schema", + "description": "Schema for the file provided with params.posfile", + "type": "array", + "items": { + "type": "object", + "properties": { + "chr": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Chromosome name must be provided as a string and cannot contain spaces", + "meta": ["chr"] + }, + "file": { + "type": "string", + "pattern": "^\\S+\\.txt$", + "errorMessage": "Posfile per chromosome must be provided. Must have .txt extension" + } + }, + "required": ["chr", "file"] + } +} diff --git a/conf/steps/imputation_glimpse1.config b/conf/steps/imputation_glimpse1.config index 5b3d89cd..521c8beb 100644 --- a/conf/steps/imputation_glimpse1.config +++ b/conf/steps/imputation_glimpse1.config @@ -81,4 +81,26 @@ process { path: { "${params.outdir}/imputation/glimpse1" } ] } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_GLIMPSE1:.*' { + publishDir = [ + [ + path: { "${params.outdir}/imputation/glimpse1/concat" }, + mode: params.publish_dir_mode, + ], + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_GLIMPSE1:BCFTOOLS_CONCAT' { + ext.args = {[ + "--ligate", + "--output-type z", + ].join(" ").trim()} + ext.prefix = { "${meta.id}_glimpse1" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_GLIMPSE1:BCFTOOLS_INDEX' { + ext.args = "--tbi" + ext.prefix = { "${meta.id}_glimpse1" } + } } diff --git a/conf/steps/imputation_quilt.config b/conf/steps/imputation_quilt.config index f75c777e..fbb8dcc4 100644 --- a/conf/steps/imputation_quilt.config +++ b/conf/steps/imputation_quilt.config @@ -39,50 +39,6 @@ process { ext.prefix = { "${meta.id}_${meta.chr}" } } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_INDEX' { - cpus = 2 - memory = 400.MB - maxRetries = 2 - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_INDEX_2' { - ext.args = '--tbi' - cpus = 2 - memory = 400.MB - maxRetries = 2 - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_INDEX_3' { - ext.args = '--tbi' - cpus = 2 - memory = 400.MB - maxRetries = 2 - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_VIEW' { - ext.args = '-v snps -Oz' - ext.prefix = { "${meta.id}_${meta.chr}_biallelic" } - cpus = 2 - memory = 400.MB - maxRetries = 2 - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_NORM' { - ext.args = '-m +any --output-type z' - ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } - cpus = 2 - memory = 400.MB - maxRetries = 2 - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:BCFTOOLS_CONVERT' { - ext.args = '--haplegendsample test' - ext.prefix = { 
"${meta.id}_${meta.chr}_convert" } - cpus = 2 - memory = 400.MB - maxRetries = 2 - } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:.*' { publishDir = [ [ @@ -97,7 +53,7 @@ process { publishDir = [enabled: false] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:INDEX1' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:BCFTOOLS_INDEX_1' { ext.args = "--tbi" publishDir = [enabled: false] } @@ -107,8 +63,30 @@ process { ext.prefix = { "${meta.id}_R${meta.region.replace(':','_')}.impute.annotate" } } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:INDEX2' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:BCFTOOLS_INDEX_2' { + ext.args = "--tbi" + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_QUILT:.*' { + publishDir = [ + [ + path: { "${params.outdir}/imputation/quilt/concat" }, + mode: params.publish_dir_mode, + ], + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_QUILT:BCFTOOLS_CONCAT' { + ext.args = {[ + "--ligate", + "--output-type z", + ].join(" ").trim()} + ext.prefix = { "${meta.id}_quilt" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_QUILT:BCFTOOLS_INDEX' { ext.args = "--tbi" + ext.prefix = { "${meta.id}_quilt" } } } diff --git a/conf/steps/imputation_stitch.config b/conf/steps/imputation_stitch.config new file mode 100644 index 00000000..920255e0 --- /dev/null +++ b/conf/steps/imputation_stitch.config @@ -0,0 +1,111 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. 
+---------------------------------------------------------------------------------------- +*/ + +process { + + withName: CUSTOM_DUMPSOFTWAREVERSIONS { + publishDir = [ + path: { "${params.outdir}/pipeline_info" }, + mode: params.publish_dir_mode, + pattern: '*_versions.yml' + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_POSFILE_TSV:.*' { + publishDir = [ + path: { "${params.outdir}/prep_panel/posfile/" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_POSFILE_TSV:GAWK' { + ext.args = "'{ key = \$1 FS \$2 } !seen[key]++'" + ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" } + ext.suffix = ".txt" + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_NORM' { + ext.args = '-m +any --output-type z' + ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } + maxRetries = 2 + publishDir = [enabled: false] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_VIEW' { + ext.args = '-v snps -Oz' + ext.prefix = { "${meta.id}_${meta.chr}_biallelic" } + maxRetries = 2 + publishDir = [enabled: false] + + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_INDEX' { + maxRetries = 2 + publishDir = [enabled: false] + + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_INPUT_STITCH:BCFTOOLS_INDEX_2' { + ext.args = '--tbi' + maxRetries = 2 + publishDir = [enabled: false] + + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:PREPARE_POSFILE_TSV:BCFTOOLS_QUERY' { + ext.args = [ + "-f'%CHROM\t%POS\t%REF\t%ALT\\n'", + ].join(' ') + ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" } + publishDir = [enabled: false] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_STITCH:.*' { + publishDir = [ + path: { "${params.outdir}/imputation/stitch/" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_STITCH:BCFTOOLS_INDEX' { + ext.args = '--tbi' + maxRetries = 2 + publishDir = [enabled: false] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_STITCH:.*' { + publishDir = [ + [ + path: { "${params.outdir}/imputation/stitch/concat" }, + mode: params.publish_dir_mode, + ], + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_STITCH:BCFTOOLS_CONCAT' { + ext.args = {[ + "--ligate", + "--output-type z", + ].join(" ").trim()} + ext.prefix = { "${meta.id}_stitch" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_STITCH:BCFTOOLS_INDEX' { + ext.args = "--tbi" + ext.prefix = { "${meta.id}_stitch" } + } + + + +} diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index 5eec78ce..7497e3f0 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -25,27 +25,35 @@ process { "--no-version" ].join(' ') ext.prefix = { "${meta.id}_chrrename" } + publishDir = [ enabled: false ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:.*' { - publishDir = [ - path: { "${params.outdir}/prep_panel/" }, - mode: params.publish_dir_mode, - enabled: true, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_NORM' { + ext.args = '-m +any --no-version --output-type z' + ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } + maxRetries = 2 + publishDir = [ enabled: false ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:BCFTOOLS_NORM' { - ext.args = [ - "-m", - "-any", - "--no-version" - ].join(' ') - ext.prefix = { "${meta.id}_norm" } + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX' { + ext.args = "--tbi" + publishDir = [enabled: false] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_VIEW' { + ext.args = '-v snps -Oz' + ext.prefix = { "${meta.id}_${meta.chr}_biallelic" } + maxRetries = 2 + publishDir = [ enabled: false ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:VIEW_VCF_SNPS' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX_2' { + ext.args = '--tbi' + maxRetries = 2 + publishDir = [ enabled: false ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:VIEW_VCF_SNPS' { ext.args = [ "-m 2", "-M 2", @@ -54,9 +62,10 @@ process { "--no-version" ].join(' ') ext.prefix = { "${meta.id}_SNPS" } + publishDir = [ enabled: false ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:VIEW_VCF_SITES' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:VIEW_VCF_SITES' { ext.args = [ "-G", "-m 2", @@ -66,52 +75,78 @@ process { "--no-version" ].join(' ') ext.prefix = { "${meta.id}_C${meta.chr}_SITES" } + publishDir = [ + path: { "${params.outdir}/prep_panel/sites/vcf/" }, + mode: params.publish_dir_mode, + enabled: true + ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:BCFTOOLS_QUERY' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:BCFTOOLS_QUERY' { ext.args = [ "-f'%CHROM\t%POS\t%REF,%ALT\\n'", ].join(' ') - ext.prefix = { "${meta.id}_SITES_TSV" } + ext.prefix = { "${meta.id}_glimpse_SITES_TSV" } + publishDir = [ + path: { "${params.outdir}/prep_panel/sites/tsv/" }, + mode: params.publish_dir_mode, + enabled: true + ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:TABIX_TABIX' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:TABIX_TABIX' { ext.args = [ "-s1", "-b2", "-e2" ].join(' ') - ext.prefix = { "${meta.id}_SITES_TSV" } + ext.prefix = { "${meta.id}_glimpse_SITES_TSV" } + publishDir = [ + path: { "${params.outdir}/prep_panel/sites/tsv/" }, + mode: params.publish_dir_mode, + enabled: true + ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:GET_PANEL:VCF_PHASE_SHAPEIT5:BEDTOOLS_MAKEWINDOWS' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_CONVERT' { + ext.args = {"--haplegendsample ${meta.id}_${meta.chr}"} + maxRetries = 2 + publishDir = [ + path: { "${params.outdir}/prep_panel/haplegend/" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + + // Phasing + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_PANEL:VCF_PHASE_SHAPEIT5:BEDTOOLS_MAKEWINDOWS' { ext.args = [ '-w 60000', '-s 40000' ].join(' ') ext.prefix = { "${meta.id}_chunks" } - publishDir = [ - enabled: false - ] + publishDir = [ enabled: false ] + } + + // TSV + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:BCFTOOLS_QUERY_STITCH' { + ext.args = [ + "-f'%CHROM\t%POS\t%REF\t%ALT\\n'", + ].join(' ') + ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" } + publishDir = [ enabled: false ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_PANEL:.*' { 
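+    // The gawk one-liner below keeps only the first line seen for each
+    // (CHROM, POS) pair, dropping duplicated positions so that STITCH
+    // receives strictly bi-allelic sites.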
+    withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:GAWK_STITCH' {
+        ext.args   = "'{ key = \$1 FS \$2 } !seen[key]++'"
+        ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" }
+        ext.suffix = "txt"
        publishDir = [
-            path: { "${params.outdir}/prep_panel/concat" },
+            path: { "${params.outdir}/prep_panel/sites/tsv/" },
            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            enabled: true
        ]
-        ext.prefix = { "${meta.id}_sites_concat" }
    }

-    withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_PANEL:BCFTOOLS_CONCAT' {
-        ext.args = {[
-            "--ligate",
-            "--output-type z",
-        ].join(" ").trim()}
-    }
-
-    withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_PANEL:BCFTOOLS_INDEX' {
-        ext.args = "--tbi"
-    }
}
diff --git a/conf/test.config b/conf/test.config
index 525d6fe0..c754b84e 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -29,6 +29,6 @@ params {
    phased = true

    // Impute parameters
-    step = "impute"
+    step = "panelprep,impute"
    tools = "glimpse1"
}
diff --git a/conf/test_quilt.config b/conf/test_quilt.config
index a7d04a00..27d31445 100644
--- a/conf/test_quilt.config
+++ b/conf/test_quilt.config
@@ -29,6 +29,6 @@ params {
    phased = true

    // Impute parameters
-    step = "impute"
+    step = "panelprep,impute"
    tools = "quilt"
}
diff --git a/conf/test_stitch.config b/conf/test_stitch.config
new file mode 100644
index 00000000..11508421
--- /dev/null
+++ b/conf/test_stitch.config
@@ -0,0 +1,33 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/phaseimpute -profile test_stitch,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Minimal Stitch Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function using the tool STITCH'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '2.GB'
+    max_time   = '1.h'
+
+    // Input data
+    input        = "${projectDir}/tests/csv/sample_bam.csv"
+    input_region = "${projectDir}/tests/csv/region.csv"
+
+    // Genome references
+    fasta   = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/reference_genome/21_22/hs38DH.chr21_22.fa"
+    posfile = "${projectDir}/tests/csv/posfile.csv"
+
+    // Impute parameters
+    step  = "impute"
+    tools = "stitch"
+}
diff --git a/docs/output.md b/docs/output.md
index 7c589a4a..97d7d4d7 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -2,17 +2,15 @@

## Introduction

-## Introduction
-
This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline.

The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.

-The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
-
-## Pipeline overview: QUILT imputation mode
+## Pipeline overview
+
+## QUILT imputation mode

The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

@@ -21,20 +19,17 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [Convert](#convert) - Convert reference panel to .hap and .legend files
- [QUILT](#quilt) - Perform imputation
- [Concatenate](#concatenate) - Concatenate all imputed chunks into a single VCF.
-- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
-- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
-- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution

### Glimpse Chunk

-- `quilt_impute/glimpse/`
+- `imputation/glimpse_chunk/`

  - `*.txt`: TXT file containing the chunks obtained from running Glimpse chunks.

[Glimpse chunk](https://odelaneau.github.io/GLIMPSE/) defines chunks where to run imputation. For further reading and documentation see the [Glimpse documentation](https://odelaneau.github.io/GLIMPSE/glimpse1/commands.html). Once you have generated the chunks for your reference panel, you can skip the reference preparation step and directly submit this file for imputation.

### Convert

-- `quilt_impute/bcftools/convert/`
+- `imputation/bcftools/convert/`

  - `*.hap`: a .hap file for the reference panel.
  - `*.legend*`: a .legend file for the reference panel.

@@ -42,7 +37,7 @@

### QUILT

-- `quilt_impute/quilt/`
+- `imputation/quilt/`

  - `quilt.*.vcf.gz`: Imputed VCF for a specific chunk.
  - `quilt.*.vcf.gz.tbi`: TBI for the Imputed VCF for a specific chunk.

@@ -50,11 +45,33 @@

### Concat

-- `quilt_impute/bcftools/concat`
+- `imputation/bcftools/concat`

  - `.*.vcf.gz`: Imputed and ligated VCF for all the input samples.

[bcftools concat](https://samtools.github.io/bcftools/bcftools.html) will produce a single VCF from a list of imputed VCFs in chunks.

+## STITCH imputation mode
+
+The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
+
+- [Remove Multiallelics](#multiallelics) - Remove multiallelic sites
+- [STITCH](#stitch) - Perform imputation
+- [Concatenate](#concatenate) - Concatenate all imputed chunks into a single VCF
+
+### Concat
+
+- `imputation/bcftools/concat`
+  - `.*.vcf.gz`: Imputed and concatenated VCF for all the input samples.
+
+[bcftools concat](https://samtools.github.io/bcftools/bcftools.html) will produce a single VCF from a list of imputed VCFs.
+
+## Reports
+
+Reports contain useful metrics and pipeline information for the different modes.
+
+- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
+- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
+
### MultiQC
diff --git a/docs/usage.md b/docs/usage.md
index 1fd90ac8..f5edb39f 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -4,20 +4,12 @@

> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._

-## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/phaseimpute/usage](https://nf-co.re/phaseimpute/usage)
-
-> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._
-
## Introduction

## Samplesheet input

-
-
-## Samplesheet input
-
You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.

```bash
@@ -133,16 +125,66 @@ genome: 'GRCh37'

You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).

-### Imputation modes
+### Imputation tools

`--step impute --tools [glimpse1, quilt, stitch]`

-You can choose different software to perform the imputation.
+You can choose different software to perform the imputation. In the following sections, the typical commands for running the pipeline with each software are included.

#### QUILT

-The typical command for running the pipeline with this software is as follows:
+```bash
+nextflow run nf-core/phaseimpute --input samplesheet.csv --panel samplesheet_reference.csv --step impute --tools quilt --outdir results --genome GRCh37 -profile docker
+```
+
+#### STITCH
+
+[STITCH](https://github.com/rwdavies/STITCH) is an R program for low coverage sequencing genotype imputation without using a reference panel. The required inputs for this program are the BAM samples provided in the input samplesheet (`--input`) and a TSV file with the list of positions to genotype (`--posfile`).
+
+If you do not have a list of positions to genotype, you can provide a reference panel and run the `--step panelprep` step, which produces a TSV with this list.
+
+```bash
+nextflow run nf-core/phaseimpute --input samplesheet.csv --step panelprep --panel samplesheet_reference.csv --outdir results --genome GRCh37 -profile docker
+```
+
+Otherwise, you can provide your own position file in the `--step impute` mode with STITCH using the `--posfile` parameter.
+
+```bash
+nextflow run nf-core/phaseimpute --input samplesheet.csv --step impute --posfile samplesheet_posfile.csv --tools stitch --outdir results --genome GRCh37 -profile docker
+```
+
+The CSV provided in `--posfile` must contain two columns [chr, file]. The first column is the chromosome and the file column is a TSV with the list of positions, one file per chromosome.
+
+```console
+chr,file
+chr1,posfile_chr1.txt
+chr2,posfile_chr2.txt
+chr3,posfile_chr3.txt
+```
+
+The file column should contain a TSV with the following structure, from the STITCH documentation: "File is tab separated with no header, one row per SNP, with col 1 = chromosome, col 2 = physical position (sorted from smallest to largest), col 3 = reference base, col 4 = alternate base. Bases are capitalized. STITCH only handles bi-allelic SNPs" [STITCH](https://github.com/rwdavies/STITCH/blob/master/Options.md).
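+
+For reference, the `--step panelprep` step derives this file from the reference panel sites with a `bcftools query` call followed by a small `gawk` de-duplication (see `conf/steps/panel_prep.config`). A minimal sketch of the equivalent shell commands is shown here — the file names are illustrative placeholders, not pipeline outputs — and the expected file content itself is shown in the example right after.
+
+```bash
+# Sketch only: extract one line per bi-allelic SNP from a sites VCF,
+# keeping the first record seen for each (CHROM, POS) pair
+bcftools query -f'%CHROM\t%POS\t%REF\t%ALT\n' panel_chr22_sites.vcf.gz \
+    | gawk '{ key = $1 FS $2 } !seen[key]++' \
+    > posfile_chr22.txt
+```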
+
+As an example, a TSV file for chr22:
+
+```console
+chr22 16570065 A G
+chr22 16570067 A C
+chr22 16570176 C A
+chr22 16570211 T C
+```
+
+#### GLIMPSE1

```bash
-nextflow run nf-core/phaseimpute --input ./samplesheet.csv --panel ./samplesheet_reference.csv --step impute --tool quilt --outdir ./results --genome GRCh37 -profile docker
+nextflow run nf-core/phaseimpute --input samplesheet.csv --panel samplesheet_reference.csv --step impute --tools glimpse1 --outdir results --genome GRCh37 -profile docker
```

### Updating the pipeline
diff --git a/main.nf b/main.nf
index 319c2f5c..1e1c3dd7 100644
--- a/main.nf
+++ b/main.nf
@@ -40,6 +40,7 @@ workflow NFCORE_PHASEIMPUTE {
        ch_regions  // channel: regions to use [[chr, region], region]
        ch_depth    // channel: depth of coverage file [[depth], depth]
        ch_map      // channel: map file for imputation
+        ch_posfile  // channel: samplesheet read in from --posfile
        ch_versions // channel: versions of software used

    main:
@@ -52,15 +53,15 @@ workflow NFCORE_PHASEIMPUTE {
    input_simulate = Channel.empty()
    input_validate = Channel.empty()

-    if (params.step == "impute") {
+    if (params.step.split(',').contains("impute")) {
        input_impute = ch_input
            .combine(ch_regions)
            .map { metaI, file, index, metaCR, region ->
                [ metaI+metaCR, file, index ]
            }
-    } else if (params.step == "simulate" || params.step == "all") {
+    } else if (params.step.split(',').contains("simulate") || params.step.split(',').contains("all")) {
        input_simulate = ch_input
-    } else if (params.step == "validate") {
+    } else if (params.step.split(',').contains("validate")) {
        input_validate = ch_input
            .combine(ch_regions)
            .map { metaI, file, index, metaCR, region ->
@@ -86,6 +87,7 @@ workflow NFCORE_PHASEIMPUTE {
        ch_regions,
        ch_depth,
        ch_map,
+        ch_posfile,
        ch_versions
    )

@@ -128,6 +130,7 @@ workflow {
        PIPELINE_INITIALISATION.out.regions,
        PIPELINE_INITIALISATION.out.depth,
        PIPELINE_INITIALISATION.out.map,
+        PIPELINE_INITIALISATION.out.posfile,
        PIPELINE_INITIALISATION.out.versions
    )

diff --git a/modules.json b/modules.json
index 02349dbf..53279d66 100644
--- a/modules.json
+++ b/modules.json
@@ -156,6 +156,12 @@
                "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
                "installed_by": ["vcf_phase_shapeit5"]
            },
+            "stitch": {
+                "branch": "master",
+                "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                "installed_by": ["modules"],
+                "patch": "modules/nf-core/stitch/stitch.diff"
+            },
            "tabix/bgzip": {
                "branch": "master",
                "git_sha": "09d3c8c29b31a2dfd610305b10550f0e1dbcd4a9",
diff --git a/modules/nf-core/stitch/environment.yml b/modules/nf-core/stitch/environment.yml
new file mode 100644
index 00000000..3facc1bc
--- /dev/null
+++ b/modules/nf-core/stitch/environment.yml
@@ -0,0 +1,7 @@
+name: stitch
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::r-stitch=1.6.10
diff --git a/modules/nf-core/stitch/main.nf b/modules/nf-core/stitch/main.nf
new file mode 100644
index 00000000..0f8d8109
--- /dev/null
+++ b/modules/nf-core/stitch/main.nf
@@ -0,0 +1,86 @@
+process STITCH {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/r-stitch:1.6.10--r43h06b5641_0': + 'biocontainers/r-stitch:1.6.10--r43h06b5641_0' }" + + input: + tuple val(meta), path(collected_crams), path(collected_crais), path(cramlist) + tuple val(meta2), path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen) + tuple val(meta3), path(fasta), path(fasta_fai) + val seed + + output: + tuple val(meta), path("input", type: "dir") , emit: input + tuple val(meta), path("RData", type: "dir") , emit: rdata + tuple val(meta), path("plots", type: "dir") , emit: plots , optional: { generate_input_only } + tuple val(meta), path("*.vcf.gz") , emit: vcf , optional: { generate_input_only || bgen_output } + tuple val(meta), path("*.bgen") , emit: bgen , optional: { generate_input_only || !bgen_output } + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def args2 = task.ext.args2 ?: "" + def generate_input_only = args2.contains( "--generateInputOnly TRUE" ) + def bgen_output = args2.contains( "--output_format bgen" ) + def reads_ext = collected_crams ? collected_crams.extension.unique() : [] + def rsync_cmd = rdata ? "rsync -rL ${rdata}/ RData" : "" + def stitch_cmd = seed ? "Rscript <(cat \$(which STITCH.R) | tail -n +2 | cat <(echo 'set.seed(${seed})') -)" : "STITCH.R" + def cramlist_cmd = cramlist && reads_ext == ["cram"] ? "--cramlist ${cramlist}" : "" + def bamlist_cmd = cramlist && reads_ext == ["bam" ] ? "--bamlist ${cramlist}" : "" + def reference_cmd = fasta ? "--reference ${fasta}" : "" + def regenerate_input_cmd = input && rdata && !cramlist ? "--regenerateInput FALSE --originalRegionName ${chromosome_name}" : "" + def rsync_version_cmd = rdata ? "rsync: \$(rsync --version | head -n1 | sed 's/^rsync version //; s/ .*\$//')" : "" + """ + ${rsync_cmd} ${args} + + ${stitch_cmd} \\ + --chr ${chromosome_name} \\ + --posfile ${posfile} \\ + --outputdir . \\ + --nCores ${task.cpus} \\ + --K ${K} \\ + --nGen ${nGen} \\ + ${cramlist_cmd} \\ + ${bamlist_cmd} \\ + ${reference_cmd} \\ + ${regenerate_input_cmd} \\ + ${args2} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ${rsync_version_cmd} + r-base: \$(Rscript -e "cat(strsplit(R.version[['version.string']], ' ')[[1]][3])") + r-stitch: \$(Rscript -e "cat(as.character(utils::packageVersion('STITCH')))") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def args2 = task.ext.args2 ?: "" + def generate_input_only = args2.contains( "--generateInputOnly TRUE" ) + def generate_plots_cmd = !generate_input_only ? "mkdir plots" : "" + def generate_vcf_cmd = !generate_input_only ? "touch ${prefix}.vcf.gz" : "" + def rsync_version_cmd = rdata ? 
"rsync: \$(rsync --version | head -n1 | sed 's/^rsync version //; s/ .*\$//')" : "" + """ + touch input + touch RData + ${generate_plots_cmd} + ${generate_vcf_cmd} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ${rsync_version_cmd} + r-base: \$(Rscript -e "cat(strsplit(R.version[['version.string']], ' ')[[1]][3])") + r-stitch: \$(Rscript -e "cat(as.character(utils::packageVersion('STITCH')))") + END_VERSIONS + """ +} diff --git a/modules/nf-core/stitch/meta.yml b/modules/nf-core/stitch/meta.yml new file mode 100644 index 00000000..a36d61cd --- /dev/null +++ b/modules/nf-core/stitch/meta.yml @@ -0,0 +1,120 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "stitch" +description: "STITCH is an R program for reference panel free, read aware, low coverage sequencing genotype imputation. STITCH runs on a set of samples with sequencing reads in BAM format, as well as a list of positions to genotype, and outputs imputed genotypes in VCF format." +keywords: + - imputation + - genomics + - vcf + - bgen + - cram + - bam + - sam +tools: + - "stitch": + description: "STITCH - Sequencing To Imputation Through Constructing Haplotypes" + homepage: "https://github.com/rwdavies/stitch" + documentation: "https://github.com/rwdavies/stitch" + tool_dev_url: "https://github.com/rwdavies/stitch" + doi: "10.1038/ng.3594" + licence: "['GPL v3']" +input: + - meta: + type: map + description: | + Groovy Map containing information about the set of positions to run the imputation over + e.g. `[ id:'test' ]` + - posfile: + type: file + description: | + Tab-separated file describing the variable positions to be used for imputation. Refer to the documentation for the `--posfile` argument of STITCH for more information. + pattern: "*.tsv" + - input: + type: directory + description: | + Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag. + pattern: "input" + - rdata: + type: directory + description: | + Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag. + pattern: "RData" + - chromosome_name: + type: string + description: Name of the chromosome to impute. Should match a chromosome name in the reference genome. + - K: + type: integer + description: Number of ancestral haplotypes to use for imputation. Refer to the documentation for the `--K` argument of STITCH for more information. + - nGen: + type: integer + description: Number of generations since founding of the population to use for imputation. Refer to the documentation for the `--nGen` argument of STITCH for more information. + - meta2: + type: map + description: | + Groovy Map containing information about the set of samples + e.g. `[ id:'test' ]` + - collected_crams: + type: file + description: List of sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - collected_crais: + type: file + description: List of BAM/CRAM/SAM index files + pattern: "*.{bai,crai,sai}" + - cramlist: + type: file + description: | + Text file with the path to the cram files to use in imputation, one per line. Since the cram files are staged to the working directory for the process, this file should just contain the file names without any pre-pending path. 
+      pattern: "*.txt"
+  - meta3:
+      type: map
+      description: |
+        Groovy Map containing information about the reference genome used
+        e.g. `[ id:'test' ]`
+  - fasta:
+      type: file
+      description: FASTA reference genome file
+      pattern: "*.{fa,fasta}"
+  - fasta_fai:
+      type: file
+      description: FASTA index file
+      pattern: "*.{fai}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'test' ]`
+  - input:
+      type: directory
+      description: |
+        Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag.
+      pattern: "input"
+  - rdata:
+      type: directory
+      description: |
+        Folder of pre-generated input RData objects used when STITCH is called with the `--regenerateInput FALSE` flag. It is generated by running STITCH with the `--generateInputOnly TRUE` flag.
+      pattern: "RData"
+  - plots:
+      type: directory
+      description: |
+        Folder containing plots produced by STITCH during imputation. Which plots are produced depends on the command-line arguments passed to STITCH.
+      pattern: "plots"
+  - vcf:
+      type: file
+      description: |
+        Imputed genotype calls for the positions in `posfile`, in VCF format. This is the default output.
+      pattern: ".vcf.gz"
+  - bgen:
+      type: file
+      description: |
+        Imputed genotype calls for the positions in `posfile`, in bgen format. This is produced if `--output_format bgen` is specified.
+      pattern: ".bgen"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@saulpierotti"
+maintainers:
+  - "@saulpierotti"
diff --git a/modules/nf-core/stitch/stitch.diff b/modules/nf-core/stitch/stitch.diff
new file mode 100644
index 00000000..0a987c1b
--- /dev/null
+++ b/modules/nf-core/stitch/stitch.diff
@@ -0,0 +1,24 @@
+Changes in module 'nf-core/stitch'
+--- modules/nf-core/stitch/main.nf
++++ modules/nf-core/stitch/main.nf
+@@ -8,8 +8,8 @@
+         'biocontainers/r-stitch:1.6.10--r43h06b5641_0' }"
+
+     input:
+-    tuple val(meta) , path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen)
+-    tuple val(meta2), path(collected_crams), path(collected_crais), path(cramlist)
++    tuple val(meta), path(collected_crams), path(collected_crais), path(cramlist)
++    tuple val(meta2), path(posfile), path(input, stageAs: "input"), path(rdata, stageAs: "RData_in"), val(chromosome_name), val(K), val(nGen)
+     tuple val(meta3), path(fasta), path(fasta_fai)
+     val   seed
+
+
+--- modules/nf-core/stitch/meta.yml
++++ modules/nf-core/stitch/meta.yml
+@@ -117,4 +117,4 @@
+ authors:
+   - "@saulpierotti"
+ maintainers:
+-  - "@saulpierotti"
++  - "@saulpierotti"
+************************************************************
diff --git a/nextflow.config b/nextflow.config
index ccc6b0dd..83c45c28 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -51,6 +51,11 @@ params {
    ngen   = 100
    buffer = 10000

+    // STITCH
+    k_val   = 2
+    seed    = 1
+    posfile = null
+
    // Boilerplate options
    outdir           = null
    publish_dir_mode = 'copy'
@@ -199,12 +204,14 @@ profiles {
        executor.memory = 8.GB
    }

-    test { includeConfig 'conf/test.config' }
-    test_full { includeConfig 'conf/test_full.config' }
-    test_sim { includeConfig 'conf/test_sim.config' }
+    test          { includeConfig 'conf/test.config' }
+    test_full     { includeConfig 'conf/test_full.config' }
+    test_sim      { includeConfig 'conf/test_sim.config' }
    test_validate { includeConfig 'conf/test_validate.config' }
    test_all      { 
includeConfig 'conf/test_all.config' } - test_quilt { includeConfig 'conf/test_quilt.config' } + test_quilt { includeConfig 'conf/test_quilt.config' } + test_stitch { includeConfig 'conf/test_stitch.config' } + } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile @@ -288,6 +295,7 @@ includeConfig 'conf/steps/panel_prep.config' includeConfig 'conf/steps/imputation.config' includeConfig 'conf/steps/imputation_glimpse1.config' includeConfig 'conf/steps/imputation_quilt.config' +includeConfig 'conf/steps/imputation_stitch.config' // validation step includeConfig 'conf/steps/validation.config' diff --git a/nextflow_schema.json b/nextflow_schema.json index 9b1f4015..f10ee0c4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -5,6 +5,69 @@ "description": "Phasing and imputation pipeline", "type": "object", "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": ["outdir"], + "properties": { + "input": { + "type": "string", + "format": "file-path", + "exists": true, + "schema": "assets/schema_input.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$", + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/phaseimpute/usage#samplesheet-input).", + "fa_icon": "fas fa-file-csv" + }, + "input_region": { + "type": "string", + "description": "Region of the genome to use (optional: if no file given, the whole genome will be used). The file should be a comma-separated file with 3 columns, and a header row.", + "schema": "assets/schema_input_region.json", + "format": "file-path", + "pattern": "^\\S+\\.csv$" + }, + "outdir": { + "type": "string", + "format": "directory-path", + "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "fa_icon": "fas fa-folder-open" + }, + "rename_chr": { + "type": "boolean", + "description": "Should the panel vcf files be renamed to match the reference genome (e.g. 'chr1' -> '1')", + "pattern": "true|false" + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, + "multiqc_title": { + "type": "string", + "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", + "fa_icon": "fas fa-file-signature" + }, + "step": { + "type": "string", + "description": "Step to run.", + "fa_icon": "fas fa-step-forward", + "pattern": "^((all|simulate|panelprep|impute|validate)?,?)*(? 0, the program exits with an error. Set to zero to have no filter of if using –gt-validation", + "description": "Minimum coverage in validation data. 
If FORMAT/DP is missing and -min_val_dp > 0, the program exits with an error. Set to zero to have no filter or if using --gt-validation",
          "default": 5,
          "pattern": "^\\d+$"
        }
      }
    },
-    "input_output_options": {
-      "title": "Input/output options",
-      "type": "object",
-      "fa_icon": "fas fa-terminal",
-      "description": "Define where the pipeline should find input data and save output data.",
-      "required": ["outdir"],
-      "properties": {
-        "input": {
-          "type": "string",
-          "format": "file-path",
-          "exists": true,
-          "schema": "assets/schema_input.json",
-          "mimetype": "text/csv",
-          "pattern": "^\\S+\\.csv$",
-          "description": "Path to comma-separated file containing information about the samples in the experiment.",
-          "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/phaseimpute/usage#samplesheet-input).",
-          "fa_icon": "fas fa-file-csv"
-        },
-        "input_region": {
-          "type": "string",
-          "description": "Region of the genome to use (optional: if no file given, the whole genome will be used). The file should be a comma-separated file with 3 columns, and a header row.",
-          "schema": "assets/schema_input_region.json",
-          "default": null,
-          "format": "file-path",
-          "pattern": "^\\S+\\.csv$"
-        },
-        "outdir": {
-          "type": "string",
-          "format": "directory-path",
-          "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.",
-          "fa_icon": "fas fa-folder-open"
-        },
-        "rename_chr": {
-          "type": "boolean",
-          "description": "Should the panel vcf files be renamed to match the reference genome (e.g. 'chr1' -> '1')",
-          "pattern": "true|false"
-        },
-        "email": {
-          "type": "string",
-          "description": "Email address for completion summary.",
-          "fa_icon": "fas fa-envelope",
-          "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.",
-          "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$"
-        },
-        "multiqc_title": {
-          "type": "string",
-          "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.",
-          "fa_icon": "fas fa-file-signature"
-        },
-        "step": {
-          "type": "string",
-          "description": "Step to run.",
-          "fa_icon": "fas fa-step-forward",
-          "enum": ["all", "simulate", "panelprep", "impute", "validate"]
-        },
-        "tools": {
-          "type": "string",
-          "description": "Step to run.",
-          "fa_icon": "fas fa-step-forward",
-          "enum": ["glimpse1", "glimpse2", "quilt"]
-        }
-      }
-    },
    "reference_genome_options": {
      "title": "Reference genome options",
      "type": "object",
@@ -392,9 +392,56 @@
          "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)."
        }
      }
+    },
+    "quilt_parameters": {
+      "title": "QUILT parameters",
+      "type": "object",
+      "description": "Arguments to customize QUILT run",
+      "default": "",
+      "properties": {
+        "buffer": {
+          "type": "integer",
+          "default": 10000,
+          "description": "Buffer of region to perform imputation over. So imputation is run from regionStart-buffer to regionEnd+buffer, and reported for regionStart to regionEnd, including the bases of regionStart and regionEnd."
+        },
+        "ngen": {
+          "type": "integer",
+          "default": 100,
+          "description": "Number of generations since founding of the population to use for imputation."
+        }
+      }
+    },
+    "stitch_parameters": {
+      "title": "STITCH parameters",
+      "type": "object",
+      "description": "Arguments to customize STITCH run",
+      "default": "",
+      "properties": {
+        "seed": {
+          "type": "integer",
+          "default": 1
+        },
+        "posfile": {
+          "type": "string",
+          "description": "Path to comma-separated file containing tab-separated files describing the variable positions to be used for imputation. Refer to the documentation for the `--posfile` argument of STITCH for more information.",
+          "format": "file-path",
+          "schema": "assets/schema_posfile.json",
+          "pattern": "^\\S+\\.(csv|tsv|txt)$",
+          "mimetype": "text/csv",
+          "help_text": ""
+        },
+        "k_val": {
+          "type": "integer",
+          "default": 2,
+          "description": "Number of ancestral haplotypes to use for imputation. Refer to the documentation for the `--K` argument of STITCH for more information."
+        }
+      }
+    }
  },
  "allOf": [
+    {
+      "$ref": "#/definitions/input_output_options"
+    },
    {
      "$ref": "#/definitions/simulate"
    },
@@ -404,9 +451,6 @@
    {
      "$ref": "#/definitions/validation"
    },
-    {
-      "$ref": "#/definitions/input_output_options"
-    },
    {
      "$ref": "#/definitions/reference_genome_options"
    },
@@ -418,16 +462,12 @@
    },
    {
      "$ref": "#/definitions/generic_options"
-    }
-  ],
-  "properties": {
-    "ngen": {
-      "type": "integer",
-      "default": 100
    },
-    "buffer": {
-      "type": "integer",
-      "default": 10000
+    {
+      "$ref": "#/definitions/quilt_parameters"
+    },
+    {
+      "$ref": "#/definitions/stitch_parameters"
    }
-  }
+  ]
}
diff --git a/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf b/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf
new file mode 100644
index 00000000..ea162fd0
--- /dev/null
+++ b/subworkflows/local/bam_impute_stitch/bam_impute_stitch.nf
@@ -0,0 +1,33 @@
+include { STITCH         } from '../../../modules/nf-core/stitch/main'
+include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main'
+
+
+workflow BAM_IMPUTE_STITCH {
+
+    take:
+    ch_parameters
+    ch_samples
+    ch_fasta
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    // Run STITCH
+    seed = params.seed
+    STITCH( ch_samples, ch_parameters, ch_fasta, seed )
+    ch_versions = ch_versions.mix(STITCH.out.versions.first())
+
+    // Index imputed VCF
+    BCFTOOLS_INDEX(STITCH.out.vcf)
+    ch_versions = ch_versions.mix(BCFTOOLS_INDEX.out.versions.first())
+
+    // Join VCFs and TBIs
+    ch_vcf_tbi = STITCH.out.vcf.join(BCFTOOLS_INDEX.out.tbi)
+
+
+    emit:
+    vcf_tbi  = ch_vcf_tbi  // channel: [ meta, vcf, tbi ]
+    versions = ch_versions // channel: [ versions.yml ]
+
+}
diff --git a/subworkflows/local/get_panel/main.nf b/subworkflows/local/get_panel/main.nf
deleted file mode 100644
index 2f40dfd6..00000000
--- a/subworkflows/local/get_panel/main.nf
+++ /dev/null
@@ -1,88 +0,0 @@
-include { BCFTOOLS_VIEW as VIEW_VCF_SNPS } from '../../../modules/nf-core/bcftools/view/main.nf'
-include { BCFTOOLS_VIEW as VIEW_VCF_SITES } from '../../../modules/nf-core/bcftools/view/main.nf'
-include { BCFTOOLS_INDEX as VCF_INDEX1 } from '../../../modules/nf-core/bcftools/index/main.nf'
-include { BCFTOOLS_INDEX as VCF_INDEX3 } from '../../../modules/nf-core/bcftools/index/main.nf'
-include { BCFTOOLS_INDEX as VCF_INDEX4 } from '../../../modules/nf-core/bcftools/index/main.nf'
-include { BCFTOOLS_INDEX as VCF_INDEX5 } from '../../../modules/nf-core/bcftools/index/main.nf'
-include { BCFTOOLS_NORM } from 
'../../../modules/nf-core/bcftools/norm/main.nf' -include { BCFTOOLS_QUERY } from '../../../modules/nf-core/bcftools/query/main.nf' -include { TABIX_BGZIP } from '../../../modules/nf-core/tabix/bgzip/main' -include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main' -include { VCF_PHASE_SHAPEIT5 } from '../../../subworkflows/nf-core/vcf_phase_shapeit5/main' - - -workflow GET_PANEL { - take: - ch_vcf // channel: [ [id, chr], vcf, index ] - ch_fasta // channel: [ [genome], fasta, fai ] - - main: - - ch_versions = Channel.empty() - - BCFTOOLS_NORM(ch_vcf, ch_fasta.map{ genome, fasta, fai -> [genome, fasta] }) - ch_versions = ch_versions.mix(BCFTOOLS_NORM.out.versions.first()) - - // Extract only the SNP - VIEW_VCF_SNPS(BCFTOOLS_NORM.out.vcf // [ meta, vcf ] - .combine(Channel.of([[]])), [], [], []) - ch_versions = ch_versions.mix(VIEW_VCF_SNPS.out.versions.first()) - - VCF_INDEX3(VIEW_VCF_SNPS.out.vcf) - ch_versions = ch_versions.mix(VCF_INDEX3.out.versions.first()) - - ch_panel_norm = VIEW_VCF_SNPS.out.vcf - .combine(VCF_INDEX3.out.csi, by:0) - - // Extract sites positions - vcf_region = VIEW_VCF_SNPS.out.vcf - .combine(VCF_INDEX3.out.csi, by:0) - VIEW_VCF_SITES( ch_panel_norm, - [], [], []) - ch_versions = ch_versions.mix(VIEW_VCF_SITES.out.versions.first()) - - VCF_INDEX4(VIEW_VCF_SITES.out.vcf) - ch_versions = ch_versions.mix(VCF_INDEX4.out.versions.first()) - - ch_panel_sites = VIEW_VCF_SITES.out.vcf - .combine(VCF_INDEX4.out.csi, by:0) - - // Convert to TSV - BCFTOOLS_QUERY(ch_panel_sites, [], [], []) - ch_versions = ch_versions.mix(BCFTOOLS_QUERY.out.versions.first()) - - TABIX_BGZIP(BCFTOOLS_QUERY.out.output) - ch_versions = ch_versions.mix(TABIX_BGZIP.out.versions.first()) - - TABIX_TABIX(TABIX_BGZIP.out.output) - ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first()) - - ch_panel_tsv = TABIX_BGZIP.out.output - .combine(TABIX_TABIX.out.tbi, by: 0) - - // Phase panel - if (params.phased == false) { - VCF_PHASE_SHAPEIT5(vcf_region - .map { meta, vcf, csi -> [meta, vcf, csi, [], meta.region] }, - Channel.of([[],[],[]]).collect(), - Channel.of([[],[],[]]).collect(), - Channel.of([[],[]]).collect()) - ch_versions = ch_versions.mix(VCF_PHASE_SHAPEIT5.out.versions) - ch_panel_phased = VCF_PHASE_SHAPEIT5.out.variants_phased - .combine(VCF_PHASE_SHAPEIT5.out.variants_index, by: 0) - } else { - ch_panel_phased = vcf_region - } - - ch_panel = ch_panel_norm - .combine(ch_panel_sites, by: 0) - .combine(ch_panel_tsv, by: 0) - .combine(ch_panel_phased, by: 0) - .map{ metaIC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index - -> [[panel:metaIC.id, chr:metaIC.chr ], norm, n_index, sites, s_index, tsv, t_index, phased, p_index] - } - - emit: - panel = ch_panel // channel: [ [panel, chr], norm, n_index, sites, s_index, tsv, t_index, phased, p_index] - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/subworkflows/local/impute_quilt/impute_quilt.nf b/subworkflows/local/impute_quilt/impute_quilt.nf index d8231733..5bba1604 100644 --- a/subworkflows/local/impute_quilt/impute_quilt.nf +++ b/subworkflows/local/impute_quilt/impute_quilt.nf @@ -1,7 +1,7 @@ include { QUILT_QUILT } from '../../../modules/nf-core/quilt/quilt' include { BCFTOOLS_ANNOTATE } from '../../../modules/nf-core/bcftools/annotate' -include { BCFTOOLS_INDEX as INDEX1 } from '../../../modules/nf-core/bcftools/index' -include { BCFTOOLS_INDEX as INDEX2 } from '../../../modules/nf-core/bcftools/index' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_1 } from 
'../../../modules/nf-core/bcftools/index' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2 } from '../../../modules/nf-core/bcftools/index' workflow IMPUTE_QUILT { @@ -50,22 +50,22 @@ workflow IMPUTE_QUILT { ch_versions = ch_versions.mix(QUILT_QUILT.out.versions.first()) // Index imputed VCF - INDEX1(QUILT_QUILT.out.vcf) - ch_versions = ch_versions.mix(INDEX1.out.versions.first()) + BCFTOOLS_INDEX_1(QUILT_QUILT.out.vcf) + ch_versions = ch_versions.mix(BCFTOOLS_INDEX_1.out.versions.first()) // Annotate the variants BCFTOOLS_ANNOTATE(QUILT_QUILT.out.vcf - .join(INDEX1.out.tbi) + .join(BCFTOOLS_INDEX_1.out.tbi) .combine(Channel.of([[], [], [], []])) ) ch_versions = ch_versions.mix(BCFTOOLS_ANNOTATE.out.versions.first()) // Index imputed annotated VCF - INDEX2(BCFTOOLS_ANNOTATE.out.vcf) - ch_versions = ch_versions.mix(INDEX2.out.versions.first()) + BCFTOOLS_INDEX_2(BCFTOOLS_ANNOTATE.out.vcf) + ch_versions = ch_versions.mix(BCFTOOLS_INDEX_2.out.versions.first()) // Join VCFs and TBIs - ch_vcf_tbi = BCFTOOLS_ANNOTATE.out.vcf.join(INDEX2.out.tbi) + ch_vcf_tbi = BCFTOOLS_ANNOTATE.out.vcf.join(BCFTOOLS_INDEX_2.out.tbi) emit: vcf_tbi = ch_vcf_tbi // channel: [ meta, vcf, tbi ] diff --git a/subworkflows/local/make_chunks/make_chunks.nf b/subworkflows/local/make_chunks/make_chunks.nf index c0fe7924..5c4319dc 100644 --- a/subworkflows/local/make_chunks/make_chunks.nf +++ b/subworkflows/local/make_chunks/make_chunks.nf @@ -1,17 +1,9 @@ -include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main' -include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2} from '../../../modules/nf-core/bcftools/index/main' -include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_3} from '../../../modules/nf-core/bcftools/index/main' include { GLIMPSE_CHUNK } from '../../../modules/nf-core/glimpse/chunk/main' -include { BCFTOOLS_CONVERT } from '../../../modules/nf-core/bcftools/convert/main' -include { BCFTOOLS_NORM } from '../../../modules/nf-core/bcftools/norm/main' -include { BCFTOOLS_VIEW } from '../../../modules/nf-core/bcftools/view/main' - workflow MAKE_CHUNKS { take: ch_reference // channel: [ val(meta),vcf ] - ch_fasta_fai // channel: [meta, fasta, fai] main: @@ -20,8 +12,9 @@ workflow MAKE_CHUNKS { // Make chunks ch_vcf_csi_chr = ch_reference.map{meta, vcf, csi -> [meta, vcf, csi, meta.chr]} GLIMPSE_CHUNK(ch_vcf_csi_chr) + ch_versions = ch_versions.mix(GLIMPSE_CHUNK.out.versions) - // Rearrange chunks into channel + // Rearrange chunks into channel for QUILT ch_chunks = GLIMPSE_CHUNK.out.chunk_chr .splitText() .map { metamap, line -> @@ -30,38 +23,7 @@ workflow MAKE_CHUNKS { [metamap, metamap.chr, startEnd[0], startEnd[1]] } - ch_fasta = ch_fasta_fai.map { meta, fasta, fai -> [meta, fasta] } - - // Join duplicated biallelic sites into multiallelic records - BCFTOOLS_NORM(ch_reference, ch_fasta) - - // Index multiallelic VCF - BCFTOOLS_INDEX_2(BCFTOOLS_NORM.out.vcf) - - // Join multiallelic VCF and TBI - ch_multiallelic_vcf_tbi = BCFTOOLS_NORM.out.vcf.join(BCFTOOLS_INDEX_2.out.tbi) - - // Remove all multiallelic records: - BCFTOOLS_VIEW(ch_multiallelic_vcf_tbi, [], [], []) - - // Index biallelic VCF - BCFTOOLS_INDEX_3(BCFTOOLS_VIEW.out.vcf) - - // Join biallelic VCF and TBI - ch_biallelic_vcf_tbi = BCFTOOLS_VIEW.out.vcf.join(BCFTOOLS_INDEX_3.out.tbi) - - // Convert VCF to Hap and Legend files - BCFTOOLS_CONVERT(ch_biallelic_vcf_tbi, ch_fasta, []) - - // Output hap and legend files - ch_hap_legend = BCFTOOLS_CONVERT.out.hap.join(BCFTOOLS_CONVERT.out.legend) - - - - - emit: - ch_chunks = ch_chunks // channel: [ chr, 
val(meta), start, end, number ]
-        ch_hap_legend = ch_hap_legend // channel: [ chr, val(meta), hap, legend ]
+    chunks   = ch_chunks   // channel: [ chr, val(meta), start, end, number ]
    versions = ch_versions // channel: [ versions.yml ]
}
diff --git a/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf
new file mode 100644
index 00000000..6b2e7a37
--- /dev/null
+++ b/subworkflows/local/prepare_input_stitch/prepare_input_stitch.nf
@@ -0,0 +1,51 @@
+workflow PREPARE_INPUT_STITCH {
+
+    take:
+    ch_posfile
+    ch_fasta
+    ch_input_impute
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    // Value channels
+    def input_empty = [[]]
+    def rdata_empty = [[]]
+    k_val           = params.k_val
+    ngen            = params.ngen
+
+    // Get chromosomes of posfile
+    ch_posfile = ch_posfile.map{meta, posfile -> return[['chr': meta.chr], posfile]}
+
+    // Get chromosomes of fasta
+    ch_chromosomes = ch_fasta.map{it -> it[2]}
+        .splitCsv(header: ["chr", "size", "offset", "linebases", "linewidth", "qualoffset"], sep: "\t")
+        .map{it -> return [[chr: it.chr], it.chr]}
+
+    // Make final channel with parameters
+    stitch_parameters = ch_posfile.map { it + input_empty + rdata_empty}
+        .join(ch_chromosomes)
+        .map { it + k_val + ngen}
+
+    // Prepare sample files for STITCH
+    // Group input by ID
+    ch_bam_bai = ch_input_impute.map {meta, bam, bai -> [[meta.id], bam, bai]}.unique()
+
+    // Make bamlist from bam input
+    ch_bamlist = ch_bam_bai
+        .map {it[1].tokenize('/').last()}
+        .collectFile(name: "bamlist.txt", newLine: true, sort: true)
+
+    // Collect all files
+    stitch_samples = ch_bam_bai.map {meta, bam, bai -> [["id": "all_samples"], bam, bai]}
+        .groupTuple()
+        .combine(ch_bamlist)
+        .collect()
+
+    emit:
+    stitch_parameters
+    stitch_samples
+    versions = ch_versions // channel: [ versions.yml ]
+
+}
diff --git a/subworkflows/local/prepare_input_stitch/prepare_posfile_tsv.nf b/subworkflows/local/prepare_input_stitch/prepare_posfile_tsv.nf
new file mode 100644
index 00000000..0612d9bc
--- /dev/null
+++ b/subworkflows/local/prepare_input_stitch/prepare_posfile_tsv.nf
@@ -0,0 +1,26 @@
+include { BCFTOOLS_QUERY } from '../../../modules/nf-core/bcftools/query/main'
+include { GAWK           } from '../../../modules/nf-core/gawk'
+
+
+workflow PREPARE_POSFILE_TSV {
+
+    take:
+    ch_panel_sites
+    ch_fasta
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    // Convert position file to tab-separated file
+    BCFTOOLS_QUERY(ch_panel_sites, [], [], [])
+    ch_posfile = BCFTOOLS_QUERY.out.output
+
+    // Remove multiallelic positions from tsv
+    GAWK(ch_posfile, [])
+
+    emit:
+    posfile  = GAWK.out.output // channel: [ [id, chr], txt ]
+    versions = ch_versions     // channel: [ versions.yml ]
+
+}
diff --git a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf
index 77f7fe42..28fada14 100644
--- a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf
@@ -207,6 +207,19 @@ workflow PIPELINE_INITIALISATION {
        ch_genotype = Channel.of([[],[]])
    }

+    //
+    // Create posfile channel
+    //
+
+    if (params.posfile) {
+        ch_posfile = Channel
+            .fromSamplesheet("posfile")
+            .map {
+                meta, file ->
+                [ meta, file ]
+            }} else {
+        ch_posfile = [[]]
+    }

    emit:
    input          = ch_input       // [ [meta], file, index ]
@@ -216,6 +229,7 @@
    depth          = ch_depth       // [ [depth], depth ]
    regions        = ch_regions     // [ [chr, region], region ]
    map            = ch_map         // [ [map], map ]
+    posfile        = 
ch_posfile // [ [chr], txt ] versions = ch_versions } @@ -274,7 +288,7 @@ def validateInputParameters() { assert params.step, "A step must be provided" // Check that at least one tool is provided - if (params.step == "impute" || params.step == "panel_prep") { + if (params.step.split(',').contains("impute") || params.step.split(',').contains("panelprep")) { assert params.tools, "No tools provided" } } diff --git a/subworkflows/local/vcf_concatenate_bcftools/main.nf b/subworkflows/local/vcf_concatenate_bcftools/main.nf index bc85d146..583b6070 100644 --- a/subworkflows/local/vcf_concatenate_bcftools/main.nf +++ b/subworkflows/local/vcf_concatenate_bcftools/main.nf @@ -11,8 +11,7 @@ workflow VCF_CONCATENATE_BCFTOOLS { ch_versions = Channel.empty() // Remove chromosome from meta - ch_vcf_tbi_grouped = ch_vcf_tbi - .map{ meta, vcf, tbi -> [['id' : meta.id], vcf, tbi] } + ch_vcf_tbi_grouped = ch_vcf_tbi.map{ meta, vcf, tbi -> [['id' : meta.id], vcf, tbi] } // Group by ID ch_vcf_tbi_grouped = ch_vcf_tbi_grouped.groupTuple( by:0 ) diff --git a/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf b/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf new file mode 100644 index 00000000..e96e4a02 --- /dev/null +++ b/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf @@ -0,0 +1,47 @@ +include { BCFTOOLS_NORM } from '../../../modules/nf-core/bcftools/norm/main' +include { BCFTOOLS_VIEW } from '../../../modules/nf-core/bcftools/view/main' +include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2} from '../../../modules/nf-core/bcftools/index/main' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_3} from '../../../modules/nf-core/bcftools/index/main' +include { BCFTOOLS_CONVERT } from '../../../modules/nf-core/bcftools/convert/main' + + +workflow VCF_NORMALIZE_BCFTOOLS { + take: + ch_vcf // channel: [ [id, chr], vcf, index ] + ch_fasta // channel: [ [genome], fasta, fai ] + + main: + + ch_versions = Channel.empty() + ch_fasta = ch_fasta.map { meta, fasta, fai -> [meta, fasta] } + + // Join duplicated biallelic sites into multiallelic records + BCFTOOLS_NORM(ch_vcf, ch_fasta) + + // Index multiallelic VCF + BCFTOOLS_INDEX(BCFTOOLS_NORM.out.vcf) + + // Join multiallelic VCF and TBI + ch_multiallelic_vcf_tbi = BCFTOOLS_NORM.out.vcf.join(BCFTOOLS_INDEX.out.tbi) + + // Remove all multiallelic records: + BCFTOOLS_VIEW(ch_multiallelic_vcf_tbi, [], [], []) + + // Index biallelic VCF + BCFTOOLS_INDEX_2(BCFTOOLS_VIEW.out.vcf) + + // Join biallelic VCF and TBI + ch_biallelic_vcf_tbi = BCFTOOLS_VIEW.out.vcf.join(BCFTOOLS_INDEX_2.out.tbi) + + // Convert VCF to Hap and Legend files + BCFTOOLS_CONVERT(ch_biallelic_vcf_tbi, ch_fasta, []) + + // Output hap and legend files + ch_hap_legend = BCFTOOLS_CONVERT.out.hap.join(BCFTOOLS_CONVERT.out.legend) + + emit: + vcf_tbi = ch_biallelic_vcf_tbi // channel: [ [id, chr], vcf, tbi ] + hap_legend = ch_hap_legend // channel: [ [id, chr] '.hap', '.legend' ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/vcf_phase_panel/main.nf b/subworkflows/local/vcf_phase_panel/main.nf new file mode 100644 index 00000000..a54ca8e6 --- /dev/null +++ b/subworkflows/local/vcf_phase_panel/main.nf @@ -0,0 +1,40 @@ +include { VCF_PHASE_SHAPEIT5 } from '../../../subworkflows/nf-core/vcf_phase_shapeit5/main' + +workflow VCF_PHASE_PANEL { + take: + ch_vcf // channel: [ [id, chr], vcf, index ] + ch_panel_norm + ch_panel_sites + ch_panel_tsv + + main: + + 
+    ch_versions = Channel.empty()
+
+    // Phase panel
+    if (params.phased == false) {
+        VCF_PHASE_SHAPEIT5(ch_vcf
+            .map { meta, vcf, csi -> [meta, vcf, csi, [], meta.region] },
+            Channel.of([[],[],[]]).collect(),
+            Channel.of([[],[],[]]).collect(),
+            Channel.of([[],[]]).collect())
+        ch_versions = ch_versions.mix(VCF_PHASE_SHAPEIT5.out.versions)
+        ch_panel_phased = VCF_PHASE_SHAPEIT5.out.variants_phased
+            .combine(VCF_PHASE_SHAPEIT5.out.variants_index, by: 0)
+    } else {
+        ch_panel_phased = ch_vcf
+    }
+
+    ch_panel = ch_panel_norm
+        .combine(ch_panel_sites, by: 0)
+        .combine(ch_panel_tsv, by: 0)
+        .combine(ch_panel_phased, by: 0)
+        .map{ metaIC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index
+            -> [[panel: metaIC.id, chr: metaIC.chr], norm, n_index, sites, s_index, tsv, t_index, phased, p_index]
+        }
+
+    emit:
+    vcf_tbi  = ch_panel_phased // channel: [ [id, chr], vcf, index ]
+    panel    = ch_panel        // channel: [ [panel, chr], norm, n_index, sites, s_index, tsv, t_index, phased, p_index ]
+    versions = ch_versions     // channel: [ versions.yml ]
+}
diff --git a/subworkflows/local/vcf_sites_extract_bcftools/main.nf b/subworkflows/local/vcf_sites_extract_bcftools/main.nf
new file mode 100644
index 00000000..d3d30674
--- /dev/null
+++ b/subworkflows/local/vcf_sites_extract_bcftools/main.nf
@@ -0,0 +1,74 @@
+include { BCFTOOLS_VIEW as VIEW_VCF_SNPS          } from '../../../modules/nf-core/bcftools/view/main.nf'
+include { BCFTOOLS_VIEW as VIEW_VCF_SITES         } from '../../../modules/nf-core/bcftools/view/main.nf'
+include { BCFTOOLS_INDEX                          } from '../../../modules/nf-core/bcftools/index/main.nf'
+include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2      } from '../../../modules/nf-core/bcftools/index/main.nf'
+include { TABIX_BGZIP                             } from '../../../modules/nf-core/tabix/bgzip/main'
+include { TABIX_TABIX                             } from '../../../modules/nf-core/tabix/tabix/main'
+include { BCFTOOLS_QUERY                          } from '../../../modules/nf-core/bcftools/query/main.nf'
+include { BCFTOOLS_QUERY as BCFTOOLS_QUERY_STITCH } from '../../../modules/nf-core/bcftools/query/main.nf'
+include { GAWK as GAWK_STITCH                     } from '../../../modules/nf-core/gawk'
+
+
+workflow VCF_SITES_EXTRACT_BCFTOOLS {
+    take:
+    ch_vcf // channel: [ [id, chr], vcf, index ]
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    // Extract only SNPs from VCF
+    VIEW_VCF_SNPS(ch_vcf, [], [], [])
+    ch_versions = ch_versions.mix(VIEW_VCF_SNPS.out.versions.first())
+
+    // Index SNPs
+    BCFTOOLS_INDEX(VIEW_VCF_SNPS.out.vcf)
+    ch_versions = ch_versions.mix(BCFTOOLS_INDEX.out.versions.first())
+
+    // Join VCF and index
+    ch_panel_norm = VIEW_VCF_SNPS.out.vcf.combine(BCFTOOLS_INDEX.out.csi, by:0)
+
+    // Extract site positions
+    VIEW_VCF_SITES(ch_panel_norm, [], [], [])
+    ch_versions = ch_versions.mix(VIEW_VCF_SITES.out.versions.first())
+
+    // Index extracted sites
+    BCFTOOLS_INDEX_2(VIEW_VCF_SITES.out.vcf)
+    ch_versions = ch_versions.mix(BCFTOOLS_INDEX_2.out.versions.first())
+
+    // Join extracted sites and index
+    ch_panel_sites = VIEW_VCF_SITES.out.vcf.combine(BCFTOOLS_INDEX_2.out.csi, by:0)
+
+    // Create TSVs for the different tools
+
+    // Convert to TSV with the structure expected by GLIMPSE
+    BCFTOOLS_QUERY(ch_panel_sites, [], [], [])
+    ch_versions = ch_versions.mix(BCFTOOLS_QUERY.out.versions.first())
+
+    // Compress TSV
+    TABIX_BGZIP(BCFTOOLS_QUERY.out.output)
+    ch_versions = ch_versions.mix(TABIX_BGZIP.out.versions.first())
+
+    // Index compressed TSV
+    TABIX_TABIX(TABIX_BGZIP.out.output)
+    ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first())
+
+    // Join compressed TSV and index
+    ch_panel_tsv = TABIX_BGZIP.out.output.combine(TABIX_TABIX.out.tbi, by: 0)
+
+    // TSV for STITCH
+    // Convert position file to a tab-separated file
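+    // The posfile consumed by STITCH is assumed to be a tab-separated list of
+    // variants, one per line (chromosome, position, ref, alt), e.g.:
+    //   chr22   16570065   A   G
+    //   chr22   16570211   T   C
+    // (illustrative coordinates only)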
+    BCFTOOLS_QUERY_STITCH(ch_panel_sites, [], [], [])
+    ch_versions = ch_versions.mix(BCFTOOLS_QUERY_STITCH.out.versions.first())
+    ch_posfile = BCFTOOLS_QUERY_STITCH.out.output
+
+    // Remove multiallelic positions from the TSV
+    GAWK_STITCH(ch_posfile, [])
+    ch_versions = ch_versions.mix(GAWK_STITCH.out.versions.first())
+
+    emit:
+    panel_tsv   = ch_panel_tsv           // channel: [ [id, chr], tsv, tbi ]
+    vcf_tbi     = ch_panel_norm          // channel: [ [id, chr], vcf, csi ]
+    panel_sites = ch_panel_sites         // channel: [ [id, chr], vcf, csi ]
+    posfile     = GAWK_STITCH.out.output // channel: [ [id, chr], txt ]
+    versions    = ch_versions            // channel: [ versions.yml ]
+}
diff --git a/subworkflows/nf-core/vcf_phase_shapeit5/main.nf b/subworkflows/nf-core/vcf_phase_shapeit5/main.nf
index 966f9019..51061373 100644
--- a/subworkflows/nf-core/vcf_phase_shapeit5/main.nf
+++ b/subworkflows/nf-core/vcf_phase_shapeit5/main.nf
@@ -1,8 +1,8 @@
 include { BEDTOOLS_MAKEWINDOWS } from '../../../modules/nf-core/bedtools/makewindows/main.nf'
 include { SHAPEIT5_PHASECOMMON } from '../../../modules/nf-core/shapeit5/phasecommon/main'
 include { SHAPEIT5_LIGATE      } from '../../../modules/nf-core/shapeit5/ligate/main'
-include { BCFTOOLS_INDEX as VCF_INDEX1 } from '../../../modules/nf-core/bcftools/index/main.nf'
-include { BCFTOOLS_INDEX as VCF_INDEX2 } from '../../../modules/nf-core/bcftools/index/main.nf'
+include { BCFTOOLS_INDEX as VCF_BCFTOOLS_INDEX_1 } from '../../../modules/nf-core/bcftools/index/main.nf'
+include { BCFTOOLS_INDEX as VCF_BCFTOOLS_INDEX_2 } from '../../../modules/nf-core/bcftools/index/main.nf'

 workflow VCF_PHASE_SHAPEIT5 {

@@ -61,11 +61,11 @@ workflow VCF_PHASE_SHAPEIT5 {
         ch_map
     )
     ch_versions = ch_versions.mix(SHAPEIT5_PHASECOMMON.out.versions.first())

-    VCF_INDEX1(SHAPEIT5_PHASECOMMON.out.phased_variant)
-    ch_versions = ch_versions.mix(VCF_INDEX1.out.versions.first())
+    VCF_BCFTOOLS_INDEX_1(SHAPEIT5_PHASECOMMON.out.phased_variant)
+    ch_versions = ch_versions.mix(VCF_BCFTOOLS_INDEX_1.out.versions.first())

     ch_ligate_input = SHAPEIT5_PHASECOMMON.out.phased_variant
-        .join(VCF_INDEX1.out.csi, failOnMismatch:true, failOnDuplicate:true)
+        .join(VCF_BCFTOOLS_INDEX_1.out.csi, failOnMismatch:true, failOnDuplicate:true)
         .map{ meta, vcf, csi ->
             newmeta = meta + [id: meta.id.split("_")[0..-2].join("_")]
             [newmeta, vcf, csi]}
@@ -84,13 +84,13 @@ workflow VCF_PHASE_SHAPEIT5 {
     SHAPEIT5_LIGATE(ch_ligate_input)
     ch_versions = ch_versions.mix(SHAPEIT5_LIGATE.out.versions.first())

-    VCF_INDEX2(SHAPEIT5_LIGATE.out.merged_variants)
-    ch_versions = ch_versions.mix(VCF_INDEX2.out.versions.first())
+    VCF_BCFTOOLS_INDEX_2(SHAPEIT5_LIGATE.out.merged_variants)
+    ch_versions = ch_versions.mix(VCF_BCFTOOLS_INDEX_2.out.versions.first())

     emit:
     bed             = BEDTOOLS_MAKEWINDOWS.out.bed        // channel: [ val(meta), bed ]
     variants_phased = SHAPEIT5_LIGATE.out.merged_variants // channel: [ val(meta), vcf ]
-    variants_index  = VCF_INDEX2.out.csi                  // channel: [ val(meta), csi ]
+    variants_index  = VCF_BCFTOOLS_INDEX_2.out.csi        // channel: [ val(meta), csi ]
     versions        = ch_versions                         // channel: [ versions.yml ]
 }
diff --git a/tests/csv/posfile.csv b/tests/csv/posfile.csv
new file mode 100644
index 00000000..d5a92024
--- /dev/null
+++ b/tests/csv/posfile.csv
@@ -0,0 +1,2 @@
+chr,file
+chr22,"https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/panel/22/chr22_posfile_stitch.txt"
diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf
index 2b653676..ab9cc73d 100644
--- a/workflows/phaseimpute/main.nf
+++ b/workflows/phaseimpute/main.nf
@@ -18,22 +18,41 @@ include { getAllFilesExtension } from '../../subworkflows/local/utils_nfc
 // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
 //

-include { VCF_IMPUTE_GLIMPSE as VCF_IMPUTE_GLIMPSE1 } from '../../subworkflows/nf-core/vcf_impute_glimpse'
+// Simulate subworkflows
 include { BAM_REGION                                  } from '../../subworkflows/local/bam_region'
 include { BAM_DOWNSAMPLE                              } from '../../subworkflows/local/bam_downsample'
-include { COMPUTE_GL as GL_TRUTH                      } from '../../subworkflows/local/compute_gl'
-include { COMPUTE_GL as GL_INPUT                      } from '../../subworkflows/local/compute_gl'
-include { VCF_CONCORDANCE_GLIMPSE2                    } from '../../subworkflows/local/vcf_concordance_glimpse2'
+
+// Panelprep subworkflows
 include { VCF_CHR_CHECK                               } from '../../subworkflows/local/vcf_chr_check'
-include { GET_PANEL                                   } from '../../subworkflows/local/get_panel'
+include { VCF_NORMALIZE_BCFTOOLS                      } from '../../subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools'
+include { VCF_SITES_EXTRACT_BCFTOOLS                  } from '../../subworkflows/local/vcf_sites_extract_bcftools'
+include { VCF_PHASE_PANEL                             } from '../../subworkflows/local/vcf_phase_panel'
+include { PREPARE_POSFILE_TSV                         } from '../../subworkflows/local/prepare_input_stitch/prepare_posfile_tsv'
+
+// GLIMPSE subworkflows
+include { VCF_IMPUTE_GLIMPSE as VCF_IMPUTE_GLIMPSE1   } from '../../subworkflows/nf-core/vcf_impute_glimpse'
+include { COMPUTE_GL as GL_TRUTH                      } from '../../subworkflows/local/compute_gl'
+include { COMPUTE_GL as GL_INPUT                      } from '../../subworkflows/local/compute_gl'
+include { VCF_CONCATENATE_BCFTOOLS as CONCAT_GLIMPSE1 } from '../../subworkflows/local/vcf_concatenate_bcftools'
+
+// QUILT subworkflows
 include { MAKE_CHUNKS                                 } from '../../subworkflows/local/make_chunks/make_chunks'
 include { IMPUTE_QUILT                                } from '../../subworkflows/local/impute_quilt/impute_quilt'
-include { VCF_CONCATENATE_BCFTOOLS as CONCAT_IMPUT    } from '../../subworkflows/local/vcf_concatenate_bcftools'
+include { VCF_CONCATENATE_BCFTOOLS as CONCAT_QUILT    } from '../../subworkflows/local/vcf_concatenate_bcftools'
+
+// STITCH subworkflows
+include { PREPARE_INPUT_STITCH                        } from '../../subworkflows/local/prepare_input_stitch/prepare_input_stitch'
+include { BAM_IMPUTE_STITCH                           } from '../../subworkflows/local/bam_impute_stitch/bam_impute_stitch'
+include { VCF_CONCATENATE_BCFTOOLS as CONCAT_STITCH   } from '../../subworkflows/local/vcf_concatenate_bcftools'
+
+// CONCAT subworkflows
 include { VCF_CONCATENATE_BCFTOOLS as CONCAT_TRUTH    } from '../../subworkflows/local/vcf_concatenate_bcftools'
 include { VCF_CONCATENATE_BCFTOOLS as CONCAT_PANEL    } from '../../subworkflows/local/vcf_concatenate_bcftools'
+
+// Concordance subworkflows
+include { VCF_CONCORDANCE_GLIMPSE2                    } from '../../subworkflows/local/vcf_concordance_glimpse2'
+

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
@@ -52,6 +71,7 @@ workflow PHASEIMPUTE {
     ch_region   // channel: region to use             [ [chr, region], region]
     ch_depth    // channel: depth select              [ [depth], depth ]
     ch_map      // channel: genetic map               [ [chr], map]
+    ch_posfile  // channel: posfile                   [ [chr], txt]
     ch_versions // channel: versions of software used

     main:
@@ -61,7 +81,7 @@ workflow PHASEIMPUTE {
     //
     // Simulate data if asked
     //
-    if (params.step == 'simulate' || params.step == 'all') {
+    if (params.step.split(',').contains("simulate") || params.step.split(',').contains("all")) {
         // Output channel of simulate process
         ch_sim_output = Channel.empty()
@@ -99,37 +119,56 @@ workflow PHASEIMPUTE {
     //
     // Prepare panel
     //
-    if (params.step == 'impute' || params.step == 'panel_prep' || params.step == 'validate' || params.step == 'all') {
-        // Remove if necessary "chr"
+    if (params.step.split(',').contains("panelprep") || params.step.split(',').contains("validate") || params.step.split(',').contains("all")) {
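+        // params.step is now parsed as a comma-separated list, so a single run
+        // can combine stages, e.g. `--step panelprep,impute` (hypothetical
+        // invocation); any step list containing "panelprep", "validate" or
+        // "all" enters this panel preparation block.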
+        // Check chr prefix and remove if necessary
         VCF_CHR_CHECK(ch_panel, ch_fasta)
         ch_versions = ch_versions.mix(VCF_CHR_CHECK.out.versions)

-        // Prepare the panel
-        GET_PANEL(VCF_CHR_CHECK.out.vcf, ch_fasta)
-        ch_versions = ch_versions.mix(GET_PANEL.out.versions)
-        ch_panel_sites_tsv = GET_PANEL.out.panel
-            .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index
-                -> [metaPC, sites, tsv]
-            }
-        CONCAT_PANEL(GET_PANEL.out.panel
-            .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index
-                -> [[id:metaPC.panel], sites, s_index]
-            }
+        // Normalize indels in panel
+        VCF_NORMALIZE_BCFTOOLS(VCF_CHR_CHECK.out.vcf, ch_fasta)
+        ch_versions = ch_versions.mix(VCF_NORMALIZE_BCFTOOLS.out.versions)
+
+        // Extract sites from normalized VCF
+        VCF_SITES_EXTRACT_BCFTOOLS(VCF_NORMALIZE_BCFTOOLS.out.vcf_tbi)
+        ch_versions = ch_versions.mix(VCF_SITES_EXTRACT_BCFTOOLS.out.versions)
+
+        // Phase panel
+        VCF_PHASE_PANEL(VCF_SITES_EXTRACT_BCFTOOLS.out.vcf_tbi,
+                        VCF_SITES_EXTRACT_BCFTOOLS.out.vcf_tbi,
+                        VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites,
+                        VCF_SITES_EXTRACT_BCFTOOLS.out.panel_tsv)
+        ch_versions = ch_versions.mix(VCF_PHASE_PANEL.out.versions)
+
+        // Generate channels (to be simplified)
+        ch_panel_sites_tsv = VCF_PHASE_PANEL.out.panel
+            .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index
+                -> [metaPC, sites, tsv]
+            }
+        CONCAT_PANEL(VCF_PHASE_PANEL.out.panel
+            .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index
+                -> [[id:metaPC.panel], sites, s_index]
+            }
         )
         ch_panel_sites = CONCAT_PANEL.out.vcf_tbi_join
         ch_versions = ch_versions.mix(CONCAT_PANEL.out.versions)
-        ch_panel_phased = GET_PANEL.out.panel
+        ch_panel_phased = VCF_PHASE_PANEL.out.panel
             .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index
                 -> [metaPC, phased, p_index]
             }

+        // Prepare posfile for STITCH
+        PREPARE_POSFILE_TSV(VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites, ch_fasta)
+        ch_versions = ch_versions.mix(PREPARE_POSFILE_TSV.out.versions)

-
-        ch_versions = ch_versions.mix(GET_PANEL.out.versions)
+        // Create chunks from reference VCF
+        MAKE_CHUNKS(ch_panel)
+        ch_versions = ch_versions.mix(MAKE_CHUNKS.out.versions)
+
     }

-    if (params.step == 'impute' || params.step == 'all') {
+    if (params.step.split(',').contains("impute") || params.step.split(',').contains("all")) {
         // Output channel of input process
         ch_impute_output = Channel.empty()

-        if (params.tools.contains("glimpse1")) {
+        if (params.tools.split(',').contains("glimpse1")) {
             println "Impute with Glimpse1"
             // Glimpse1 subworkflow
             GL_INPUT( // Compute GL for input data once per panel
@@ -164,35 +203,73 @@ workflow PHASEIMPUTE {
             // Add to output channel
             ch_impute_output = ch_impute_output.mix(output_glimpse1)
+
+            // Concatenate by chromosomes
+            CONCAT_GLIMPSE1(output_glimpse1)
+            ch_versions = ch_versions.mix(CONCAT_GLIMPSE1.out.versions)
+
+            // Add results to input validate
+            ch_input_validate = ch_input_validate.mix(CONCAT_GLIMPSE1.out.vcf_tbi_join)
+
         }

-        if (params.tools.contains("glimpse2")) {
+        if (params.tools.split(',').contains("glimpse2")) {
             error "Glimpse2 not yet implemented"
             // Glimpse2 subworkflow
         }

-        if (params.tools.contains("quilt")) {
-            print("Impute with quilt")
-            // Quilt subworkflow
+        if (params.tools.split(',').contains("stitch")) {
+            print("Impute with STITCH")
+
+            // Use the user-supplied posfile if provided, otherwise derive it from the reference panel
+            if (params.posfile) { // User-supplied posfile
+                ch_posfile = ch_posfile
+            } else if (params.panel && params.step.split(',').contains("panelprep")) { // Panelprep posfile
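+                // Reuse the posfile generated during the panelprep stage by
+                // PREPARE_POSFILE_TSV; this branch is only reachable when
+                // --panel is set and "panelprep" is in the step list.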
+                ch_posfile = PREPARE_POSFILE_TSV.out.posfile
+            } else {
+                error "No posfile was provided and no reference panel preparation ('panelprep') was run"
+            }
+
+            // Prepare inputs
+            PREPARE_INPUT_STITCH(ch_posfile, ch_fasta, ch_input_impute)
+            ch_versions = ch_versions.mix(PREPARE_INPUT_STITCH.out.versions)
+
+            // Impute with STITCH
+            BAM_IMPUTE_STITCH(PREPARE_INPUT_STITCH.out.stitch_parameters,
+                              PREPARE_INPUT_STITCH.out.stitch_samples,
+                              ch_fasta)
+            ch_versions = ch_versions.mix(BAM_IMPUTE_STITCH.out.versions)
+
+            // Output channel to concat
+            ch_impute_output = ch_impute_output.mix(BAM_IMPUTE_STITCH.out.vcf_tbi)
+
+            // Concatenate by chromosomes
+            CONCAT_STITCH(BAM_IMPUTE_STITCH.out.vcf_tbi)
+            ch_versions = ch_versions.mix(CONCAT_STITCH.out.versions)
+
+            // Add results to input validate
+            ch_input_validate = ch_input_validate.mix(CONCAT_STITCH.out.vcf_tbi_join)

-            // Create chunks from reference VCF
-            MAKE_CHUNKS(ch_panel, ch_fasta)
+        }
+
+        if (params.tools.split(',').contains("quilt")) {
+            print("Impute with QUILT")

-            // Impute BAMs with QUILT
-            IMPUTE_QUILT(MAKE_CHUNKS.out.ch_hap_legend, ch_input_impute, MAKE_CHUNKS.out.ch_chunks)
-            ch_versions = ch_versions.mix(IMPUTE_QUILT.out.versions)
+            // Impute BAMs with QUILT
+            IMPUTE_QUILT(VCF_NORMALIZE_BCFTOOLS.out.hap_legend, ch_input_impute, MAKE_CHUNKS.out.chunks)
+            ch_versions = ch_versions.mix(IMPUTE_QUILT.out.versions)
+
+            // Add to output channel
+            ch_impute_output = ch_impute_output.mix(IMPUTE_QUILT.out.vcf_tbi)

-            // Add to output channel
-            ch_impute_output = ch_impute_output.mix(IMPUTE_QUILT.out.vcf_tbi)
+            // Concatenate by chromosomes
+            CONCAT_QUILT(IMPUTE_QUILT.out.vcf_tbi)
+            ch_versions = ch_versions.mix(CONCAT_QUILT.out.versions)
+
+            // Add results to input validate
+            ch_input_validate = ch_input_validate.mix(CONCAT_QUILT.out.vcf_tbi_join)
         }

-        // Concatenate by chromosomes
-        CONCAT_IMPUT(ch_impute_output)
-        ch_versions = ch_versions.mix(CONCAT_IMPUT.out.versions)
-        ch_input_validate = ch_input_validate.mix(CONCAT_IMPUT.out.vcf_tbi_join)
     }
-    }
-
-    if (params.step == 'validate' || params.step == 'all') {
+    if (params.step.split(',').contains("validate") || params.step.split(',').contains("all")) {
         ch_truth_vcf = Channel.empty()

         // Get extension of input files
         truth_ext = getAllFilesExtension(ch_input_validate_truth)
@@ -219,13 +296,13 @@ workflow PHASEIMPUTE {
             .mix(GL_TRUTH.out.vcf)

         // Concatenate by chromosomes
-        CONCAT_TRUTH(ch_truth_vcf)
-        ch_versions = ch_versions.mix(CONCAT_TRUTH.out.versions)
+        // CONCAT_TRUTH(ch_truth_vcf)
+        // ch_versions = ch_versions.mix(CONCAT_TRUTH.out.versions)

         // Compute concordance analysis
         VCF_CONCORDANCE_GLIMPSE2(
             ch_input_validate,
-            CONCAT_TRUTH.out.vcf_tbi_join,
+            ch_truth_vcf,
             ch_panel_sites,
             ch_region
         )
@@ -233,7 +310,7 @@ workflow PHASEIMPUTE {
         ch_versions = ch_versions.mix(VCF_CONCORDANCE_GLIMPSE2.out.versions)
     }

-    if (params.step == 'refine') {
+    if (params.step.split(',').contains("refine")) {
         error "refine step not yet implemented"
     }
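
--
Example (a minimal sketch, not part of the patch): with the comma-separated
step/tools parsing introduced above, a hypothetical invocation combining panel
preparation and imputation could look like

    nextflow run nf-core/phaseimpute \
        -profile docker \
        --step panelprep,impute \
        --tools stitch,quilt \
        --input samplesheet.csv \
        --panel panel.csv \
        --posfile tests/csv/posfile.csv \
        --outdir results

where samplesheet.csv and panel.csv are placeholder inputs; only
tests/csv/posfile.csv ships with this change.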