diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a2539e98..d93a3b47 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,7 +18,7 @@ env: NXF_ANSI_LOG: false NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity - NFTEST_VER: "0.9.0" + NFT_VER: "0.9.2" NFT_WORKDIR: "~" NFT_DIFF: "pdiff" NFT_DIFF_ARGS: "--line-numbers --expand-tabs=2" @@ -97,10 +97,9 @@ jobs: python -m pip install --upgrade pip pip install pdiff - - name: Install nf-test - run: | - wget -qO- https://code.askimed.com/install/nf-test | bash -s $NFTEST_VER - sudo mv nf-test /usr/local/bin/ + - uses: nf-core/setup-nf-test@v1 + with: + version: ${{ env.NFT_VER }} - name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.TEST_PROFILE }} | ${{ matrix.profile }}" run: | diff --git a/CITATIONS.md b/CITATIONS.md index ef1d9743..ed82cbf7 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,9 +10,9 @@ ## Pipeline tools -- [QUILT](https://pubmed.ncbi.nlm.nih.gov/34083788/) +- [bcftools](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3198575/) -> Davies, R. W., Kucka, M., Su, D., Shi, S., Flanagan, M., Cunniff, C. M., ... & Myers, S. (2021). Rapid genotype imputation from sequence with reference panels. Nature genetics, 53(7), 1104-1111. +> Li, H. (2011). A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. Bioinformatics, 27(21), 2987-2993. - [GLIMPSE](https://www.nature.com/articles/s41588-020-00756-0) @@ -22,21 +22,21 @@ > Rubinacci, S., Hofmeister, R. J., Sousa da Mota, B., & Delaneau, O. (2023). Imputation of low-coverage sequencing data from 150,119 UK Biobank genomes. Nature genetics 55, 1088–1090. -- [STITCH](https://doi.org/10.1038/ng.3594) +- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) -> Davies, R. W., Flint, J., Myers, S., & Mott, R.(2016). Rapid genotype imputation from sequence without reference panels. Nature genetics 48, 965–969. +> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. -- [Shapeit](https://doi.org/10.1038/s41588-023-01415-w) +- [QUILT](https://pubmed.ncbi.nlm.nih.gov/34083788/) -> Hofmeister RJ, Ribeiro DM, Rubinacci S., Delaneau O. (2023). Accurate rare variant phasing of whole-genome and whole-exome sequencing data in the UK Biobank. Nature Genetics doi: https://doi.org/10.1038/s41588-023-01415-w +> Davies, R. W., Kucka, M., Su, D., Shi, S., Flanagan, M., Cunniff, C. M., ... & Myers, S. (2021). Rapid genotype imputation from sequence with reference panels. Nature genetics, 53(7), 1104-1111. -- [bcftools](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3198575/) +- [Shapeit](https://doi.org/10.1038/s41588-023-01415-w) -> Li, H. (2011). A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. Bioinformatics, 27(21), 2987-2993. +> Hofmeister RJ, Ribeiro DM, Rubinacci S., Delaneau O. (2023). Accurate rare variant phasing of whole-genome and whole-exome sequencing data in the UK Biobank. 
Nature Genetics doi: https://doi.org/10.1038/s41588-023-01415-w -- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) +- [STITCH](https://doi.org/10.1038/ng.3594) -> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +> Davies, R. W., Flint, J., Myers, S., & Mott, R. (2016). Rapid genotype imputation from sequence without reference panels. Nature genetics 48, 965–969. ## Software packaging/containerisation tools diff --git a/README.md b/README.md index 81b2dcad..215934ce 100644 --- a/README.md +++ b/README.md @@ -19,11 +19,33 @@ ## Introduction -**nf-core/phaseimpute** is a bioinformatics pipeline to phase and impute genetic data. The pipeline is constituted of five main steps: +**nf-core/phaseimpute** is a bioinformatics pipeline to phase and impute genetic data. -| Metro map | Modes | -| ------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| metromap | - **Check chromosomes names**: Validates the presence of the different contigs in all variants and alignment files, ensuring data compatibility for further processing
- **Panel preparation**: Perfoms the phasing, QC, variant filtering, variant annotation of the reference panel
- **Imputation**: Imputes genotypes in the target dataset using the reference panel
- **Simulate**: Generates simulated datasets from high-quality target data for testing and validation purposes.
- **Concordance**: Evaluates the accuracy of imputation by comparing the imputed data against a truth dataset. | +metromap + +The whole pipeline consists of five main steps, each of which can be run separately and independently. Users are not required to run all steps sequentially and can select specific steps based on their needs: + +1. **QC: Chromosome Name Check**: Ensures compatibility by validating that all expected contigs are present in the variant and alignment files. + +2. **Simulation (`--simulate`)**: Generates artificial datasets by downsampling high-density data to simulate low-pass genetic information. This enables the comparison of imputation results against a high-quality dataset (truth set). Simulations may include: + + - **Low-pass data generation** by downsampling BAM or CRAM files with [`samtools view -s`](https://www.htslib.org/doc/samtools-view.html) at different depths. + +3. **Panel Preparation (`--panelprep`)**: Prepares the reference panel through phasing, quality control, variant filtering, and annotation. Key processes include: + + - **Normalization** of the reference panel to retain essential variants. + - **Phasing** of haplotypes in the reference panel using [Shapeit5](https://odelaneau.github.io/shapeit5/). + - **Chunking** of the reference panel into specific regions across chromosomes. + - **Position Extraction** for targeted imputation sites. + +4. **Imputation (`--impute`)**: This is the primary step, where genotypes in the target dataset are imputed using the prepared reference panel. The main steps are: + + - **Imputation** of the target dataset using tools like [Glimpse1](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html), [Glimpse2](https://odelaneau.github.io/GLIMPSE/), [Stitch](https://github.com/rwdavies/stitch), or [Quilt](https://github.com/rwdavies/QUILT). + - **Ligation** of imputed chunks to produce a final VCF file per sample, with all chromosomes unified. + +5. **Validation (`--validate`)**: Assesses imputation accuracy by comparing the imputed dataset to a truth dataset. This step leverages the [Glimpse2](https://odelaneau.github.io/GLIMPSE/) concordance process to summarize differences between two VCF files. + +For more detailed instructions, please refer to the [usage documentation](https://nf-co.re/phaseimpute/usage). ## Usage @@ -32,9 +54,7 @@ The primary function of this pipeline is to impute a target dataset based on a phased panel. Begin by preparing a samplesheet with your input data, formatted as follows: -`samplesheet.csv`: - -```csv +```csv title="samplesheet.csv" sample,file,index SAMPLE_1X,/path/to/.,/path/to/. ``` @@ -43,7 +63,7 @@ Each row represents either a bam or a cram file along with its corresponding ind For certain tools and steps within the pipeline, you will also need to provide a samplesheet for the reference panel. Here's an example of what a final samplesheet for a reference panel might look like, covering three chromosomes: -```csv +```csv title="panel.csv" panel,chr,vcf,index Phase3,1,ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.csi Phase3,2,ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.csi @@ -52,16 +72,11 @@ Phase3,3,ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf. 
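The low-pass simulation described in step 2 of the overview above is built on `samtools view -s`. As a minimal sketch of the underlying mechanism (the file names, seed, and depths here are hypothetical; the pipeline runs the equivalent internally):

```bash
# Downsample a ~30x BAM to ~1x: the subsampling fraction is simply
# target depth / original depth. In `samtools view -s SEED.FRAC`, the
# integer part seeds the random number generator and the fractional
# part is the proportion of read pairs kept (here seed 42, 1/30 ≈ 0.033).
samtools view -b -s 42.033 -o sample_1x.bam sample_30x.bam
samtools index sample_1x.bam
```

With `--steps simulate`, the pipeline produces such downsampled files (and their indexes) for every sample in the input samplesheet at the requested `--depth`.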
## Running the pipeline -Execute the pipeline with the following command: +Run one of the steps of the pipeline (imputation with glimpse1) using the following command and test profile: ```bash nextflow run nf-core/phaseimpute \ - -profile \ - --input \ - --genome "GRCh38" \ - --panel \ - --steps "panelprep,impute" \ - --tools "glimpse1" \ + -profile test, \ --outdir ``` @@ -70,18 +85,6 @@ nextflow run nf-core/phaseimpute \ For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/phaseimpute/usage) and the [parameter documentation](https://nf-co.re/phaseimpute/parameters). -## Description of the different steps of the pipeline - -Here is a short description of the different steps of the pipeline. -For more information please refer to the [usage documentation](https://nf-co.re/phaseimpute/usage). - -| steps | Flow chart | Description | -| --------------- | -------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **--panelprep** | Panel preparation | The preprocessing mode is responsible for preparing multiple input files that will be used by the phasing and imputation process.
The main processes are :
- **Haplotypes phasing** of the reference panel using [**Shapeit5**](https://odelaneau.github.io/shapeit5/).
- **Normalize** the reference panel to select only the necessary variants.
- **Chunking the reference panel** into a subset of regions for all the chromosomes.
- **Extract** the positions where to perform the imputation. | -| **--impute** | Impute target | The imputation mode is the core mode of this pipeline.
It consists of 3 main steps:
- **Imputation**: Impute the target dataset on the reference panel using either:
  - [**Glimpse1**](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html): It comes with the necessity to compute the genotype likelihoods of the target dataset (done using [`bcftools mpileup`](https://samtools.github.io/bcftools/bcftools.html#mpileup)).
  - [**Glimpse2**](https://odelaneau.github.io/GLIMPSE/)
  - [**Stitch**](https://github.com/rwdavies/stitch) This step does not require a reference panel but needs to merge the samples.
  - [**Quilt**](https://github.com/rwdavies/QUILT)
- **Ligation**: all the different chunks are merged together then all chromosomes are reunited to output one VCF per sample. | -| **--simulate** | simulate_metro | The simulation mode is used to create artificial low informative genetic information from high density data. This allows the comparison of the imputed result to a _truth_ and therefore evaluates the quality of the imputation.
For the moment it is possible to simulate:
- Low-pass data by **downsample** BAM or CRAM using [`samtools view -s`](https://www.htslib.org/doc/samtools-view.html) at different depth. | -| **--validate** | concordance_metro | This mode compares two VCF files together to compute a summary of the differences between them.
This step uses [**Glimpse2**](https://odelaneau.github.io/GLIMPSE/) concordance process. | - ## Pipeline output To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/phaseimpute/results) tab on the nf-core website pipeline page. diff --git a/assets/chr_rename_del.txt b/assets/chr_rename_del.txt index a85016b6..f324781a 100644 --- a/assets/chr_rename_del.txt +++ b/assets/chr_rename_del.txt @@ -36,4 +36,4 @@ chr35 35 chr36 36 chr37 37 chr38 38 -chr39 X +chrX X diff --git a/docs/development.md b/docs/development.md deleted file mode 100644 index 62c5e17a..00000000 --- a/docs/development.md +++ /dev/null @@ -1,96 +0,0 @@ -# Development - -## Style - -Names of releases are composed of a color + a dog breed. - -## Features and tasks - -- [x] Add automatic detection of chromosome name to create a renaming file for the vcf files -- [] Add automatic detection of chromosome name to create a renaming file for the bam files -- [] Make the different tests workflows work - - [x] Simulation - - [x] Validation - - [] Preprocessing - - [x] Imputation - - [] Validation - - [] Postprocessing -- [] Add support of `anyOf()` or `oneOf()` in the nf-core schema for the map, panel and region files -- [] Add nf-test for all modules and subworkflows -- [] Remove all TODOs -- [] Check if panel is necessary depending on the tool selected -- [x] Set modules configuration as full path workflow:subworkflow:module -- [] Where should the map file go (separate csv or in panel csv) -- [] Add support for imputation by individuals or by groups of individuals - -## Run tests - -### Launch with Nextflow - -```bash -nextflow run main.nf -profile singularity,test --outdir results -resume -nextflow run main.nf -profile singularity,test_sim --outdir results -resume -nextflow run main.nf -profile singularity,test_validate --outdir results -resume -nextflow run main.nf -profile singularity,test_all --outdir results -resume -nextflow run main.nf -profile singularity,test_quilt --outdir results -resume -``` - -### Launch with nf-test - -```bash -nf-test test --verbose --profile singularity --tag test_all -nf-test test --verbose --profile singularity --tag test_all --update-snapshot #To update the snaps of a given test -``` - -## Problematic - -### Channel management and combination - -If only one specie at a time, then only one fasta file and only one map file (normally ?) -Do we want to be able to compute multiple panel at the same time ? -If so we need to correctly combine the different channel depending on their meta map. - -All channel need to be identified by a meta map as follow: - -- I : individual id -- P : panel id -- R : region used -- M : map used -- T : tool used -- G : reference genome used (is it needed ?) -- S : simulation (depth or genotype array) - -## Open questions - -How to use different schema ? - -- Use nf-validation - For the moment use different input / steps. - In the futur, if/else logic will be added in the yml nf-core schema. - -What's the use of dumpcustomsoftware ? -Will be deleted - -How to add to multiQC ? -Take exemple on Sarek. -All report file are in a dedicated channel. - -How to add nf-test ? -Add in `tests` folder and run with tag. -Add tags.yml - -How to run stub tests ? -Use nf-test - -How to run the tests ? -nf-test option tag - -What's the use of the template branch ? -TEMPLATE branch have the skeleton for all common part of the pipeline. -Will be asked to be merged to dev from time to time. - -When is it necessary to merge to master / main ? 
-First release, create a false PR to first commit that will be checked by whole community + 2 reviewers approval. -What should be the Github action ? -All GA come from the TEMPLATE branch. diff --git a/docs/images/Logo.svg b/docs/images/Logo.svg deleted file mode 100644 index 9330090c..00000000 --- a/docs/images/Logo.svg +++ /dev/null @@ -1,208 +0,0 @@ - - - - diff --git a/docs/images/metro/metro.py b/docs/images/metro/metro.py index e74bf9ed..94417269 100644 --- a/docs/images/metro/metro.py +++ b/docs/images/metro/metro.py @@ -3,7 +3,7 @@ """ Created on 25/05/2024 @author: LouisLeNezet -Main script fo the metrop maps +Main script for the metro maps """ import argparse diff --git a/docs/images/metro/phaseimpute.drawio.png b/docs/images/metro/phaseimpute.drawio.png new file mode 100644 index 00000000..5b11e289 Binary files /dev/null and b/docs/images/metro/phaseimpute.drawio.png differ diff --git a/docs/images/metro/txt2image.md b/docs/images/metro/txt2image.md deleted file mode 100644 index 95d6b2ce..00000000 --- a/docs/images/metro/txt2image.md +++ /dev/null @@ -1,21 +0,0 @@ -# Install desktop app - -Got to `https://github.com/jgraph/drawio-desktop/releases/` and download the latest version for your OS. - -To install it on wsl - -```bash -sudo apt install /mnt/c/Users/llenezet/Dowlnoads/drawio-amd64-21.6.8.deb -``` - -To use drawio - -```bash -drawio --version -drawio docs/images/metro/MetroMap.xml --export --format png --page-index 2 --layers 1 --output docs/images/metro/MetroMap.png --scale 3 -drawio docs/images/metro/MetroMap.xml --export --format svg --page-index 2 --layers 2,3,4,5 --output docs/images/metro/MetroMap.svg -drawio docs/images/metro/MetroMap.xml --export --format png --page-index 3 --layers 1 --output docs/images/metro/Simulate.png --scale 3 -drawio docs/images/metro/MetroMap.xml --export --format png --page-index 4 --layers 0 --output docs/images/metro/PanelPrep.png --scale 3 -drawio docs/images/metro/MetroMap.xml --export --format png --page-index 5 --layers 1 --output docs/images/metro/Impute.png --scale 3 -drawio docs/images/metro/MetroMap.xml --export --format png --page-index 6 --layers 0 --output docs/images/metro/Validate.png --scale 3 -``` diff --git a/docs/output.md b/docs/output.md index 2c312fa5..d3a43a7c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,7 +10,11 @@ The directories listed below will be created in the results directory after the ## Panel preparation outputs `--steps panelprep` -This steps of the pipeline performs a QC of the reference panel data and produces the necessary files for imputation (`--steps impute`). It has two optional modes: reference panel phasing with SHAPEIT5 and removal of specified samples from reference panel. +This step of the pipeline performs a QC of the reference panel data and produces the necessary files for imputation (`--steps impute`). +It has two optional modes: + +- reference panel phasing with SHAPEIT5 +- removal of specified samples from the reference panel.
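Both optional modes can be enabled in a single `--steps panelprep` run. The sketch below is a hypothetical invocation assembled from flags documented in this repository (`--phase`, `--remove_samples`, `--compute_freq`); the sample IDs and file names are placeholders:

```bash
# Hypothetical panelprep run: phase the panel with SHAPEIT5 (--phase),
# drop two placeholder sample IDs from the panel (--remove_samples),
# and recompute the AC/AN fields afterwards (--compute_freq).
nextflow run nf-core/phaseimpute \
    --steps panelprep \
    --panel panel.csv \
    --phase \
    --remove_samples HG00096,HG00097 \
    --compute_freq \
    --genome GRCh38 \
    --outdir results \
    -profile docker
```

Recomputing allele frequencies after removing samples matters because the panel's AC/AN fields no longer reflect the remaining samples, as noted in the `compute_freq` description in `nextflow_schema.json`.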
- [Normalize reference panel](#panel-directory) - Remove multiallelic sites from the reference panel and compute allele frequencies if needed - [Convert](#haplegend-directory) - Convert reference panel to .hap and .legend files @@ -20,7 +24,7 @@ This steps of the pipeline performs a QC of the reference panel data and produce The directory structure from `--steps panelprep` is: -``` +```tree ├── panel ├── haplegend ├── sites @@ -36,12 +40,14 @@ The directory structure from `--steps panelprep` is: Output files - `prep_panel/panel/` - - `*.vcf.gz`: The reference panel vcf after all the preprocessing is done. + - `*.vcf.gz`: The reference panel VCF after all the preprocessing is done. - `*.tbi*`: A tbi for the prepared reference panel. -A directory containing the reference panel per chromosome after preprocessing. The files will be normalized if the flag `--normalize` is used (with `_normalized` suffix). The files will have their allele frequency computed if the flaq `--compute_freq` is used (with `_fixup` suffix). The files will be phased if the flag `--phase` is used (with `_phased` suffix). +A directory containing the reference panel per chromosome after preprocessing. +The files will be normalized if the flag `--normalize` is used (with `_normalized` suffix). The files will have their allele frequency computed if the flag `--compute_freq` is used (with `_fixup` suffix). +The files will be phased if the flag `--phase` is used (with `_phased` suffix). ### Haplegend directory @@ -54,7 +60,7 @@ A directory containing the reference panel per chromosome after preprocessing. T -[`bcftools convert`](https://samtools.github.io/bcftools/bcftools.html#convert) aids in the conversion of vcf files to .hap and .legend files. A .samples file is also generated. Once that you have generated the hap and legend files for your reference panel, you can skip the reference preparation steps and directly submit these files for imputation. The hap and legend files are input files used with `--tools quilt`. +[`bcftools convert`](https://samtools.github.io/bcftools/bcftools.html#convert) aids in the conversion of VCF files to .hap and .legend files. A .samples file is also generated. Once you have generated the hap and legend files for your reference panel, you can skip the reference preparation steps and directly submit these files for imputation. The hap and legend files are input files used with `--tools quilt`. ### Sites directory @@ -62,26 +68,20 @@ A directory containing the reference panel per chromosome after preprocessing. T Output files - `prep_panel/sites/` - - `vcf/` - - `*.vcf.gz`: VCF with biallelic SNPs only. - - `*.csi`: Index file for VCF. - - `tsv/` - - `*.txt.gz`: TXT file for biallelic SNPs. - - `*.tbi`: Index file for TSV. + - `*.vcf.gz`: VCF with biallelic SNPs only. + - `*.csi`: Index file for VCF. [`bcftools query`](https://samtools.github.io/bcftools/bcftools.html#query) produces VCF (`*.vcf.gz`) files per chromosome. These QCed VCFs can be gathered into a csv and used with all the tools in `--steps impute` using the flag `--panel`. -In addition, [bcftools query](https://samtools.github.io/bcftools/bcftools.html#query) produces tab-delimited files (`*_tsv.txt`) and, together with the VCFs, they can be gathered into a samplesheet and directly submitted for imputation with `--tools glimpse1,stitch` and `--posfile`. ### Chunks directory
Output files - `prep_panel/chunks/` - - `*.txt`: TXT file containing the chunks obtained from running Glimpse chunks. + - `*.txt`: TXT file containing the chunks obtained from running `GLIMPSE1_CHUNK`.
@@ -95,7 +95,7 @@ In addition, [bcftools query](https://samtools.github.io/bcftools/bcftools.html# - `prep_panel/csv/` - `chunks.csv`: A csv containing the list of chunks obtained for each chromosome and panel. - `panel.csv`: A csv containing the final phased and prepared for each chromosome and input panel. - - `posfile.csv`: A csv containing the final list of panel positions, in vcf and tsv, for each chromosome and input panel. + - `posfile.csv`: A csv containing the final list of panel positions, in VCF and tsv, for each chromosome and input panel. @@ -107,12 +107,12 @@ The results from steps impute will have the following directory structure: Output files - `imputation/csv/` - - `impute.csv`: A single csv containing the path to a vcf and its index, of each imputed sample with their corresponding tool. + - `impute.csv`: A single csv containing the path to a VCF and its index, of each imputed sample with their corresponding tool. - `imputation/[glimpse1,glimpse2,quilt,stitch]/` - - `concat/*.vcf.gz`: A vcf of each imputed sample. - - `concat/*.vcf.gz.tbi`: A tbi for the imputed vcf. - - `samples/*.vcf.gz`: A vcf of each imputed sample. - - `samples/*.vcf.gz.tbi`: A tbi for the imputed vcf. + - `concat/*.vcf.gz`: A VCF of each imputed sample. + - `concat/*.vcf.gz.tbi`: A tbi for the imputed VCF. + - `samples/*.vcf.gz`: A VCF of each imputed sample. + - `samples/*.vcf.gz.tbi`: A tbi for the imputed VCF. @@ -122,8 +122,8 @@ The results from steps impute will have the following directory structure: Reports contain useful metrics and pipeline information for the different modes. -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline. +- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution. ### MultiQC @@ -137,7 +137,6 @@ Reports contain useful metrics and pipeline information for the different modes. -[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. [MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . diff --git a/docs/usage.md b/docs/usage.md index 9a5dbd26..986a7e5f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -27,16 +27,13 @@ sample,file,index SAMPLE1,AEG588A1.bam,AEG588A1.bai SAMPLE2,AEG588A2.bam,AEG588A2.bai SAMPLE3,AEG588A3.bam,AEG588A3.bai -SAMPLE4,AEG588A4.bam,AEG588A4.bai -SAMPLE5,AEG588A5.bam,AEG588A5.bai -SAMPLE6,AEG588A6.bam,AEG588A6.bai ``` -| Column | Description | -| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. 
Spaces in sample names are automatically converted to underscores (`_`). | -| `file` | Full path to an alignment or variant file. File has to have the extension ".bam", ".cram" or ".vcf", ".bcf" optionally compressed with bgzip ".gz". All files need to have the same extension. | -| `index` | Full path to index file. File has to be have the extension ".bai", ".crai", "csi", or "tbi". All files need to have the same extension. | +| Column | Description | +| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. Spaces in sample names are automatically converted to underscores (`_`). | +| `file` | Full path to an alignment or variant file. File has to have the extension ".bam", ".cram", ".vcf" or ".bcf", optionally compressed with bgzip (".gz"). All files in this column need to have the same extension. | +| `index` | Full path to index file. File has to have the extension ".bai", ".crai", ".csi", or ".tbi". All files in this column need to have the same extension. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. @@ -128,11 +125,11 @@ or you can specify a custom genome using: --fasta Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz ``` -## Running the pipeline +## Running the pipeline: quick example A quick running example only with the imputation step can be performed as follows: -``` +```bash nextflow run nf-core/phaseimpute \ --input samplesheet.csv \ --steps impute \ @@ -143,7 +140,6 @@ nextflow run nf-core/phaseimpute \ --panel panel.csv \ --tools glimpse1 \ -profile docker - ``` The typical command for running the pre-processing of the panel and imputation of samples is shown below: @@ -191,19 +187,15 @@ Do not use `-c ` to specify parameters as this will result in errors. Cust You can also generate `YAML` or `JSON` files easily using the [nf-core/launch](https://nf-co.re/launch) tool, which guides you creating the files that can be used directly with `-params-file`. -### Check of the contigs name - -The pipeline parallelize the imputation process across contigs. To do so it will use either the `--regions` samplesheet or the `.fai` to extract the genomic region to process. -From all those contigs some might not be present in the `--panel`, `--posfile`, `--chunks`, `--map` (column `chr`) or in the `--fasta`. In this case the pipeline will warn you that some of the contigs are absent in some of the file specified and will only parallelize on the intersection of all contigs. -Afterwards the remaining contigs presence will be checked with the `CHECKCHR` pipeline to ensure that they are present in each `--input` and `--input_truth` file and that also in the individuals reference panel files. - -### Running the pipeline +## Running the pipeline: detailed instructions nf-core/phaseimpute can be started at different points in the analysis by setting the flag `--steps` and the available options `[simulate, panelprep, impute, validate, all]`. You can also run several steps simultaneously by listing the required processes as `--steps panelprep,impute` or you can choose to run all steps sequentially by using `--steps all`. -### Start with simulation `--steps simulate` +## Start with simulation `--steps simulate` + +simulate_metro -This steps of the pipeline allows to create synthetic low-coverage input files by downsizing high density input data.
A typical use case is to obtain low-coverage input data from a sequenced sample. This method is useful for comparing the imputation results to the truth and evaluate the quality of the imputation. You can skip this steps if you already have low-pass genome sequencing data. A sample command for this steps is: +This step of the pipeline creates synthetic low-coverage input files by downsampling high-density input data. A typical use case is to obtain low-coverage input data from a sequenced sample. This method is useful for comparing the imputation results to the truth and evaluating the quality of the imputation. You can skip this step if you already have low-pass genome sequencing data. A sample command for this step is: ```bash nextflow run nf-core/phaseimpute \ --input samplesheet.csv \ --steps simulate \ --depth 1 \ --genome GRCh38 \ --outdir results \ -profile docker ``` The required flags for this mode are: @@ -224,7 +216,9 @@ The required flags for this mode are: You can find an overview of the results produced by this step in the [Output](output.md). -### Start with panel preparation `--steps panelprep` +## Start with panel preparation `--steps panelprep` + +Panel preparation This steps pre-processes the reference panel in order to be ready for imputation. There are a few quality control steps that are applied to reference panels. These include actions such as removing multiallelic SNPs and indels and removing certain samples from the reference panel (such as related samples). In addition, chunks are produced which are then used in the imputation steps. It is recommended that this steps is run once and the produced files are saved, to minimize the cost of reading the reference panel each time. Then, the output files from `--steps panelprep` can be used as input in the subsequent imputation steps, such as `--steps impute`. @@ -249,7 +243,9 @@ The required flags for this mode are: You can find an overview of the results produced by this steps in the [Output](output.md). -### Start with imputation `--steps impute` +## Start with imputation `--steps impute` + +Impute target For starting from the imputation steps, the required flags are: @@ -284,26 +280,27 @@ Here is a representation on how the input files will be processed depending on t The `--batch_size` argument is used to specify the number of samples to be processed at once. This is useful when the number of samples is large and the memory is limited. The default value is 100 but it might need to be adapted to the size of each individuals data, the number of samples to be processed in parallel and the available memory. -Imputation softwares algorithm are time consuming. The computational load depend on the number of individuals, the region size and the panel size. [Some steps are computationally fixed](https://doi.org/10.1038/s41588-023-01438-3), meaning they run similarly whether you are imputing 2 individuals or 200. By grouping individuals into larger batches, these fixed-cost steps are shared among more samples, reducing the per-individual computational overhead and improving overall efficiency. This step is recommended -On the other hand we also need to limit the memory usage when working with a huge amount of individuals within a process. -Hence the necessity to use a batch_size large enough to reduce the fixed-cost stepts / individuals and not to large for the memory usage to be sustainable. +Imputation software algorithms are time-consuming, with computational load dependent on the number of individuals, region size, and panel size.
[Some steps have fixed computational costs](https://doi.org/10.1038/s41588-023-01438-3), meaning they take a similar amount of time whether imputing 2 or 200 individuals. By grouping individuals into larger batches, these fixed-cost steps are shared among more samples, reducing per-individual computational overhead and improving overall efficiency. However, memory usage must also be managed carefully when processing a large number of individuals within a single batch. Therefore, it is crucial to select a `batch_size` that is large enough to minimize fixed costs per individual but not so large that memory usage becomes unsustainable. -When the number of samples exceeds the batch size, the pipeline will split the samples into batches and process them sequentially. The files used in each batch are stored in the `${outputdir}/imputation/batch` folder. -[STITCH](#stitch) and [GLIMPSE1](#glimpse1) do not support a batch size inferior to the number of samples. This limit is set up to not induce batch effect in the imputation process, as this two tools take into account the information of the target file to perform the imputation. This does on the other hand enhances the accuracy of phasing and imputation, as the target individuals might provide more informative genetic context (e.g. you have related individuals in the target). +When the number of samples exceeds the batch size, the pipeline will split the samples into batches and process them sequentially. The files for each batch are stored in the `${outputdir}/imputation/batch` folder. + +[STITCH](#stitch) and [GLIMPSE1](#glimpse1) do not support a batch size smaller than the total number of samples. This limit is set to prevent batch effects in the imputation process, as these tools rely on the genetic information from the entire target file to perform imputation. This approach also enhances the accuracy of phasing and imputation, as target individuals may provide a more informative genetic context (e.g., when related individuals are present in the target). + +> [!NOTE] +> If you want to disable this option and run each sample separately, you can set `--batch_size 1`. To summarize: -- If you have Variant Calling Format file you should join them in one and choose either GLIMPSE1 or GLIMPSE2 -- If you have alignment files all the tools are available and their will be processed in batch_size - - Glimpse1 and Stitch might induce batch effect so all the samples need to be imputed together - - Glimpse2 and Quilt can process the samples in different batches -- If you want to disable this option and run each sample separately you can set `--batch_size 1` +- If you have Variant Calling Format (VCF) files, join them into a single file and choose either GLIMPSE1 or GLIMPSE2. +- If you have alignment files (e.g., BAM or CRAM), all tools are available, and samples will be processed in batches of `batch_size`: - - GLIMPSE1 and STITCH may induce batch effects, so all samples need to be imputed together. - - GLIMPSE2 and QUILT can process samples in separate batches. -#### Imputation tools `--steps impute --tools [glimpse1, glimpse2, quilt, stitch]` +## Imputation tools `--steps impute --tools [glimpse1, glimpse2, quilt, stitch]` You can choose different software to perform the imputation. In the following sections, the typical commands for running the pipeline with each software are included. Multiple tools can be selected by separating them with a comma (eg. `--tools glimpse1,quilt`).
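Before the tool-specific sections, here is a hedged composite sketch of a batched, multi-tool run; the file paths are placeholders, and `--chunks`/`--posfile` are the `--steps panelprep` outputs that QUILT needs (see its section below):

```bash
# Hypothetical run imputing one cohort with two batch-friendly tools.
# GLIMPSE2 and QUILT can split the cohort into batches of 50 samples;
# GLIMPSE1 or STITCH would instead impute all samples together.
nextflow run nf-core/phaseimpute \
    --input samplesheet.csv \
    --steps impute \
    --tools glimpse2,quilt \
    --batch_size 50 \
    --panel panel.csv \
    --chunks chunks.csv \
    --posfile posfile.csv \
    --genome GRCh38 \
    --outdir results \
    -profile docker
```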
-##### QUILT +### QUILT [QUILT](https://github.com/rwdavies/QUILT) is an R and C++ program for rapid genotype imputation from low-coverage sequence using a large reference panel. The required inputs for this program are bam samples provided in the input samplesheet (`--input`) and a csv file with the genomic chunks (`--chunks`). @@ -319,14 +316,14 @@ nextflow run nf-core/phaseimpute \ -profile docker ``` -The csv provided in `--posfile` must contain at least four columns [panel, chr, hap, legend]. The first column is the name of the panel, the second is the chromosome, then the hap and legend files produced by `--steps panelprep` unique to each chromosome. The hap and legend files are mandatory to use QUILT. +The csv provided in `--posfile` has been described before and is produced by `--steps panelprep`. The hap and legend files in this csv are mandatory to use QUILT. ```console title="posfile.csv" panel,chr,hap,legend 1000GP,chr22,1000GP.s.norel_chr22.hap.gz,1000GP.s.norel_chr22.legend.gz ``` -The csv provided in `--chunks` must contain two columns [chr, file]. The first column is the chromosome and the file column are txt with the chunks produced by GLIMPSE1, unique to each chromosome. +The csv provided in `--chunks` has been described before in this document and is necessary to run this tool. ```console title="chunks.csv" panel,chr,file @@ -350,7 +347,7 @@ nextflow run nf-core/phaseimpute \ -profile docker ``` -##### STITCH +### STITCH [STITCH](https://github.com/rwdavies/STITCH) is an R program for low coverage sequencing genotype imputation without using a reference panel. The required inputs for this program are bam samples provided in the input samplesheet (`--input`) and a `.legend.gz` file with the list of positions to genotype (`--posfile`). See [Posfile section](#samplesheet-posfile) for more information. @@ -395,7 +392,7 @@ bcftools view -G -m 2 -M 2 -v ${vcf} bcftools convert --haplegendsample ${vcf} ``` -##### GLIMPSE1 +### GLIMPSE1 [GLIMPSE1](https://github.com/odelaneau/GLIMPSE/tree/glimpse1) is a set of tools for phasing and imputation for low-coverage sequencing datasets. Recommended for many samples at >0.5x coverage and small reference panels. Glimpse1 works with alignment (i.e. BAM or CRAM) as well as variant (i.e. VCF or BCF) files as input. This is an example command to run this tool from the `--steps impute`: @@ -421,7 +418,7 @@ panel,chr,legend The csv provided in `--panel` must be prepared with `--steps panelprep` and must contain two columns [panel, chr, vcf, index]. -##### GLIMPSE2 +### GLIMPSE2 [GLIMPSE2](https://github.com/odelaneau/GLIMPSE) is a set of tools for phasing and imputation for low-coverage sequencing datasets. This is an example command to run this tool from the `--steps impute`: @@ -439,9 +436,11 @@ nextflow run nf-core/phaseimpute \ Make sure the csv with the input panel is the output from `--step panelprep` or has been previously prepared. -### Start with validation `--steps validate` +## Start with validation `--steps validate` -This steps compares a _truth_ VCF to an _imputed_ VCF in order to compute imputation accuracy. +concordance_metro + +This step compares a _truth_ VCF to an _imputed_ VCF in order to compute imputation accuracy. This also needs the frequency of the alleles. They can be computed from the reference panel by running the `--steps panelprep` and using the `--panel` with the `--compute_freq` flag ; or by using `--posfile samplesheet.csv`. 
```bash @@ -471,7 +470,7 @@ panel,chr,vcf,index 1000GP,chr22,1000GP.s.norel_chr22.sites.vcf.gz,1000GP.s.norel_chr22.sites.csi ``` -### Run all steps sequentially `--steps all` +## Run all steps sequentially `--steps all` This mode runs all the previous steps. This requires several flags: @@ -486,6 +485,14 @@ This mode runs all the previous steps. This requires several flags: This can also accept `bam` or `cram` files as input but will need the additional `legend` file in the `--posfile` to call the variants. The structure of the `input_truth.csv` is the same as the `input.csv` file. See [Samplesheet input](#samplesheet-input) for more information. +### Contig Name Validation and QC + +The first step of the pipeline is to validate the consistency of contig names across all input files. Since the pipeline parallelizes the imputation process by contig, it needs to ensure that the contigs are consistently defined across several files. This step uses either the `--regions` samplesheet or the `.fai` file to identify the genomic regions to process. + +However, some contigs specified in these files may be absent from other key files, such as the `--panel`, `--posfile`, `--chunks`, `--map` (column `chr`), or `--fasta`. When this happens, the pipeline generates a warning to notify you of the missing contigs. It then narrows down the process to only the contigs that are **common across all required files**. + +Finally, the pipeline performs a detailed check with the `CHECKCHR` tool to verify that these contigs are present in every `--input` and `--input_truth` file, as well as in the individual reference panel files. This prevents inconsistencies in downstream steps. + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: @@ -581,7 +588,7 @@ A pipeline might not always support every possible argument or option of a parti One of the parameters that you might want to modify could be specific to each imputation software. As an example, running the pipeline, you may encounter that to reduce the impact of individual reads (for example in QUILT), you might need to lower coverage. This can be achieved by including any modification to a Nextflow process as an external argument using `ext.args`. 
You would customize the run by providing: -``` +```groovy process { withName:'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_QUILT:QUILT_QUILT' { ext.args = "--downsampleToCov=1" } diff --git a/modules/local/add_columns/tests/tags.yml b/modules/local/add_columns/tests/tags.yml deleted file mode 100644 index 16c32e86..00000000 --- a/modules/local/add_columns/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -add_columns: - - "modules/local/add_columns/**" diff --git a/modules/local/bam_chr_extract/tests/tags.yml b/modules/local/bam_chr_extract/tests/tags.yml deleted file mode 100644 index c2e7b397..00000000 --- a/modules/local/bam_chr_extract/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -bam_chr_extract: - - "modules/local/bam_chr_extract/**" diff --git a/modules/local/list_to_file/tests/tags.yml b/modules/local/list_to_file/tests/tags.yml deleted file mode 100644 index 16c32e86..00000000 --- a/modules/local/list_to_file/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -add_columns: - - "modules/local/add_columns/**" diff --git a/modules/local/vcf_chr_extract/tests/tags.yml b/modules/local/vcf_chr_extract/tests/tags.yml deleted file mode 100644 index aa867837..00000000 --- a/modules/local/vcf_chr_extract/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -vcf_chr_extract: - - "modules/local/vcf_chr_extract/**" diff --git a/nextflow.config b/nextflow.config index 14781b4a..e3773c9b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,7 +10,7 @@ params { // steps - steps = null + steps = null // Input options input = null diff --git a/nextflow_schema.json b/nextflow_schema.json index c51892b8..34ce5ffd 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -38,8 +38,7 @@ }, "rename_chr": { "type": "boolean", - "description": "Should the panel vcf files be renamed to match the reference genome (e.g. 'chr1' -> '1')", - "pattern": "true|false" + "description": "Should the panel VCF files be renamed to match the reference genome (e.g. 'chr1' -> '1')" }, "max_chr_names": { "type": "integer", @@ -49,7 +48,8 @@ }, "remove_samples": { "type": "string", - "description": "Comma-separated list of samples to remove from the reference panel. Useful for benchmarking purposes." + "description": "Comma-separated list of samples to remove from the reference panel. Useful for benchmarking purposes.", + "pattern": "^([a-zA-Z0-9]+)(,[a-zA-Z0-9]+)*$" }, "email": { "type": "string", @@ -126,8 +126,7 @@ }, "compute_freq": { "description": "Should the allele frequency for each variant (AC/AN fields necessary for Glimpse1 and the validation step) be computed using VCFFIXUP tool. This can be necessary if the fields are absent from the panel or if samples have been removed.", - "type": "boolean", - "pattern": "true|false" + "type": "boolean" }, "binaryref": { "type": "string", @@ -144,8 +143,7 @@ "batch_size": { "type": "integer", "description": "Maximal number of individuals per batch for imputation.", - "default": 100, - "pattern": "^\\d+$" + "default": 100 }, "chunks": { "type": "string", @@ -188,9 +186,10 @@ }, "min_val_gl": { "type": "number", - "description": "Minimum genotype likelihood probability P(G|R) in validation data. Set to zero to have no filter of if using gt-validation", + "description": "Minimum genotype likelihood probability P(G|R) in validation data. Set to zero to have no filter or if using gt-validation", "default": 0.9, - "pattern": "^\\d+(\\.\\d+)?$" + "minimum": 0 }, "min_val_dp": { "type": "integer",