molgenis · dennishendriksen · Oct 26, 2023 · Nov 1, 2023 · Nov 1, 2023 · Nov 1, 2023
diff --git a/README.md b/README.md
@@ -22,7 +22,7 @@ bash vip/install.sh
 ### Usage
 ```bash
 usage: vip -w <arg> -i <arg> -o <arg>
-  -w, --workflow <arg>  workflow to execute. allowed values: cram, fastq, gvcf, vcf
+  -w, --workflow <arg>  workflow to execute. allowed values: cram, fastq, gvcf, vcf, pod5
   -i, --input    <arg>  path to sample sheet .tsv
   -o, --output   <arg>  output folder
   -c, --config   <arg>  path to additional nextflow .cfg (optional)
@@ -39,5 +39,41 @@ pip install mkdocs mkdocs-mermaid2-plugin
 mkdocs serve
 ```
 
+## Proof of Concept - Methylation
+The following files and directories were adapted or added to support base modification and POD5 data:
+```
+config/nxf_pod5.config
+config/nxf_vcf.config
+docs/
+modules/pod5/
+modules/vcf/report.nf
+modules/vcf/templates/report.sh
+resources/pod5/
+test/suites/pod5/
+utils/build.sh
+vip_pod5.nf
+vip_vcf.nf
+vip.sh
+install.sh
+```
+
+## How to install VIP and test this branch
+```
+# Clone repository and switch to PoC/Methylation branch
+git clone https://github.com/molgenis/vip.git
+cd vip
+git checkout PoC/Methylation
+
+# Install to download tools
+bash install.sh
+
+# Test the pod5 workflow
+cd test
+ml awscli
+bash test.sh -t pod5
+
+# Output can be found in test/output/
+```
+
 ### License
 VIP is an aggregate work of many works, each covered by their own licence(s). For the purposes of determining what you can do with specific works in VIP, this policy should be read together with the licence(s) of the relevant tools. For the avoidance of doubt, where any other licence grants rights, this policy does not modify or reduce those rights under those licences.
diff --git a/config/nxf_pod5.config b/config/nxf_pod5.config
@@ -0,0 +1,39 @@
+includeConfig 'nxf.config'
+includeConfig 'nxf_cram.config'
+
+// Environment variables holding the container command lines for the pod5 workflow tools
+env {
+	CMD_DORADO = "apptainer exec --nv --no-mount home --bind \${TMPDIR} ${APPTAINER_CACHEDIR}/dorado-shac28cd94f2303b0493a4b16ca86e711852c2b8525.sif dorado" // --nv: apptainer flag for NVIDIA GPU support; TMPDIR is escaped so it is not interpolated in this config string
+  CMD_MODKIT = "apptainer exec --no-mount home --bind \${TMPDIR} ${APPTAINER_CACHEDIR}/modkit-sha3745cd8f97213eaf908f5fbf4f2f8b8e2cedfc30.sif modkit" // same container invocation as above, without --nv
+}
+
+// Executor and resource settings per process label for the pod5 workflow
+process {
+	withLabel: 'dorado'{ // basecalling: Slurm job that requests one a40 GPU through clusterOptions
+		executor = 'slurm'
+		memory = '40GB'
+		time = '10h'
+		cpus = 20
+		clusterOptions = '--gres=gpu:a40:1 --qos=priority'
+	}
+
+	withLabel: 'sort_bam'{ // bam sorting: Slurm job, no GPU requested
+		executor = 'slurm'
+		memory = '10GB'
+		time = '10h'
+		cpus = 10
+	}
+
+	withLabel: 'modkit'{ // modkit steps: Slurm job, no GPU requested
+		executor = 'slurm'
+		memory = '30GB'
+		time = '10h'
+		cpus = 5
+	}
+}
+
+// Parameters used by the pod5 workflow
+params {
+	dorado_model = "${projectDir}/resources/pod5/[email protected]/" // NOTE(review): "[email protected]" looks like an e-mail-obfuscation artifact standing in for the Dorado model directory name — restore the real name; TODO confirm
+}
+
diff --git a/config/nxf_vcf.config b/config/nxf_vcf.config
@@ -37,7 +37,7 @@ process {
   }
 
   withLabel: 'vcf_report' {
-    memory = '4GB'
+    memory = '100GB'
   }
 }
 
@@ -106,9 +106,11 @@ params {
 
     report {
       include_crams = true
+      include_bedmethyls = true
       max_records = ""
       max_samples = ""
-      template = ""
+      template = "${projectDir}/resources/pod5/pod5_template.html"
+      vcf_report_jar = "${projectDir}/resources/pod5/pod5-vcf-report.jar"
 
       GRCh38 {
         genes = "${projectDir}/resources/GRCh38/GCF_000001405.39_GRCh38.p13_genomic_mapped.gff.gz"

diff --git a/docs/about/acknowledgements.md b/docs/about/acknowledgements.md
@@ -23,4 +23,6 @@ Standing on the shoulders of giants. This project could not have possible withou
 - [cuteSV](https://github.com/tjiangHIT/cuteSV)
 - [Straglr](https://github.com/philres/straglr)
 - [Stranger](https://github.com/Clinical-Genomics/stranger)
-- [fastp](https://github.com/OpenGene/fastp)
+- [fastp](https://github.com/OpenGene/fastp)
+- [Dorado](https://github.com/nanoporetech/dorado)
+- [Modkit](https://github.com/nanoporetech/modkit)
diff --git a/docs/examples/pod5.md b/docs/examples/pod5.md
@@ -0,0 +1,18 @@
+# POD5
+To run vip with POD5 data, just specify the POD5 paths in your sample sheet.
+
+## Samplesheet
+The example below shows a sample sheet for a run starting from `pod5` files.
+
+```
+individual_id	pod5
+your_sample_id	path/to/your/data_1.pod5,path/to/your/data_2.pod5
+```
+
+## Run the pipeline
+```bash
+cd vip
+vip --workflow pod5 --input path/to/samplesheet.tsv --output path/to/output/folder
+```
+
+For an example of how to execute the `pod5` workflow, see [here](https://github.com/molgenis/vip/blob/229fc8c6d01bfb9e0dcdfee85d6e903b31f71f7a/test/suites/pod5/hg001_giab_2023.05.sh#L16C1-L16C28).
diff --git a/docs/get_started/requirements.md b/docs/get_started/requirements.md
@@ -5,7 +5,7 @@ Before installing VIP please check whether your system meets the following requi
 - Bash ≥ 3.2
 - Java ≥ 11
 - [Apptainer](https://apptainer.org/docs/admin/main/installation.html#install-from-pre-built-packages) (setuid installation)
-- 8GB RAM <sup>1</sup>
+- 100GB RAM <sup>1</sup>
 - 150GB disk space
 
 1) The memory requirements differ per workflow and depend, on the size of your input data, the scheduler that you use, the amount of parallelization. For example, executing VIP using a job scheduler will reduce the memory requirements on the system submitting the jobs to 1-2GB.

diff --git a/docs/home/key_features.md b/docs/home/key_features.md
@@ -2,9 +2,11 @@
 VIP is an easy to install, easy to use, portable and flexible pipeline implemented using [Nextflow](https://www.nextflow.io/).
 Features include:
 
-- Workflows for a broad range of input file types: `bam`, `cram`, `fastq`, `g.vcf`, `vcf`
+- Workflows for a broad range of input file types: `pod5`, `bam`, `cram`, `fastq`, `g.vcf`, `vcf`
 - Produces stand-alone variant interpretation HTML report with integrated genome browser  
 - Long-read sequencing support (Oxford Nanopore, PacBio HiFi)
+- Supports base modification in `cram` files with methylation tags: [SAMtags](https://samtools.github.io/hts-specs/SAMtags.pdf)
+- Supports bedmethyl visualisation in genome browser 
 - Short-read sequencing support (Illumina, both single and paired-end reads)
 - Supports GRCh38, supports GRCh37 and T2T via liftover
 - Short variant detection

diff --git a/docs/index.md b/docs/index.md
@@ -1,15 +1,15 @@
 # Variant Interpretation Pipeline (VIP)
 VIP is a flexible human variant interpretation pipeline for rare disease using state-of-the-art pathogenicity prediction ([CAPICE](https://github.com/molgenis/capice)) and template-based interactive reporting to facilitate decision support.
 
-The VIP pipeline can be used starting from either your `fastq`, `bam/cram` or `.g.vcf/vcf` data,
+The VIP pipeline can be used starting from either your `pod5`, `fastq`, `bam/cram` or `.g.vcf/vcf` data,
 every entry point will result in a `vcf` file with your annotated, classified and filtered variants 
 as well as a interactive HTML report with the same variants, prioritized by the CAPICE pathogenicity score 
 and providing additional aids like a genome browser and a representation of the decisions leading to the VIP classification.
 VIP can be used for single patients, families or cohort data.
 
 [Click here for a live example](vip_giab_hg001.html)
 
-![Example report](img/report_example.png)]
+![Example report](img/report_example.png)
 
 *Above: report example*
 

diff --git a/docs/usage/command-line-options.md b/docs/usage/command-line-options.md
@@ -7,7 +7,7 @@ In addition to the `.vcf.gz` an interactive `.html` report is produced that can
 
 ```
 usage: vip -w <arg> -i <arg> -o <arg>
-  -w, --workflow <arg>  workflow to execute. allowed values: cram, fastq, gvcf, vcf
+  -w, --workflow <arg>  workflow to execute. allowed values: cram, fastq, gvcf, vcf, pod5
   -i, --input    <arg>  path to sample sheet .tsv
   -o, --output   <arg>  output folder
   -c, --config   <arg>  path to additional nextflow .cfg (optional)
@@ -30,6 +30,7 @@ usage: vip -w <arg> -i <arg> -o <arg>
 By default `vip`:
 
 - Assumes an Illumina sequencing platform was used to generate the input data
+- Assumes Nanopore sequencing was used to generate the input data for the `pod5` workflow
 - Assumes whole-genome sequencing (WGS) method was used to generate the input data
 - Uses a GRCh38 reference genome ([GCA_000001405.15 / GCF_000001405.26](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.26/))
 - Provides classification trees for default variant filtration. For details, see [here](../advanced/classification_trees.md)

diff --git a/docs/usage/config.md b/docs/usage/config.md
@@ -22,6 +22,11 @@ An additional configuration file can be supplied on the command-line to overwrit
 **Warning:**
 Please take note of the fact that for a different reference fasta.gz the  unzipped referenfasta file is also required. Both the zipped and unzipped fasta should have an index.
 
+### POD5
+| key                       | default     | description                                                                                            |
+|---------------------------|-------------|--------------------------------------------------------------------------------------------------------|
+| dorado_model				| *installed* | for details, see [here](https://github.com/nanoporetech/dorado)                                               |
+
 ### FASTQ
 | key                       | default     | description                                                                                            |
 |---------------------------|-------------|--------------------------------------------------------------------------------------------------------|

diff --git a/docs/usage/input.md b/docs/usage/input.md
@@ -38,6 +38,11 @@ The following sections describe the columns that can be used in every sample-she
 
 <sup>1</sup> Exception: if no probands are defined in the sample-sheet then all samples are considered to be probands.
 
+## Columns: POD5
+| column                  | type     | required | default      | description                                                                                                 |
+|-------------------------|----------|----------|--------------|-------------------------------------------------------------------------------------------------------------|
+| ``pod5``                | ``file`` | yes      |              | allowed file extensions: ``pod5``                                                       					 |
+
 ## Columns: FASTQ
 | column                  | type          | required        | default      | description                                                                                                 |
 |-------------------------|---------------|-----------------|--------------|-------------------------------------------------------------------------------------------------------------|
@@ -68,3 +73,4 @@ The following sections describe the columns that can be used in every sample-she
 | ``assembly`` | ``enum`` |          | ``GRCh38`` | allowed values: [``GRCh37``, ``GRCh38``, ``T2T``], value must be the same for all project samples                                             |
 | ``vcf``      | ``file`` | yes      |            | allowed file extensions: [``vcf``, ``vcf.gz``, ``vcf.bgz``, ``bcf``, ``bcf.gz``, ``bcf.bgz``], value must be the same for all project samples |
 | ``cram``     | ``file`` |          |            | allowed file extensions: [``bam``, ``cram``, ``sam``]                                                                                         |
+| ``bedmethyl``| ``file`` |          |            | allowed file extensions: ``bedmethyl``                                                       					 |
diff --git a/docs/usage/workflow.md b/docs/usage/workflow.md
@@ -1,8 +1,19 @@
 # Workflow
-VIP consists of four workflows depending on the type of input data: fastq, bam/cram, gvcf or vcf.
-The `fastq` workflow is an extension of the `cram` workflow. The `cram` and `gvcf` workflows are extensions of the `vcf` workflow.
+VIP consists of five workflows depending on the type of input data: pod5, fastq, bam/cram, gvcf or vcf.
+The `fastq` and `pod5` workflows RE an extension of the `cram` workflow. The `cram` and `gvcf` workflows are extensions of the `vcf` workflow.
-The `fastq` and `pod5` workflows RE an extension of the `cram` workflow. The `cram` and `gvcf` workflows are extensions of the `vcf` workflow.
+The `fastq` and `pod5` workflows are extensions of the `cram` workflow. The `cram` and `gvcf` workflows are extensions of the `vcf` workflow.
+
-The `fastq` and `pod5` workflows RE an extension of the `cram` workflow. The `cram` and `gvcf` workflows are extensions of the `vcf` workflow.
+The `fastq` and `pod5` workflows are extensions of the `cram` workflow. The `cram` and `gvcf` workflows are extensions of the `vcf` workflow.
+
 The `vcf` workflow produces the pipeline outputs as described [here](./output.md).
-The following sections provide an overview of the steps of each of these workflows. 
+The following sections provide an overview of the steps of each of these workflows.
+
+## POD5
+The `pod5` workflow consists of the following steps:
+
+1. Parallelize sample sheet per sample and for each sample
+2. Modified basecalling and alignment using [Dorado](https://github.com/nanoporetech/dorado) producing a `bam` file per sample
+3. Sort the `bam` file per sample and create an index and stats file using [Samtools](http://samtools.github.io/)
+4. Perform pileup with [Modkit](https://github.com/nanoporetech/modkit) to construct a bedMethyl table per sample
-4. Perform pileup with [Modkit](https://github.com/nanoporetech/modkit) to construct a bedMethyl table per sample
+4. Perform pileup with [Modkit](https://github.com/nanoporetech/modkit) to construct a bedMethyl file per sample
+
-4. Perform pileup with [Modkit](https://github.com/nanoporetech/modkit) to construct a bedMethyl table per sample
+4. Perform pileup with [Modkit](https://github.com/nanoporetech/modkit) to construct a bedMethyl file per sample
+
+5. Continue with step 3. of the `cram` workflow
+
+For details, see [here](https://github.com/molgenis/vip/blob/main/vip_pod5.nf).
 
 ## FASTQ
 The `fastq` workflow consists of the following steps:
@@ -24,7 +35,7 @@ The `cram` workflow consists of the following steps:
     1. Using [ExpansionHunter](https://github.com/Illumina/ExpansionHunter) for Illumina short read data.
     2. Using this [fork of Straglr](https://github.com/philres/straglr) for PacBio and Nanopore long read data, this fork is chosen over the original [Straglr](https://github.com/bcgsc/straglr) because of the VCF output that enables VIP to combine it with the SV and SNV data in the VCF workflow.
 4. Parallelize cram in chunks consisting of one or more contigs and for each chunk
-    1. Perform short variant calling with [DeepVariant](https://github.com/google/deepvariant) producing a `gvcf` file per chunk per sample, the gvcfs of the samples in a project are than merged to one vcf per project (using [GLnexus](https://github.com/dnanexus-rnd/GLnexus).
+    1. Perform short variant calling with [DeepVariant](https://github.com/google/deepvariant) producing a `gvcf` file per chunk per sample, the gvcfs of the samples in a project are then merged to one vcf per project (using [GLnexus](https://github.com/dnanexus-rnd/GLnexus)).
     2. Perform structural variant calling with [Manta](https://github.com/Illumina/manta) or [cuteSV](https://github.com/tjiangHIT/cuteSV) producing a `vcf` file per chunk per project.
 5. Concatenate short variant calling and structural variant calling `vcf` files per chunk per sample
 6. Continue with step 3. of the `vcf` workflow

diff --git a/install.sh b/install.sh
@@ -74,12 +74,14 @@ download_files() {
   urls+=("c7655e4ffce0178a1a0dcc0ed097cd8f" "images/cutesv-2.0.3.sif")
   urls+=("8efa3c0f6c0f5378ca22d16074f50dfe" "images/deepvariant-1.6.0.sif")
   urls+=("b67e8c1d774c0d22de70b7be79aaa05e" "images/deepvariant_deeptrio-1.6.0.sif")
+  urls+=("8d7a34c469bbd1d27c324a867713cd4b" "images/dorado-shac28cd94f2303b0493a4b16ca86e711852c2b8525.sif")
   urls+=("78a8ce16c9d8bac53e5fbca4f763dcef" "images/expansionhunter-5.0.0.sif")
   urls+=("afed919dc16ccdae1869cf6dbc5a19d5" "images/fastp-0.23.4.sif")
   urls+=("494c8c9e1031828f48027e34032de423" "images/gado-1.0.3.sif")
   urls+=("d25ba2124ef883b1b6f7a2eff2cb8201" "images/glnexus_v1.4.5-patched.sif")
   urls+=("ff8aceb2c9f185307a69b981ba08efd8" "images/manta-1.6.0.sif")
   urls+=("1e0caddbdd755bf608ef024e3d0a2f19" "images/minimap2-2.26.sif")
+  urls+=("7422915ce79a9dc120cb82fa4f2c06dd" "images/modkit-sha3745cd8f97213eaf908f5fbf4f2f8b8e2cedfc30.sif")
   urls+=("06ac8a76a307fa42fffd80ab906fd24b" "images/picard-3.1.1.sif")
   urls+=("9a4b685b26744113d3ea0a3904c02706" "images/samtools-1.17-patch1.sif")
   urls+=("2c18fcda2660792a7c8ba390463ae7ac" "images/straglr-philres-1.4.2.sif")

diff --git a/modules/pod5/dorado.nf b/modules/pod5/dorado.nf
@@ -0,0 +1,19 @@
+process dorado {
+	// Basecall the staged pod5 input with Dorado via the dorado.sh template and emit one bam per input tuple
+	label 'dorado'
+	publishDir "$params.output/intermediates", mode: 'link'
+
+	input:
+	tuple val(meta), path(pod5)
+
+	output:
+	tuple val(meta), path(bam)
+
+  	shell:
+	reference=params[params.assembly].reference.fasta // reference fasta configured for the selected assembly; used by the template
+	bam="${meta.project.id}_${meta.sample.family_id}_${meta.sample.individual_id}.bam" // output file name: <project>_<family>_<individual>.bam
+
+	template "dorado.sh"
+
+
+}  
diff --git a/modules/pod5/modkit.nf b/modules/pod5/modkit.nf
@@ -0,0 +1,25 @@
+process modkit {
+	// Process a sorted bam with Modkit via the modkit.sh template and emit a bedmethyl file
+
+	label 'modkit'
+	publishDir "$params.output/intermediates", mode: 'link'
+
+	input:
+	tuple val(meta), path(sorted_bam), path(sorted_bam_index)
+
+	output:
+	tuple val(meta), path(bedmethyl) // only the bedmethyl is declared as output; the other file names below are used by the template only
+
+  	shell:
+	refSeqPath = params[params.assembly].reference.fasta
+    reference = refSeqPath.substring(0, refSeqPath.lastIndexOf('.')) // fasta path minus its last extension — presumably the unzipped fasta next to the .gz; TODO confirm
+	name = "${meta.project.id}_${meta.sample.family_id}_${meta.sample.individual_id}" // shared file-name prefix: <project>_<family>_<individual>
+	bedmethyl = "${name}.bedmethyl"
+	converted_bam = "${name}_converted.bam"
+	converted_bam_index = "${name}_converted.bam.csi"
+	summary_modkit = "${name}_summary_modkit.txt"
+	log_modkit = "${name}_modkit.log"
+
+	template 'modkit.sh'
+
+}  
diff --git a/modules/pod5/samtools.nf b/modules/pod5/samtools.nf
@@ -0,0 +1,19 @@
+process sort_bam {
+	// Sort a bam via the samtools.sh template (not visible here); declares the sorted bam, its .csi index and a .stats file as outputs
+	label "sort_bam"
+	publishDir "$params.output/intermediates", mode: 'link'
+
+	input:
+	tuple val(meta), path(bam)
+
+	output:
+	tuple val(meta), path(sortedBam), path(sortedBamIndex), path(sortedBamStats)
+
+  	shell:
+	sortedBam="${meta.project.id}_${meta.sample.family_id}_${meta.sample.individual_id}_sorted.bam" // all three names share the <project>_<family>_<individual>_sorted.bam stem
+	sortedBamIndex="${meta.project.id}_${meta.sample.family_id}_${meta.sample.individual_id}_sorted.bam.csi"
+	sortedBamStats="${meta.project.id}_${meta.sample.family_id}_${meta.sample.individual_id}_sorted.bam.stats"
+
+	template 'samtools.sh'
+
+}
diff --git a/modules/pod5/templates/dorado.sh b/modules/pod5/templates/dorado.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -euo pipefail
+
+mod_basecaller() {
+  # Run Dorado modified-base basecalling (5mCG_5hmCG) on the current directory with the configured model and reference; stdout is redirected to the bam file
+  echo "working"  # NOTE(review): looks like leftover debug output — confirm whether it is needed
+  ${CMD_DORADO} basecaller !{params.dorado_model} ./ --modified-bases 5mCG_5hmCG --reference !{reference} > !{bam}
-  ${CMD_DORADO} basecaller !{params.dorado_model} ./ --modified-bases 5mCG_5hmCG --reference !{reference} > !{bam}
+  ${CMD_DORADO} basecaller "!{params.dorado_model}" ./ --modified-bases 5mCG_5hmCG --reference "!{reference}" > "!{bam}"
-  ${CMD_DORADO} basecaller !{params.dorado_model} ./ --modified-bases 5mCG_5hmCG --reference !{reference} > !{bam}
+  ${CMD_DORADO} basecaller "!{params.dorado_model}" ./ --modified-bases 5mCG_5hmCG --reference "!{reference}" > "!{bam}"
+}
+
+main() {  # entry point: runs the single basecalling step
+  mod_basecaller    
+}
+
+main "$@"
diff --git a/modules/pod5/templates/modkit.sh b/modules/pod5/templates/modkit.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+set -euo pipefail
+
+summary() {
+  # Write the output of modkit summary for the sorted bam to the summary text file
+  ${CMD_MODKIT} summary !{sorted_bam} > !{summary_modkit}
-  ${CMD_MODKIT} summary !{sorted_bam} > !{summary_modkit}
+  ${CMD_MODKIT} summary "!{sorted_bam}" > "!{summary_modkit}"
-  ${CMD_MODKIT} summary !{sorted_bam} > !{summary_modkit}
+  ${CMD_MODKIT} summary "!{sorted_bam}" > "!{summary_modkit}"
+}
+
+adjust_mod() {  # write a converted bam with modification code h converted to m, then index it with samtools (-c)
+  ${CMD_MODKIT} adjust-mods !{sorted_bam} !{converted_bam} --convert h m
-  ${CMD_MODKIT} adjust-mods !{sorted_bam} !{converted_bam} --convert h m
+  ${CMD_MODKIT} adjust-mods "!{sorted_bam}" "!{converted_bam}" --convert h m
-  ${CMD_MODKIT} adjust-mods !{sorted_bam} !{converted_bam} --convert h m
+  ${CMD_MODKIT} adjust-mods "!{sorted_bam}" "!{converted_bam}" --convert h m
+  ${CMD_SAMTOOLS} index -c !{converted_bam}
-  ${CMD_SAMTOOLS} index -c !{converted_bam}
+  ${CMD_SAMTOOLS} index -c "!{converted_bam}"
-  ${CMD_SAMTOOLS} index -c !{converted_bam}
+  ${CMD_SAMTOOLS} index -c "!{converted_bam}"
+}
+
+pileup() {
+  # Run modkit pileup on the converted bam to produce the bedmethyl file (--cpg with the reference, --only-tabs), logging to the modkit log file
+	${CMD_MODKIT} pileup !{converted_bam} !{bedmethyl} --cpg --ref !{reference} --only-tabs --log-filepath !{log_modkit}
-	${CMD_MODKIT} pileup !{converted_bam} !{bedmethyl} --cpg --ref !{reference} --only-tabs --log-filepath !{log_modkit}
+	${CMD_MODKIT} pileup "!{converted_bam}" "!{bedmethyl}" --cpg --ref "!{reference}" --only-tabs --log-filepath "!{log_modkit}"
-	${CMD_MODKIT} pileup !{converted_bam} !{bedmethyl} --cpg --ref !{reference} --only-tabs --log-filepath !{log_modkit}
+	${CMD_MODKIT} pileup "!{converted_bam}" "!{bedmethyl}" --cpg --ref "!{reference}" --only-tabs --log-filepath "!{log_modkit}"
+}
+
+main() {  # entry point: summary, then modification conversion + indexing, then pileup
+  summary
+  adjust_mod
+  pileup    
+}
+
+main "$@"