-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Methylation #543
base: main
Are you sure you want to change the base?
Methylation #543
Changes from all commits
7f341d2
7bc46f6
b83599b
ab020d6
36bf8e7
a8e4baf
47a37b5
3d9ca87
44fb4a6
d76589e
9aa3601
e4028d0
da3d75f
8ef202f
ddd2212
32520ca
29b1611
3778998
bc5cade
a06be6b
6017920
3ae3dc9
3ee0bab
a0aeedb
ebdd11e
de6c4fd
dda383e
ac46037
15299e7
8325265
879e6fe
9b4eed8
2d47c88
32c886a
9117441
3ee31d8
2369eb5
8fa5388
da010a8
93e19e4
6f0bf24
45f5f91
591a3c9
5dd56d4
9892628
a10fa30
5c01e15
f5edd79
300b8e6
229fc8c
e9e9012
a96f6eb
fe06d5f
51080df
0a5a4ff
3d9f831
bac6588
b0fc308
c45f66f
7ba380c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
includeConfig 'nxf.config' | ||
includeConfig 'nxf_cram.config' | ||
|
||
// Environmental commands | ||
env { | ||
CMD_DORADO = "apptainer exec --nv --no-mount home --bind \${TMPDIR} ${APPTAINER_CACHEDIR}/dorado-shac28cd94f2303b0493a4b16ca86e711852c2b8525.sif dorado" | ||
CMD_MODKIT = "apptainer exec --no-mount home --bind \${TMPDIR} ${APPTAINER_CACHEDIR}/modkit-sha3745cd8f97213eaf908f5fbf4f2f8b8e2cedfc30.sif modkit" | ||
} | ||
|
||
// Process how to execute | ||
process { | ||
withLabel: 'dorado'{ | ||
executor = 'slurm' | ||
memory = '40GB' | ||
time = '10h' | ||
cpus = 20 | ||
clusterOptions = '--gres=gpu:a40:1 --qos=priority' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do not hardcode qos, this is selected by the user There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do not assume that GPUs are available. I suggest making it configurable somehow. |
||
} | ||
|
||
withLabel: 'sort_bam'{ | ||
executor = 'slurm' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove |
||
memory = '10GB' | ||
time = '10h' | ||
cpus = 10 | ||
} | ||
|
||
withLabel: 'modkit'{ | ||
executor = 'slurm' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove |
||
memory = '30GB' | ||
time = '10h' | ||
cpus = 5 | ||
} | ||
} | ||
|
||
// Parameters used in workflow pod5 | ||
params { | ||
dorado_model = "${projectDir}/resources/pod5/[email protected]/" | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,7 +37,7 @@ process { | |
} | ||
|
||
withLabel: 'vcf_report' { | ||
memory = '4GB' | ||
memory = '100GB' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please document why the process requires 100GB instead of 4GB |
||
} | ||
} | ||
|
||
|
@@ -106,9 +106,11 @@ params { | |
|
||
report { | ||
include_crams = true | ||
include_bedmethyls = true | ||
max_records = "" | ||
max_samples = "" | ||
template = "" | ||
template = "${projectDir}/resources/pod5/pod5_template.html" | ||
vcf_report_jar = "${projectDir}/resources/pod5/pod5-vcf-report.jar" | ||
|
||
GRCh38 { | ||
genes = "${projectDir}/resources/GRCh38/GCF_000001405.39_GRCh38.p13_genomic_mapped.gff.gz" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# POD5 | ||
To run vip with POD5 data, just specify the POD5 paths in your sample sheet. | ||
|
||
## Samplesheet | ||
See the example samplesheet below; it shows the samplesheet for a run starting from `pod5` files. | ||
|
||
``` | ||
individual_id pod5 | ||
your_sample_id path/to/your/data_1.pod5,path/to/your/data_2.pod5 | ||
``` | ||
|
||
## Run the pipeline | ||
```bash | ||
cd vip | ||
vip --workflow pod5 --input path/to/samplesheet.tsv --output path/to/output/folder | ||
``` | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This example would benefit from an actual report that displays methylation. Admittedly other sections of the VIP documentation could benefit from more examples as well. As a user it is hard to understand how methylation is beneficial. |
||
For an example on how to execute the `pod5` workflow see [here](https://github.com/molgenis/vip/blob/229fc8c6d01bfb9e0dcdfee85d6e903b31f71f7a/test/suites/pod5/hg001_giab_2023.05.sh#L16C1-L16C28) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,7 @@ Before installing VIP please check whether your system meets the following requi | |
- Bash ≥ 3.2 | ||
- Java ≥ 11 | ||
- [Apptainer](https://apptainer.org/docs/admin/main/installation.html#install-from-pre-built-packages) (setuid installation) | ||
- 8GB RAM <sup>1</sup> | ||
- 100GB RAM <sup>1</sup> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. note to self: update when config ram is updated |
||
- 150GB disk space | ||
|
||
1) The memory requirements differ per workflow and depend on the size of your input data, the scheduler that you use, and the amount of parallelization. For example, executing VIP using a job scheduler will reduce the memory requirements on the system submitting the jobs to 1-2GB. | ||
|
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
@@ -1,8 +1,19 @@ | ||||||||
# Workflow | ||||||||
VIP consists of four workflows depending on the type of input data: fastq, bam/cram, gvcf or vcf. | ||||||||
The `fastq` workflow is an extension of the `cram` workflow. The `cram` and `gvcf` workflows are extensions of the `vcf` workflow. | ||||||||
VIP consists of five workflows depending on the type of input data: pod5, fastq, bam/cram, gvcf or vcf. | ||||||||
The `fastq` and `pod5` workflows are extensions of the `cram` workflow. The `cram` and `gvcf` workflows are extensions of the `vcf` workflow. | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
The `vcf` workflow produces the pipeline outputs as described [here](./output.md). | ||||||||
The following sections provide an overview of the steps of each of these workflows. | ||||||||
The following sections provide an overview of the steps of each of these workflows. | ||||||||
|
||||||||
## POD5 | ||||||||
The `pod5` workflow consists of the following steps: | ||||||||
|
||||||||
1. Parallelize sample sheet per sample and for each sample | ||||||||
2. Modified basecalling and alignment using [Dorado](https://github.com/nanoporetech/dorado) producing a `bam` file per sample | ||||||||
3. Sort the `bam` file per sample and create an index and stats file using [Samtools](http://samtools.github.io/) | ||||||||
4. Perform pileup with [Modkit](https://github.com/nanoporetech/modkit) to construct a bedMethyl table per sample | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
5. Continue with step 3. of the `cram` workflow | ||||||||
|
||||||||
For details, see [here](https://github.com/molgenis/vip/blob/main/vip_pod5.nf). | ||||||||
|
||||||||
## FASTQ | ||||||||
The `fastq` workflow consists of the following steps: | ||||||||
|
@@ -24,7 +35,7 @@ The `cram` workflow consists of the following steps: | |||||||
1. Using [ExpansionHunter](https://github.com/Illumina/ExpansionHunter) for Illumina short read data. | ||||||||
2. Using this [fork of Straglr](https://github.com/philres/straglr) for PacBio and Nanopore long read data, this fork is chosen over the original [Straglr](https://github.com/bcgsc/straglr) because of the VCF output that enables VIP to combine it with the SV and SNV data in the VCF workflow. | ||||||||
4. Parallelize cram in chunks consisting of one or more contigs and for each chunk | ||||||||
1. Perform short variant calling with [DeepVariant](https://github.com/google/deepvariant) producing a `gvcf` file per chunk per sample, the gvcfs of the samples in a project are than merged to one vcf per project (using [GLnexus](https://github.com/dnanexus-rnd/GLnexus). | ||||||||
1. Perform short variant calling with [DeepVariant](https://github.com/google/deepvariant) producing a `gvcf` file per chunk per sample, the gvcfs of the samples in a project are then merged to one vcf per project (using [GLnexus](https://github.com/dnanexus-rnd/GLnexus)). | ||||||||
2. Perform structural variant calling with [Manta](https://github.com/Illumina/manta) or [cuteSV](https://github.com/tjiangHIT/cuteSV) producing a `vcf` file per chunk per project. | ||||||||
5. Concatenate short variant calling and structural variant calling `vcf` files per chunk per sample | ||||||||
6. Continue with step 3. of the `vcf` workflow | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
process dorado { | ||
// Basecall pod5 files using Dorado | ||
label 'dorado' | ||
publishDir "$params.output/intermediates", mode: 'link' | ||
|
||
input: | ||
tuple val(meta), path(pod5) | ||
|
||
output: | ||
tuple val(meta), path(bam) | ||
|
||
shell: | ||
reference=params[params.assembly].reference.fasta | ||
bam="${meta.project.id}_${meta.sample.family_id}_${meta.sample.individual_id}.bam" | ||
|
||
template "dorado.sh" | ||
|
||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
process modkit { | ||
// Process bam files using Modkit tool | ||
|
||
label 'modkit' | ||
publishDir "$params.output/intermediates", mode: 'link' | ||
|
||
input: | ||
tuple val(meta), path(sorted_bam), path(sorted_bam_index) | ||
|
||
output: | ||
tuple val(meta), path(bedmethyl) | ||
|
||
shell: | ||
refSeqPath = params[params.assembly].reference.fasta | ||
reference = refSeqPath.substring(0, refSeqPath.lastIndexOf('.')) | ||
name = "${meta.project.id}_${meta.sample.family_id}_${meta.sample.individual_id}" | ||
bedmethyl = "${name}.bedmethyl" | ||
converted_bam = "${name}_converted.bam" | ||
converted_bam_index = "${name}_converted.bam.csi" | ||
summary_modkit = "${name}_summary_modkit.txt" | ||
log_modkit = "${name}_modkit.log" | ||
|
||
template 'modkit.sh' | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
process sort_bam { | ||
// Sort bam files using SAMTools | ||
label "sort_bam" | ||
publishDir "$params.output/intermediates", mode: 'link' | ||
|
||
input: | ||
tuple val(meta), path(bam) | ||
|
||
output: | ||
tuple val(meta), path(sortedBam), path(sortedBamIndex), path(sortedBamStats) | ||
|
||
shell: | ||
sortedBam="${meta.project.id}_${meta.sample.family_id}_${meta.sample.individual_id}_sorted.bam" | ||
sortedBamIndex="${meta.project.id}_${meta.sample.family_id}_${meta.sample.individual_id}_sorted.bam.csi" | ||
sortedBamStats="${meta.project.id}_${meta.sample.family_id}_${meta.sample.individual_id}_sorted.bam.stats" | ||
|
||
template 'samtools.sh' | ||
|
||
} |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,14 @@ | ||||||
#!/bin/bash | ||||||
set -euo pipefail | ||||||
|
||||||
mod_basecaller() { | ||||||
# Command for Dorado tool | ||||||
echo "working" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can be removed |
||||||
${CMD_DORADO} basecaller !{params.dorado_model} ./ --modified-bases 5mCG_5hmCG --reference !{reference} > !{bam} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. command fails if variables contain characters such as a space:
Suggested change
|
||||||
} | ||||||
|
||||||
main() { | ||||||
mod_basecaller | ||||||
} | ||||||
|
||||||
main "$@" |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,25 @@ | ||||||
#!/bin/bash | ||||||
set -euo pipefail | ||||||
|
||||||
summary() { | ||||||
# Use modkit tool to summarize bam files | ||||||
${CMD_MODKIT} summary !{sorted_bam} > !{summary_modkit} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
} | ||||||
|
||||||
adjust_mod() { | ||||||
${CMD_MODKIT} adjust-mods !{sorted_bam} !{converted_bam} --convert h m | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
${CMD_SAMTOOLS} index -c !{converted_bam} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
} | ||||||
|
||||||
pileup() { | ||||||
# Use modkit tool to process bam to bedmethyl file | ||||||
${CMD_MODKIT} pileup !{converted_bam} !{bedmethyl} --cpg --ref !{reference} --only-tabs --log-filepath !{log_modkit} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
} | ||||||
|
||||||
main() { | ||||||
summary | ||||||
adjust_mod | ||||||
pileup | ||||||
} | ||||||
|
||||||
main "$@" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do not hardcode, the executor is selected by the user or auto-selected elsewhere