Skip to content

Commit

Permalink
Take a stab at estimating depth based on filesize.
Browse files Browse the repository at this point in the history
- updated parameter_meta
- updated inputs.json
- cleaned up some whitespace
- added comments
- using fasta filesize to estimate depth rather than a separate task; based on Greg's experiments, an uncompressed 10x FASTA is ~60GB
  • Loading branch information
williamrowell committed Sep 28, 2023
1 parent 6c02327 commit 49acf5c
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 179 deletions.
11 changes: 6 additions & 5 deletions workflows/assemble_genome/assemble_genome.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ workflow assemble_genome {
parameter_meta {
sample_id: {help: "Sample ID; used for naming files"}
reads_fastas: {help: "Reads in fasta format to be used for assembly; one for each movie bam to be used in assembly. Reads fastas from one or more sample may be combined to use in the assembly"}
reference: {help: "Reference genome data"}
references: {help: "Array of Reference genomes data"}
hiiasm_extra_params: {help: "[OPTIONAL] Additional parameters to pass to hifiasm assembly"}
father_yak: {help: "[OPTIONAL] kmer counts for the father; required if running trio-based assembly"}
mother_yak: {help: "[OPTIONAL] kmer counts for the mother; required if running trio-based assembly"}
Expand All @@ -98,7 +98,7 @@ task hifiasm_assemble {
String prefix = "~{sample_id}.asm"
Int threads = 48
Int mem_gb = threads * 6
Int disk_size = ceil((size(reads_fastas[0], "GB") * length(reads_fastas)) * 4 + 20)
Int disk_size = ceil(size(reads_fastas, "GB") * 4 + 20)
command <<<
set -euo pipefail
Expand Down Expand Up @@ -202,7 +202,8 @@ task align_hifiasm {
}
Int threads = 16
Int disk_size = ceil((size(query_sequences[0], "GB") * length(query_sequences) + size(reference, "GB")) * 2 + 20)
Int mem_gb = threads * 8
Int disk_size = ceil((size(query_sequences, "GB") + size(reference, "GB")) * 2 + 20)
command <<<
set -euo pipefail
Expand All @@ -218,7 +219,7 @@ task align_hifiasm {
~{reference} \
~{sep=' ' query_sequences} \
| samtools sort \
-@ 4 \
-@ 3 \
-T ./TMP \
-m 8G \
-O BAM \
Expand All @@ -235,7 +236,7 @@ task align_hifiasm {
runtime {
docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:3968cb152a65163005ffed46297127536701ec5af4c44e8f3e7051f7b01f80fe"
cpu: threads
memory: "128 GB"
memory: mem_gb + " GB"
disk: disk_size + " GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: runtime_attributes.preemptible_tries
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ workflow de_novo_assembly_sample {
sample_id = sample.sample_id,
reads_fastas = samtools_fasta.reads_fasta,
references = references,
hifiasm_extra_params = "",
backend = backend,
default_runtime_attributes = default_runtime_attributes,
on_demand_runtime_attributes = on_demand_runtime_attributes
Expand Down Expand Up @@ -82,7 +81,7 @@ workflow de_novo_assembly_sample {

parameter_meta {
sample: {help: "Sample information and associated data files"}
reference: {help: "Reference genome data"}
references: {help: "Array of Reference genomes data"}
default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"}
on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"}
}
Expand Down
164 changes: 21 additions & 143 deletions workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,6 @@ workflow de_novo_assembly_trio {
}
}
# For yak, we need to know the total input coverage so we can set cloud memory resources accordingly
scatter (fasta in samtools_fasta_father.reads_fasta) {
call fasta_basecount as fasta_bc_father {
input:
reads_fasta = fasta,
runtime_attributes = default_runtime_attributes
}
}
call get_total_gbp as get_total_gbp_father {
input:
sample_id = father.sample_id,
fasta_totals = fasta_bc_father.read_total_bp,
runtime_attributes = default_runtime_attributes
}
scatter (movie_bam in mother.movie_bams) {
call SamtoolsFasta.samtools_fasta as samtools_fasta_mother {
input:
Expand All @@ -65,41 +49,32 @@ workflow de_novo_assembly_trio {
}
}
# For yak, we need to know the total input coverage so we can set cloud memory resources accordingly
scatter (fasta in samtools_fasta_mother.reads_fasta) {
call fasta_basecount as fasta_bc_mother {
input:
reads_fasta = fasta,
runtime_attributes = default_runtime_attributes
}
}
call get_total_gbp as get_total_gbp_mother {
input:
sample_id = mother.sample_id,
fasta_totals = fasta_bc_mother.read_total_bp,
runtime_attributes = default_runtime_attributes
}
# if parental coverage is low (<15x), keep singleton kmers from parents and use them to bin child reads
# if parental coverage is high (>=15x), use bloom filter and require that a kmer occur >= 5 times in
# one parent and <2 times in the other parent to be used for binning
# 60GB uncompressed FASTA ~= 10x coverage
# memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage)
Boolean bloom_filter = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false
call determine_yak_options {
input:
father_total_gbp = get_total_gbp_father.sample_total_gbp,
mother_total_gbp = get_total_gbp_mother.sample_total_gbp,
}
String yak_params = if (bloom_filter) then "-b37" else ""
Int yak_mem_gb = if (bloom_filter) then 50 else 70
String hifiasm_extra_params = if (bloom_filter) then "" else "-c1 -d1"
call yak_count as yak_count_father {
input:
sample_id = father.sample_id,
reads_fastas = samtools_fasta_father.reads_fasta,
yak_options = determine_yak_options.yak_options,
yak_params = yak_params,
mem_gb = yak_mem_gb,
runtime_attributes = default_runtime_attributes
}
call yak_count as yak_count_mother {
input:
sample_id = mother.sample_id,
reads_fastas = samtools_fasta_mother.reads_fasta,
yak_options = determine_yak_options.yak_options,
yak_params = yak_params,
mem_gb = yak_mem_gb,
runtime_attributes = default_runtime_attributes
}
Expand All @@ -125,7 +100,7 @@ workflow de_novo_assembly_trio {
sample_id = "~{cohort.cohort_id}.~{child.sample_id}",
reads_fastas = samtools_fasta_child.reads_fasta,
references = references,
hifiasm_extra_params = "-c1 -d1",
hifiasm_extra_params = hifiasm_extra_params,
father_yak = yak_count_father.yak,
mother_yak = yak_count_mother.yak,
backend = backend,
Expand All @@ -142,12 +117,11 @@ workflow de_novo_assembly_trio {
Array[Array[File]] zipped_assembly_fastas = flatten(assemble_genome.zipped_assembly_fastas)
Array[Array[File]] assembly_stats = flatten(assemble_genome.assembly_stats)
Array[Array[IndexData]] asm_bams = flatten(assemble_genome.asm_bams)

}

parameter_meta {
cohort: {help: "Sample information for the cohort"}
references: {help: "List of reference genome data"}
references: {help: "Array of Reference genomes data"}
default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"}
on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"}
}
Expand Down Expand Up @@ -186,47 +160,27 @@ task parse_families {
}
}
task determine_yak_options {
input {
Int mother_total_gbp
Int father_total_gbp
}

command {
set -e
if [ ~{father_total_gbp} -lt 48 ] && [ ~{mother_total_gbp} -lt 48 ]; then
options=""
else
options="-b37"
fi
echo $options
}
output {
String yak_options = read_string(stdout())
}
}

task yak_count {
input {
String sample_id
Array[File] reads_fastas
String yak_options

String yak_params
String mem_gb

RuntimeAttributes runtime_attributes
}
Int threads = 10
# Usage up to 140 GB @ 10 threads for Revio samples
Int mem_gb = 16 * threads
Int threads = 24
Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20)
command <<<
set -euo pipefail
yak count \
-t ~{threads} \
-o ~{sample_id}.yak \
~{yak_options} \
~{yak_params} \
~{sep=' ' reads_fastas}
>>>
Expand All @@ -247,79 +201,3 @@ task yak_count {
zones: runtime_attributes.zones
}
}
task fasta_basecount {
input {
File reads_fasta
String reads_fasta_basename = basename(reads_fasta)

RuntimeAttributes runtime_attributes
}
Int threads = 1
Int mem_gb = 4 * threads
Int disk_size = ceil(size(reads_fasta, "GB") * 2 + 20)
command <<<
set -euo pipefail
grep -v "^>" ~{reads_fasta} | tr -d '\n' | wc -c > ~{reads_fasta_basename}.total
>>>
output {
File read_total_bp = "~{reads_fasta_basename}.total"
}

runtime {
docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2"
cpu: threads
memory: mem_gb + " GB"
disk: disk_size + " GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: runtime_attributes.preemptible_tries
maxRetries: runtime_attributes.max_retries
awsBatchRetryAttempts: runtime_attributes.max_retries
queueArn: runtime_attributes.queue_arn
zones: runtime_attributes.zones
}
}
task get_total_gbp {
input {
String sample_id
Array[File] fasta_totals

RuntimeAttributes runtime_attributes
}
Int threads = 1
Int mem_gb = 4 * threads
Int disk_size = ceil(size(fasta_totals[0], "GB") * 2 + 20)
command <<<
set -euo pipefail
awk '{sum+=$1}END{print sum/1000000000}' ~{sep=' ' fasta_totals} > ~{sample_id}.total
>>>
output {
Int sample_total_gbp = round(read_float("~{sample_id}.total"))
}

runtime {
docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2"
cpu: threads
memory: mem_gb + " GB"
disk: disk_size + " GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: runtime_attributes.preemptible_tries
maxRetries: runtime_attributes.max_retries
awsBatchRetryAttempts: runtime_attributes.max_retries
queueArn: runtime_attributes.queue_arn
zones: runtime_attributes.zones
}
}
58 changes: 29 additions & 29 deletions workflows/input_template.json
Original file line number Diff line number Diff line change
@@ -1,31 +1,31 @@
{
"de_novo_assembly.cohort": {
"cohort_id": "String",
"samples": [
{
"sample_id": "String",
"movie_bams": "Array[File]",
"sex": "String?",
"father_id": "String?",
"mother_id": "String?",
"run_de_novo_assembly": "Boolean"
}
"de_novo_assembly.cohort": {
"cohort_id": "String",
"samples": [
{
"sample_id": "String",
"movie_bams": "Array[File]",
"sex": "String?",
"father_id": "String?",
"mother_id": "String?",
"run_de_novo_assembly": "Boolean"
}
],
"run_de_novo_assembly_trio": "Boolean"
},
"de_novo_assembly.references": [
{
"name": "String",
"fasta": {
"data": "File",
"data_index": "File"
}
}
],
"run_de_novo_assembly_trio": "Boolean"
},
"de_novo_assembly.references": [
{
"name": "String",
"fasta": {
"data": "File",
"data_index": "File"
}
],
"de_novo_assembly.zones": "String? (optional); required if backend is set to 'AWS'",
"de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'",
"de_novo_assembly.aws_on_demand_queue_arn": "String? (optional)",
"de_novo_assembly.preemptible": "Boolean",
"de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']",
"de_novo_assembly.container_registry": "String? (optional)",
}
}
"de_novo_assembly.zones": "String? (optional); required if backend is set to 'AWS'",
"de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'",
"de_novo_assembly.aws_on_demand_queue_arn": "String? (optional)",
"de_novo_assembly.preemptible": "Boolean",
"de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']",
"de_novo_assembly.container_registry": "String? (optional)"
}

0 comments on commit 49acf5c

Please sign in to comment.