From 49acf5ccc5c2c1b42d8827b690a6a4e98ebcdecf Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 11:10:47 -0700 Subject: [PATCH 1/7] Take a stab at estimating depth based on filesize. - updated parameter_meta - updated inputs.json - cleaned up some whitespace - added comments - using fasta filesize to estimate depth rather than a separate task; based on Greg's experiments, an uncompressed 10x FASTA is ~60GB --- workflows/assemble_genome/assemble_genome.wdl | 11 +- .../de_novo_assembly_sample.wdl | 3 +- .../de_novo_assembly_trio.wdl | 164 +++--------------- workflows/input_template.json | 58 +++---- 4 files changed, 57 insertions(+), 179 deletions(-) diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index 66acedf..1e6807f 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -73,7 +73,7 @@ workflow assemble_genome { parameter_meta { sample_id: {help: "Sample ID; used for naming files"} reads_fastas: {help: "Reads in fasta format to be used for assembly; one for each movie bam to be used in assembly. Reads fastas from one or more sample may be combined to use in the assembly"} - reference: {help: "Reference genome data"} + references: {help: "Array of Reference genomes data"} hiiasm_extra_params: {help: "[OPTIONAL] Additional parameters to pass to hifiasm assembly"} father_yak: {help: "[OPTIONAL] kmer counts for the father; required if running trio-based assembly"} mother_yak: {help: "[OPTIONAL] kmer counts for the mother; required if running trio-based assembly"} @@ -98,7 +98,7 @@ task hifiasm_assemble { String prefix = "~{sample_id}.asm" Int threads = 48 Int mem_gb = threads * 6 - Int disk_size = ceil((size(reads_fastas[0], "GB") * length(reads_fastas)) * 4 + 20) + Int disk_size = ceil(size(reads_fastas, "GB") * 4 + 20) command <<< set -euo pipefail @@ -202,7 +202,8 @@ task align_hifiasm { } Int threads = 16 - Int disk_size = ceil((size(query_sequences[0], "GB") * length(query_sequences) + size(reference, "GB")) * 2 + 20) + Int mem_gb = threads * 8 + Int disk_size = ceil((size(query_sequences, "GB") + size(reference, "GB")) * 2 + 20) command <<< set -euo pipefail @@ -218,7 +219,7 @@ task align_hifiasm { ~{reference} \ ~{sep=' ' query_sequences} \ | samtools sort \ - -@ 4 \ + -@ 3 \ -T ./TMP \ -m 8G \ -O BAM \ @@ -235,7 +236,7 @@ task align_hifiasm { runtime { docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:3968cb152a65163005ffed46297127536701ec5af4c44e8f3e7051f7b01f80fe" cpu: threads - memory: "128 GB" + memory: mem_gb + " GB" disk: disk_size + " GB" disks: "local-disk " + disk_size + " HDD" preemptible: runtime_attributes.preemptible_tries diff --git a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl index 6481d5f..43354fc 100644 --- a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl +++ b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl @@ -32,7 +32,6 @@ workflow de_novo_assembly_sample { sample_id = sample.sample_id, reads_fastas = samtools_fasta.reads_fasta, references = references, - hifiasm_extra_params = "", backend = backend, default_runtime_attributes = default_runtime_attributes, on_demand_runtime_attributes = on_demand_runtime_attributes @@ -82,7 +81,7 @@ workflow de_novo_assembly_sample { parameter_meta { sample: {help: "Sample information and associated data files"} - reference: {help: "Reference genome data"} + references: {help: "Array of Reference genomes data"} default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"} } diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 0f155ef..f06513c 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -41,22 +41,6 @@ workflow de_novo_assembly_trio { } } - # For yak, we need to know the total input coverage so we can set cloud memory resources accordingly - scatter (fasta in samtools_fasta_father.reads_fasta) { - call fasta_basecount as fasta_bc_father { - input: - reads_fasta = fasta, - runtime_attributes = default_runtime_attributes - } - } - - call get_total_gbp as get_total_gbp_father { - input: - sample_id = father.sample_id, - fasta_totals = fasta_bc_father.read_total_bp, - runtime_attributes = default_runtime_attributes - } - scatter (movie_bam in mother.movie_bams) { call SamtoolsFasta.samtools_fasta as samtools_fasta_mother { input: @@ -65,33 +49,23 @@ workflow de_novo_assembly_trio { } } - # For yak, we need to know the total input coverage so we can set cloud memory resources accordingly - scatter (fasta in samtools_fasta_mother.reads_fasta) { - call fasta_basecount as fasta_bc_mother { - input: - reads_fasta = fasta, - runtime_attributes = default_runtime_attributes - } - } - - call get_total_gbp as get_total_gbp_mother { - input: - sample_id = mother.sample_id, - fasta_totals = fasta_bc_mother.read_total_bp, - runtime_attributes = default_runtime_attributes - } + # if parental coverage is low (<15x), keep singleton kmers from parents and use them to bin child reads + # if parental coverage is high (>=15x), use bloom filter and require that a kmer occur >= 5 times in + # one parent and <2 times in the other parent to be used for binning + # 60GB uncompressed FASTA ~= 10x coverage + # memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage) + Boolean bloom_filter = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false - call determine_yak_options { - input: - father_total_gbp = get_total_gbp_father.sample_total_gbp, - mother_total_gbp = get_total_gbp_mother.sample_total_gbp, - } + String yak_params = if (bloom_filter) then "-b37" else "" + Int yak_mem_gb = if (bloom_filter) then 50 else 70 + String hifiasm_extra_params = if (bloom_filter) then "" else "-c1 -d1" call yak_count as yak_count_father { input: sample_id = father.sample_id, reads_fastas = samtools_fasta_father.reads_fasta, - yak_options = determine_yak_options.yak_options, + yak_params = yak_params, + mem_gb = yak_mem_gb, runtime_attributes = default_runtime_attributes } @@ -99,7 +73,8 @@ workflow de_novo_assembly_trio { input: sample_id = mother.sample_id, reads_fastas = samtools_fasta_mother.reads_fasta, - yak_options = determine_yak_options.yak_options, + yak_params = yak_params, + mem_gb = yak_mem_gb, runtime_attributes = default_runtime_attributes } @@ -125,7 +100,7 @@ workflow de_novo_assembly_trio { sample_id = "~{cohort.cohort_id}.~{child.sample_id}", reads_fastas = samtools_fasta_child.reads_fasta, references = references, - hifiasm_extra_params = "-c1 -d1", + hifiasm_extra_params = hifiasm_extra_params, father_yak = yak_count_father.yak, mother_yak = yak_count_mother.yak, backend = backend, @@ -142,12 +117,11 @@ workflow de_novo_assembly_trio { Array[Array[File]] zipped_assembly_fastas = flatten(assemble_genome.zipped_assembly_fastas) Array[Array[File]] assembly_stats = flatten(assemble_genome.assembly_stats) Array[Array[IndexData]] asm_bams = flatten(assemble_genome.asm_bams) - } parameter_meta { cohort: {help: "Sample information for the cohort"} - references: {help: "List of reference genome data"} + references: {help: "Array of Reference genomes data"} default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"} } @@ -186,47 +160,27 @@ task parse_families { } } -task determine_yak_options { - input { - Int mother_total_gbp - Int father_total_gbp - } - - command { - set -e - if [ ~{father_total_gbp} -lt 48 ] && [ ~{mother_total_gbp} -lt 48 ]; then - options="" - else - options="-b37" - fi - echo $options - } - output { - String yak_options = read_string(stdout()) - } -} - task yak_count { input { String sample_id Array[File] reads_fastas - String yak_options + + String yak_params + String mem_gb RuntimeAttributes runtime_attributes } - Int threads = 10 - # Usage up to 140 GB @ 10 threads for Revio samples - Int mem_gb = 16 * threads + Int threads = 24 Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) - + command <<< set -euo pipefail yak count \ -t ~{threads} \ -o ~{sample_id}.yak \ - ~{yak_options} \ + ~{yak_params} \ ~{sep=' ' reads_fastas} >>> @@ -247,79 +201,3 @@ task yak_count { zones: runtime_attributes.zones } } - -task fasta_basecount { - input { - File reads_fasta - String reads_fasta_basename = basename(reads_fasta) - - RuntimeAttributes runtime_attributes - } - - Int threads = 1 - Int mem_gb = 4 * threads - - Int disk_size = ceil(size(reads_fasta, "GB") * 2 + 20) - - command <<< - set -euo pipefail - - grep -v "^>" ~{reads_fasta} | tr -d '\n' | wc -c > ~{reads_fasta_basename}.total - >>> - - output { - File read_total_bp = "~{reads_fasta_basename}.total" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2" - cpu: threads - memory: mem_gb + " GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - -task get_total_gbp { - input { - String sample_id - Array[File] fasta_totals - - RuntimeAttributes runtime_attributes - } - - Int threads = 1 - Int mem_gb = 4 * threads - - Int disk_size = ceil(size(fasta_totals[0], "GB") * 2 + 20) - - command <<< - set -euo pipefail - - awk '{sum+=$1}END{print sum/1000000000}' ~{sep=' ' fasta_totals} > ~{sample_id}.total - - >>> - - output { - Int sample_total_gbp = round(read_float("~{sample_id}.total")) - } - - runtime { - docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2" - cpu: threads - memory: mem_gb + " GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } - -} diff --git a/workflows/input_template.json b/workflows/input_template.json index e97ef8d..64e5d62 100644 --- a/workflows/input_template.json +++ b/workflows/input_template.json @@ -1,31 +1,31 @@ { - "de_novo_assembly.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": "Array[File]", - "sex": "String?", - "father_id": "String?", - "mother_id": "String?", - "run_de_novo_assembly": "Boolean" - } + "de_novo_assembly.cohort": { + "cohort_id": "String", + "samples": [ + { + "sample_id": "String", + "movie_bams": "Array[File]", + "sex": "String?", + "father_id": "String?", + "mother_id": "String?", + "run_de_novo_assembly": "Boolean" + } + ], + "run_de_novo_assembly_trio": "Boolean" + }, + "de_novo_assembly.references": [ + { + "name": "String", + "fasta": { + "data": "File", + "data_index": "File" + } + } ], - "run_de_novo_assembly_trio": "Boolean" - }, - "de_novo_assembly.references": [ - { - "name": "String", - "fasta": { - "data": "File", - "data_index": "File" - } - ], - "de_novo_assembly.zones": "String? (optional); required if backend is set to 'AWS'", - "de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'", - "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional)", - "de_novo_assembly.preemptible": "Boolean", - "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", - "de_novo_assembly.container_registry": "String? (optional)", - } -} + "de_novo_assembly.zones": "String? (optional); required if backend is set to 'AWS'", + "de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'", + "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional)", + "de_novo_assembly.preemptible": "Boolean", + "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", + "de_novo_assembly.container_registry": "String? (optional)" +} \ No newline at end of file From f24e1e6e95ab4329744252a4dcd6b98d3a3aae80 Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 11:14:50 -0700 Subject: [PATCH 2/7] changed flag name and fixed my flipped logic --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index f06513c..d3aa695 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -54,11 +54,11 @@ workflow de_novo_assembly_trio { # one parent and <2 times in the other parent to be used for binning # 60GB uncompressed FASTA ~= 10x coverage # memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage) - Boolean bloom_filter = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false + Boolean low_depth = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false - String yak_params = if (bloom_filter) then "-b37" else "" - Int yak_mem_gb = if (bloom_filter) then 50 else 70 - String hifiasm_extra_params = if (bloom_filter) then "" else "-c1 -d1" + String yak_params = if (low_depth) then "" else "-b37" + Int yak_mem_gb = if (low_depth) then 70 else 50 + String hifiasm_extra_params = if (low_depth) then "-c1 -d1" else "" call yak_count as yak_count_father { input: From 42d0d847d5c06f4ea16fbc8d9890f29a408ef45a Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 11:23:58 -0700 Subject: [PATCH 3/7] Adjusted tests. --- wdl-ci.config.json | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index b2e5d4f..792f307 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -150,7 +150,8 @@ "${resources_file_path}/m64017_200108_232219.hifi_reads.fasta", "${resources_file_path}/m64017_200112_090459.hifi_reads.fasta" ], - "yak_options": "-b37", + "yak_params": "-b37", + "mem_gb": 70, "runtime_attributes": "${default_runtime_attributes}" }, "output_tests": { @@ -164,21 +165,6 @@ } } ] - }, - "fasta_basecount": { - "key": "fasta_basecount", - "digest": "", - "tests": [] - }, - "get_total_gbp": { - "key": "get_total_gbp", - "digest": "", - "tests": [] - }, - "determine_yak_options": { - "key": "determine_yak_options", - "digest": "", - "tests": [] } } }, From c8a9b8d9d3a628d10e3355608acabb355cf78225 Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 11:31:20 -0700 Subject: [PATCH 4/7] Memory is an int. --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index d3aa695..2d93adb 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -166,7 +166,7 @@ task yak_count { Array[File] reads_fastas String yak_params - String mem_gb + Int mem_gb RuntimeAttributes runtime_attributes } From 42735154d3c3c8736697dbb89001c4ced04d61f0 Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 28 Sep 2023 18:51:21 +0000 Subject: [PATCH 5/7] update wdl-ci config file after successful tests --- wdl-ci.config.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 792f307..081474a 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -137,7 +137,7 @@ }, "yak_count": { "key": "yak_count", - "digest": "qysjdjudeldfcf6pm2unping3zkh4qve", + "digest": "6hlh6n3b3cqohtmjweg57of626he4c4v", "tests": [ { "inputs": { @@ -175,7 +175,7 @@ "tasks": { "hifiasm_assemble": { "key": "hifiasm_assemble", - "digest": "r4ikydzmdaed4hzsmc3t7efh6mz5e4mx", + "digest": "vhkzwee3f754jcjksog22uyps3j6myow", "tests": [ { "inputs": { @@ -264,7 +264,7 @@ }, "align_hifiasm": { "key": "align_hifiasm", - "digest": "77gs34t4c2i6epsg2epukfoaign2fmnt", + "digest": "4qf5jeepfn3jv3g2socql6xh7vmd4b7s", "tests": [ { "inputs": { From 2dc27601fd4183bcbead838d8359c5db9af59668 Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 12:16:53 -0700 Subject: [PATCH 6/7] added warning comment about estimating depth by filesize --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 2d93adb..5db4c29 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -52,7 +52,7 @@ workflow de_novo_assembly_trio { # if parental coverage is low (<15x), keep singleton kmers from parents and use them to bin child reads # if parental coverage is high (>=15x), use bloom filter and require that a kmer occur >= 5 times in # one parent and <2 times in the other parent to be used for binning - # 60GB uncompressed FASTA ~= 10x coverage + # 60GB uncompressed FASTA ~= 10x coverage (this is not robust to big changes in mean read length) # memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage) Boolean low_depth = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false From ebe6afeb0d5ec2409d6b31aaf5aa838976672bdc Mon Sep 17 00:00:00 2001 From: William Rowell Date: Fri, 29 Sep 2023 13:35:14 -0700 Subject: [PATCH 7/7] Explicitly pass default `yak count` and `hifiasm` params. --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 5db4c29..aa07ffc 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -56,9 +56,9 @@ workflow de_novo_assembly_trio { # memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage) Boolean low_depth = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false - String yak_params = if (low_depth) then "" else "-b37" + String yak_params = if (low_depth) then "-b0" else "-b37" Int yak_mem_gb = if (low_depth) then 70 else 50 - String hifiasm_extra_params = if (low_depth) then "-c1 -d1" else "" + String hifiasm_extra_params = if (low_depth) then "-c1 -d1" else "-c2 -d5" call yak_count as yak_count_father { input: