Take a stab at estimating depth based on filesize.

- updated parameter_meta - updated inputs.json - cleaned up some whitespace - added comments - using fasta filesize to estimate depth rather than a separate task; based on Greg's experiments, an uncompressed 10x FASTA is ~60GB
PacificBiosciences · Sep 28, 2023 · 49acf5c · 49acf5c
1 parent 6c02327
commit 49acf5c
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 179 deletions.
diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl
@@ -73,7 +73,7 @@ workflow assemble_genome {
 	parameter_meta {
 		sample_id: {help: "Sample ID; used for naming files"}
 		reads_fastas: {help: "Reads in fasta format to be used for assembly; one for each movie bam to be used in assembly. Reads fastas from one or more sample may be combined to use in the assembly"}
-		reference: {help: "Reference genome data"}
+		references: {help: "Array of Reference genomes data"}
 		hiiasm_extra_params: {help: "[OPTIONAL] Additional parameters to pass to hifiasm assembly"}
 		father_yak: {help: "[OPTIONAL] kmer counts for the father; required if running trio-based assembly"}
 		mother_yak: {help: "[OPTIONAL] kmer counts for the mother; required if running trio-based assembly"}
@@ -98,7 +98,7 @@ task hifiasm_assemble {
 	String prefix = "~{sample_id}.asm"
 	Int threads = 48
 	Int mem_gb = threads * 6
-	Int disk_size = ceil((size(reads_fastas[0], "GB") * length(reads_fastas)) * 4 + 20)
+	Int disk_size = ceil(size(reads_fastas, "GB") * 4 + 20)
 
 	command <<<
 		set -euo pipefail
@@ -202,7 +202,8 @@ task align_hifiasm {
 	}
 
 	Int threads = 16
-	Int disk_size = ceil((size(query_sequences[0], "GB") * length(query_sequences) + size(reference, "GB")) * 2 + 20)
+	Int mem_gb = threads * 8
+	Int disk_size = ceil((size(query_sequences, "GB") + size(reference, "GB")) * 2 + 20)
 
 	command <<<
 		set -euo pipefail
@@ -218,7 +219,7 @@ task align_hifiasm {
 			~{reference} \
 			~{sep=' ' query_sequences} \
 		| samtools sort \
-			-@ 4 \
+			-@ 3 \
 			-T ./TMP \
 			-m 8G \
 			-O BAM \
@@ -235,7 +236,7 @@ task align_hifiasm {
 	runtime {
 		docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:3968cb152a65163005ffed46297127536701ec5af4c44e8f3e7051f7b01f80fe"
 		cpu: threads
-		memory: "128 GB"
+		memory: mem_gb + " GB"
 		disk: disk_size + " GB"
 		disks: "local-disk " + disk_size + " HDD"
 		preemptible: runtime_attributes.preemptible_tries

diff --git a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl
@@ -32,7 +32,6 @@ workflow de_novo_assembly_sample {
 			sample_id = sample.sample_id,
 			reads_fastas = samtools_fasta.reads_fasta,
 			references = references,
-			hifiasm_extra_params = "",
 			backend = backend,
 			default_runtime_attributes = default_runtime_attributes,
 			on_demand_runtime_attributes = on_demand_runtime_attributes
@@ -82,7 +81,7 @@ workflow de_novo_assembly_sample {
 
 	parameter_meta {
 		sample: {help: "Sample information and associated data files"}
-		reference: {help: "Reference genome data"}
+		references: {help: "Array of Reference genomes data"}
 		default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"}
 		on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"}
 	}

diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl
@@ -41,22 +41,6 @@ workflow de_novo_assembly_trio {
 			}
 		}
 
-	    # For yak, we need to know the total input coverage so we can set cloud memory resources accordingly
-		scatter (fasta in samtools_fasta_father.reads_fasta) {
-			call fasta_basecount as fasta_bc_father {
-				input:
-					reads_fasta = fasta,
-					runtime_attributes = default_runtime_attributes
-			}
-		}
-
-		call get_total_gbp as get_total_gbp_father {
-			input:
-				sample_id = father.sample_id,
-				fasta_totals = fasta_bc_father.read_total_bp,
-				runtime_attributes = default_runtime_attributes
-		}
-
 		scatter (movie_bam in mother.movie_bams) {
 			call SamtoolsFasta.samtools_fasta as samtools_fasta_mother {
 				input:
@@ -65,41 +49,32 @@ workflow de_novo_assembly_trio {
 			}
 		}
 
-	    # For yak, we need to know the total input coverage so we can set cloud memory resources accordingly
-		scatter (fasta in samtools_fasta_mother.reads_fasta) {
-			call fasta_basecount as fasta_bc_mother {
-				input:
-					reads_fasta = fasta,
-					runtime_attributes = default_runtime_attributes
-			}
-		}
-
-		call get_total_gbp as get_total_gbp_mother {
-			input:
-				sample_id = mother.sample_id,
-				fasta_totals = fasta_bc_mother.read_total_bp,
-				runtime_attributes = default_runtime_attributes
-		}
+		# if parental coverage is low (<15x), keep singleton kmers from parents and use them to bin child reads
+		# if parental coverage is high (>=15x), use bloom filter and require that a kmer occur >= 5 times in
+		#     one parent and <2 times in the other parent to be used for binning
+		# 60GB uncompressed FASTA ~= 10x coverage
+		# memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage)
+		Boolean bloom_filter = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false
 
-		call determine_yak_options {
-			input:
-			father_total_gbp = get_total_gbp_father.sample_total_gbp,
-			mother_total_gbp = get_total_gbp_mother.sample_total_gbp,				
-		}
+		String yak_params = if (bloom_filter) then "-b37" else ""
+		Int yak_mem_gb = if (bloom_filter) then 50 else 70
+		String hifiasm_extra_params = if (bloom_filter) then "" else "-c1 -d1"
 
 		call yak_count as yak_count_father {
 			input:
 				sample_id = father.sample_id,
 				reads_fastas = samtools_fasta_father.reads_fasta,
-				yak_options = determine_yak_options.yak_options,
+				yak_params = yak_params,
+				mem_gb = yak_mem_gb,
 				runtime_attributes = default_runtime_attributes
 		}
 
 		call yak_count as yak_count_mother {
 			input:
 				sample_id = mother.sample_id,
 				reads_fastas = samtools_fasta_mother.reads_fasta,
-				yak_options = determine_yak_options.yak_options,
+				yak_params = yak_params,
+				mem_gb = yak_mem_gb,
 				runtime_attributes = default_runtime_attributes
 		}
 
@@ -125,7 +100,7 @@ workflow de_novo_assembly_trio {
 					sample_id = "~{cohort.cohort_id}.~{child.sample_id}",
 					reads_fastas = samtools_fasta_child.reads_fasta,
 					references = references,
-					hifiasm_extra_params = "-c1 -d1",
+					hifiasm_extra_params = hifiasm_extra_params,
 					father_yak = yak_count_father.yak,
 					mother_yak = yak_count_mother.yak,
 					backend = backend,
@@ -142,12 +117,11 @@ workflow de_novo_assembly_trio {
 		Array[Array[File]] zipped_assembly_fastas = flatten(assemble_genome.zipped_assembly_fastas)
 		Array[Array[File]] assembly_stats = flatten(assemble_genome.assembly_stats)
 		Array[Array[IndexData]] asm_bams = flatten(assemble_genome.asm_bams)
-
 	}
 
 	parameter_meta {
 		cohort: {help: "Sample information for the cohort"}
-		references: {help: "List of reference genome data"}
+		references: {help: "Array of Reference genomes data"}
 		default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"}
 		on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"}
 	}
@@ -186,47 +160,27 @@ task parse_families {
 	}
 }
 
-task determine_yak_options {
-	input {
-		Int mother_total_gbp 
-		Int father_total_gbp
-	}
-
-	command {
-		set -e
-		if [ ~{father_total_gbp} -lt 48 ] && [ ~{mother_total_gbp} -lt 48 ]; then
-			options=""
-		else
-			options="-b37"
-		fi
-		echo $options
-	}
-	output {
-		String yak_options = read_string(stdout())
-	}
-}
-
 task yak_count {
 	input {
 		String sample_id
 		Array[File] reads_fastas
-		String yak_options
+
+		String yak_params
+		String mem_gb
 
 		RuntimeAttributes runtime_attributes
 	}
-	Int threads = 10
 
-	# Usage up to 140 GB @ 10 threads for Revio samples
-	Int mem_gb = 16 * threads
+	Int threads = 24
 	Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20)
-	
+
 	command <<<
 		set -euo pipefail
 
 		yak count \
 			-t ~{threads} \
 			-o ~{sample_id}.yak \
-			~{yak_options} \
+			~{yak_params} \
 			~{sep=' ' reads_fastas}
 	>>>
 
@@ -247,79 +201,3 @@ task yak_count {
 		zones: runtime_attributes.zones
 	}
 }
-
-task fasta_basecount {
-	input {
-		File reads_fasta
-		String reads_fasta_basename = basename(reads_fasta)
-
-		RuntimeAttributes runtime_attributes
-	}
-
-	Int threads = 1
-	Int mem_gb = 4 * threads
-
-	Int disk_size = ceil(size(reads_fasta, "GB") * 2 + 20)
-
-	command <<<
-		set -euo pipefail
-
-		grep -v "^>" ~{reads_fasta} | tr -d '\n' | wc -c > ~{reads_fasta_basename}.total
-	>>>
-
-	output {
-		File read_total_bp = "~{reads_fasta_basename}.total"
-	}
-
-	runtime {
-		docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2"
-		cpu: threads
-		memory: mem_gb + " GB"
-		disk: disk_size + " GB"
-		disks: "local-disk " + disk_size + " HDD"
-		preemptible: runtime_attributes.preemptible_tries
-		maxRetries: runtime_attributes.max_retries
-		awsBatchRetryAttempts: runtime_attributes.max_retries
-		queueArn: runtime_attributes.queue_arn
-		zones: runtime_attributes.zones
-	}
-}
-
-task get_total_gbp {
-	input {
-		String sample_id
-		Array[File] fasta_totals
-
-		RuntimeAttributes runtime_attributes
-	}
-
-	Int threads = 1
-	Int mem_gb = 4 * threads
-
-	Int disk_size = ceil(size(fasta_totals[0], "GB") * 2 + 20)
-
-	command <<<
-		set -euo pipefail
-
-		awk '{sum+=$1}END{print sum/1000000000}' ~{sep=' ' fasta_totals} > ~{sample_id}.total
-
-	>>>
-
-	output {
-		Int sample_total_gbp = round(read_float("~{sample_id}.total"))
-	}
-
-	runtime {
-		docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2"
-		cpu: threads
-		memory: mem_gb + " GB"
-		disk: disk_size + " GB"
-		disks: "local-disk " + disk_size + " HDD"
-		preemptible: runtime_attributes.preemptible_tries
-		maxRetries: runtime_attributes.max_retries
-		awsBatchRetryAttempts: runtime_attributes.max_retries
-		queueArn: runtime_attributes.queue_arn
-		zones: runtime_attributes.zones
-	}
-
-}
diff --git a/workflows/input_template.json b/workflows/input_template.json
@@ -1,31 +1,31 @@
 {
-  "de_novo_assembly.cohort": {
-    "cohort_id": "String",
-    "samples": [
-      {
-        "sample_id": "String",
-        "movie_bams": "Array[File]",
-        "sex": "String?",
-        "father_id": "String?",
-        "mother_id": "String?",
-        "run_de_novo_assembly": "Boolean"
-      }
+    "de_novo_assembly.cohort": {
+        "cohort_id": "String",
+        "samples": [
+            {
+                "sample_id": "String",
+                "movie_bams": "Array[File]",
+                "sex": "String?",
+                "father_id": "String?",
+                "mother_id": "String?",
+                "run_de_novo_assembly": "Boolean"
+            }
+        ],
+        "run_de_novo_assembly_trio": "Boolean"
+    },
+    "de_novo_assembly.references": [
+        {
+            "name": "String",
+            "fasta": {
+                "data": "File",
+                "data_index": "File"
+            }
+        }
     ],
-    "run_de_novo_assembly_trio": "Boolean"
-  },
-  "de_novo_assembly.references": [
-    {
-    "name": "String",
-    "fasta": {
-      "data": "File",
-      "data_index": "File"
-    }
-  ],
-  "de_novo_assembly.zones": "String? (optional); required if backend is set to 'AWS'",
-  "de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'",
-  "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional)",
-  "de_novo_assembly.preemptible": "Boolean",
-  "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']",
-  "de_novo_assembly.container_registry": "String? (optional)",                                                            
-  }
-}
+    "de_novo_assembly.zones": "String? (optional); required if backend is set to 'AWS'",
+    "de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'",
+    "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional)",
+    "de_novo_assembly.preemptible": "Boolean",
+    "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']",
+    "de_novo_assembly.container_registry": "String? (optional)"
+}