Merge pull request #76 from PacificBiosciences/feature/combine-short-tasks

Combine short tasks.
PacificBiosciences · Oct 5, 2023 · 86d268c · 86d268c
2 parents d8d539e + 57dc6b0
commit 86d268c
Show file tree

Hide file tree

Showing 4 changed files with 156 additions and 464 deletions.
diff --git a/README.md b/README.md
@@ -264,12 +264,11 @@ The Docker image used by a particular step of the workflow can be identified by
 | htslib | <ul><li>[htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/htslib) |
 | mosdepth | <ul><li>[mosdepth 0.2.9](https://github.com/brentp/mosdepth/releases/tag/v0.2.9)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/mosdepth) |
 | paraphase | <ul><li>[minimap2 2.17](https://github.com/lh3/minimap2/releases/tag/v2.17)</li><li>[samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)</li><li>[paraphase 2.2.3](https://github.com/PacificBiosciences/paraphase/releases/tag/v2.2.3)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/paraphase) |
-| parse-cohort | <ul><li>python 3.8.10; custom scripts</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/5b3e15e5da2963bb81a51170f82e37209407d5fc/docker/parse-cohort) |
 | pb-cpg-tools | <ul><li>[pb-CpG-tools v2.3.2](https://github.com/PacificBiosciences/pb-CpG-tools/releases/tag/v2.3.2)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/7481837d3b0f539adf4f64209a65cf28eebf3dba/docker/pb-cpg-tools) |
 | pbmm2 | <ul><li>[pbmm2 1.10.0](https://github.com/PacificBiosciences/pbmm2/releases/tag/v1.10.0)</li><li>[datamash 1.1.0](https://ftp.gnu.org/gnu/datamash/)</li><li>[pysam 0.16.0.1](https://github.com/pysam-developers/pysam/releases/tag/v0.16.0.1)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/pbmm2) |
 | pbsv | <ul><li>[pbsv 2.9.0](https://github.com/PacificBiosciences/pbsv/releases/tag/v2.9.0)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/pbsv) |
-| pyyaml | <ul><li>[pyyaml 5.3.1](https://github.com/yaml/pyyaml/releases/tag/5.3.1)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/pyyaml) |
+| pyyaml | <ul><li>[pyyaml 5.3.1](https://github.com/yaml/pyyaml/releases/tag/5.3.1)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/f72e862bca2f209b9909e6043ef0197975762f27/docker/pyyaml) |
 | samtools | <ul><li>[samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/samtools) |
 | slivar | <ul><li>[slivar 0.2.2](https://github.com/brentp/slivar/releases/tag/v0.2.2)</li><li>[bcftools 1.14](https://github.com/samtools/bcftools/releases/tag/1.14)</li><li>[vcfpy 0.13.3](https://github.com/bihealth/vcfpy/releases/tag/v0.13.3)</li><li>[pysam 0.19.1](https://github.com/pysam-developers/pysam/releases/tag/v0.19.1)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/slivar) |
-| svpack | <ul><li>[svpack 36180ae6](https://github.com/PacificBiosciences/svpack/tree/a82598ebc4013bf32e70295b83b380ada6302c4a)</li><li>[pysam 0.16.0.1](https://github.com/pysam-developers/pysam/releases/tag/v0.16.0.1)</li> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/svpack) |
+| svpack | <ul><li>[svpack 36180ae6](https://github.com/PacificBiosciences/svpack/tree/a82598ebc4013bf32e70295b83b380ada6302c4a)</li><li>[htslib 1.18](https://github.com/samtools/htslib/releases/tag/1.18)</li><li>[pysam 0.21.0](https://github.com/pysam-developers/pysam/releases/tag/v0.21.0)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/8edbc516abc0ff43ac279b48018003923721b054/docker/svpack) |
 | trgt | <ul><li>[trgt 0.5.0](https://github.com/PacificBiosciences/trgt/releases/tag/v0.5.0)</li><li>[samtools 1.18](https://github.com/samtools/samtools/releases/tag/1.18)</li><li>[bcftools 1.18](https://github.com/samtools/bcftools/releases/tag/1.18)</li><li>[pysam 0.21.0](https://github.com/pysam-developers/pysam/releases/tag/v0.21.0)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/d2a45e0213ac3fa631a51a48757c442d3ed550b6/docker/trgt) |
diff --git a/wdl-ci.config.json b/wdl-ci.config.json
@@ -77,13 +77,15 @@
             }
           ]
         },
-        "bcftools_roh": {
-          "key": "bcftools_roh",
-          "digest": "wyp43tacw5ovlm24ypisltgmgilpudcp",
+        "bcftools": {
+          "key": "bcftools",
+          "digest": "cbfxlhk575vhxbh6spw7ceyhn2ljf7vu",
           "tests": [
             {
               "inputs": {
                 "vcf": "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.gz",
+                "stats_params": "--apply-filters PASS --samples HG005",
+                "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
                 "runtime_attributes": "${default_runtime_attributes}"
               },
               "output_tests": {
@@ -101,6 +103,13 @@
                     "count_bed_columns",
                     "check_tab_delimited"
                   ]
+                },
+                "stats": {
+                  "value": "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.stats.txt",
+                  "test_tasks": [
+                    "compare_file_basename",
+                    "check_empty_lines"
+                  ]
                 }
               }
             }
@@ -332,36 +341,6 @@
       "description": "",
       "tasks": {}
     },
-    "workflows/wdl-common/wdl/tasks/bcftools_stats.wdl": {
-      "key": "workflows/wdl-common/wdl/tasks/bcftools_stats.wdl",
-      "name": "",
-      "description": "",
-      "tasks": {
-        "bcftools_stats": {
-          "key": "bcftools_stats",
-          "digest": "cu73ojtpnhesxaa2jh7a7l23vlieds3i",
-          "tests": [
-            {
-              "inputs": {
-                "vcf": "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.gz",
-                "params": "--apply-filters PASS --samples ${sample_id}",
-                "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
-                "runtime_attributes": "${default_runtime_attributes}"
-              },
-              "output_tests": {
-                "stats": {
-                  "value": "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.stats.txt",
-                  "test_tasks": [
-                    "compare_file_basename",
-                    "check_empty_lines"
-                  ]
-                }
-              }
-            }
-          ]
-        }
-      }
-    },
     "workflows/wdl-common/wdl/tasks/pbsv_discover.wdl": {
       "key": "workflows/wdl-common/wdl/tasks/pbsv_discover.wdl",
       "name": "",
@@ -728,14 +707,18 @@
       "name": "",
       "description": "",
       "tasks": {
-        "write_cohort_yaml": {
-          "key": "write_cohort_yaml",
-          "digest": "sqxqqo3fiojgj6t5ldw5druizzqwh2v5",
+        "write_yaml_ped_phrank": {
+          "key": "write_yaml_ped_phrank",
+          "digest": "e4yxyjj6vw35pxz434pgfalxpa4xh72n",
           "tests": [
             {
               "inputs": {
                 "cohort_id": "hg005-small-cohort",
                 "cohort_json": "${resources_file_path}/cohort.json",
+                "hpo_terms": "${datasets_file_path}/hpo/hpoTerms.txt",
+                "hpo_dag": "${datasets_file_path}/hpo/hpoDag.txt",
+                "hpo_annotations": "${datasets_file_path}/hpo/ensembl.hpoPhenotype.tsv",
+                "ensembl_to_hgnc": "${datasets_file_path}/hpo/ensembl.hgncSymbol.tsv",
                 "runtime_attributes": "${default_runtime_attributes}"
               },
               "output_tests": {
@@ -746,22 +729,7 @@
                     "compare_file_basename",
                     "check_yaml"
                   ]
-                }
-              }
-            }
-          ]
-        },
-        "write_ped": {
-          "key": "write_ped",
-          "digest": "opte5yq6pvlotpxywpqd33km3j3jb6y3",
-          "tests": [
-            {
-              "inputs": {
-                "cohort_id": "hg005-small-cohort",
-                "cohort_yaml": "${resources_file_path}/hg005-small-cohort.yml",
-                "runtime_attributes": "${default_runtime_attributes}"
-              },
-              "output_tests": {
+                },
                 "pedigree": {
                   "value": "${resources_file_path}/hg005-small-cohort.ped",
                   "test_tasks": [
@@ -770,26 +738,7 @@
                     "check_tab_delimited",
                     "count_columns"
                   ]
-                }
-              }
-            }
-          ]
-        },
-        "calculate_phrank": {
-          "key": "calculate_phrank",
-          "digest": "jpck2axvvdp6dxrf2msxn3z2p3lkcoip",
-          "tests": [
-            {
-              "inputs": {
-                "cohort_id": "hg005-small-cohort",
-                "cohort_yaml": "${resources_file_path}/hg005-small-cohort.yml",
-                "hpo_terms": "${datasets_file_path}/hpo/hpoTerms.txt",
-                "hpo_dag": "${datasets_file_path}/hpo/hpoDag.txt",
-                "hpo_annotations": "${datasets_file_path}/hpo/ensembl.hpoPhenotype.tsv",
-                "ensembl_to_hgnc": "${datasets_file_path}/hpo/ensembl.hgncSymbol.tsv",
-                "runtime_attributes": "${default_runtime_attributes}"
-              },
-              "output_tests": {
+                },
                 "phrank_lookup": {
                   "value": "${resources_file_path}/hg005-small-cohort_phrank.tsv",
                   "test_tasks": [
@@ -802,44 +751,24 @@
             }
           ]
         },
-        "bcftools_norm": {
-          "key": "bcftools_norm",
-          "digest": "5nl66yjctlih3vcwe6gio7upi6zm6st5",
-          "tests": [
-            {
-              "inputs": {
-                "vcf": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.vcf.gz",
-                "vcf_index": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.vcf.gz.tbi",
-                "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
-                "runtime_attributes": "${default_runtime_attributes}"
-              },
-              "output_tests": {
-                "normalized_bcf": {
-                  "value": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.bcf",
-                  "test_tasks": [
-                    "compare_file_basename",
-                    "check_sorted_vcf_bcf"
-                  ]
-                }
-              }
-            }
-          ]
-        },
         "slivar_small_variant": {
           "key": "slivar_small_variant",
-          "digest": "3olxcrbpuemodr32rtp5reu7hkxpvh3n",
+          "digest": "rrak4b2uphyuonanbjtyjnub2vu5mkkl",
           "tests": [
             {
               "inputs": {
-                "bcf": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.bcf",
-                "bcf_index": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.bcf.csi",
+                "vcf": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.vcf.gz",
+                "vcf_index": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.vcf.gz.tbi",
                 "pedigree": "${resources_file_path}/hg005-small-cohort.ped",
                 "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
                 "reference_index": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai",
                 "slivar_js": "${datasets_file_path}/slivar/slivar-functions.v0.2.8.js",
                 "gnomad_af": "${datasets_file_path}/GRCh38/slivar_gnotate/gnomad.hg38.v3.custom.v1.zip",
                 "hprc_af": "${datasets_file_path}/GRCh38/slivar_gnotate/hprc.deepvariant.glnexus.hg38.v1.zip",
                 "gff": "${datasets_file_path}/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz",
+                "lof_lookup": "${datasets_file_path}/slivar/lof_lookup.v2.1.1.txt",
+                "clinvar_lookup": "${datasets_file_path}/slivar/clinvar_gene_desc.20221214T183140.txt",
+                "phrank_lookup": "${resources_file_path}/hg005-small-cohort_phrank.tsv",
                 "runtime_attributes": "${default_runtime_attributes}"
               },
               "output_tests": {
@@ -850,51 +779,15 @@
                     "vcftools_validator",
                     "check_gzip"
                   ]
-                }
-              }
-            }
-          ]
-        },
-        "slivar_compound_hets": {
-          "key": "slivar_compound_hets",
-          "digest": "cj5kbogcwjmjpyeb7qwpg5pomygx2vil",
-          "tests": [
-            {
-              "inputs": {
-                "filtered_vcf": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.vcf.gz",
-                "filtered_vcf_index": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.vcf.gz.tbi",
-                "pedigree": "${resources_file_path}/hg005-small-cohort.ped",
-                "runtime_attributes": "${default_runtime_attributes}"
-              },
-              "output_tests": {
+                },
                 "compound_het_vcf": {
                   "value": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.compound_hets.vcf.gz",
                   "test_tasks": [
-                    "calculate_md5sum",
                     "compare_file_basename",
                     "vcftools_validator",
                     "check_gzip"
                   ]
-                }
-              }
-            }
-          ]
-        },
-        "slivar_tsv": {
-          "key": "slivar_tsv",
-          "digest": "tz264zjvupikaa74waq76wa3vrduejx7",
-          "tests": [
-            {
-              "inputs": {
-                "filtered_vcf": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.vcf.gz",
-                "compound_het_vcf": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.compound_hets.vcf.gz",
-                "pedigree": "${resources_file_path}/hg005-small-cohort.ped",
-                "lof_lookup": "${datasets_file_path}/slivar/lof_lookup.v2.1.1.txt",
-                "clinvar_lookup": "${datasets_file_path}/slivar/clinvar_gene_desc.20221214T183140.txt",
-                "phrank_lookup": "${resources_file_path}/hg005-small-cohort_phrank.tsv",
-                "runtime_attributes": "${default_runtime_attributes}"
-              },
-              "output_tests": {
+                },
                 "filtered_tsv": {
                   "value": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.tsv",
                   "test_tasks": [
@@ -919,7 +812,7 @@
         },
         "svpack_filter_annotated": {
           "key": "svpack_filter_annotated",
-          "digest": "picjo4pk7b7gy2nkcae4ssak6pztevqv",
+          "digest": "iyov6j7rcjp3llujj37q3clgpcbfduzh",
           "tests": [
             {
               "inputs": {
@@ -941,7 +834,7 @@
               },
               "output_tests": {
                 "svpack_vcf": {
-                  "value": "${resources_file_path}/hg005-small-cohort.GRCh38.pbsv.svpack.vcf",
+                  "value": "${resources_file_path}/hg005-small-cohort.GRCh38.pbsv.svpack.vcf.gz",
                   "test_tasks": [
                     "calculate_md5sum",
                     "compare_file_basename",

diff --git a/workflows/sample_analysis/sample_analysis.wdl b/workflows/sample_analysis/sample_analysis.wdl
@@ -5,7 +5,6 @@ version 1.0
 import "../humanwgs_structs.wdl"
 import "../wdl-common/wdl/tasks/pbsv_discover.wdl" as PbsvDiscover
 import "../wdl-common/wdl/workflows/deepvariant/deepvariant.wdl" as DeepVariant
-import "../wdl-common/wdl/tasks/bcftools_stats.wdl" as BcftoolsStats
 import "../wdl-common/wdl/tasks/mosdepth.wdl" as Mosdepth
 import "../wdl-common/wdl/tasks/pbsv_call.wdl" as PbsvCall
 import "../wdl-common/wdl/tasks/concat_vcf.wdl" as ConcatVcf
@@ -61,20 +60,14 @@ workflow sample_analysis {
 			default_runtime_attributes = default_runtime_attributes
 	}
 
-	call BcftoolsStats.bcftools_stats {
+	call bcftools {
 		input:
 			vcf = deepvariant.vcf.data,
-			params = "--apply-filters PASS --samples ~{sample.sample_id}",
+			stats_params = "--apply-filters PASS --samples ~{sample.sample_id}",
 			reference = reference.fasta.data,
 			runtime_attributes = default_runtime_attributes
 	}
 
-	call bcftools_roh {
-		input:
-			vcf = deepvariant.vcf.data,
-			runtime_attributes = default_runtime_attributes
-	}
-
 	scatter (region_set in pbsv_splits) {
 		call PbsvCall.pbsv_call {
 			input:
@@ -208,9 +201,9 @@ workflow sample_analysis {
 
 		# per sample small variant calls
 		IndexData small_variant_gvcf = deepvariant.gvcf
-		File small_variant_vcf_stats = bcftools_stats.stats
-		File small_variant_roh_out = bcftools_roh.roh_out
-		File small_variant_roh_bed = bcftools_roh.roh_bed
+		File small_variant_vcf_stats = bcftools.stats
+		File small_variant_roh_out = bcftools.roh_out
+		File small_variant_roh_bed = bcftools.roh_bed
 
 		# per sample final phased variant calls and haplotagged alignments
 		# phased_vcfs order: small variants, SVs
@@ -334,22 +327,34 @@ task pbmm2_align {
 	}
 }
 
-task bcftools_roh {
+task bcftools {
 	input {
 		File vcf
 
+		String? stats_params
+		File reference
+
 		RuntimeAttributes runtime_attributes
 	}
 
 	String vcf_basename = basename(vcf, ".vcf.gz")
+
 	Int threads = 2
-	Int disk_size = ceil(size(vcf, "GB") * 2 + 20)
+	Int reference_size = if (defined(reference)) then ceil(size(reference, "GB")) else 0
+	Int disk_size = ceil((size(vcf, "GB") + reference_size) * 2 + 20)
 
 	command <<<
 		set -euo pipefail
 
 		bcftools --version
 
+		bcftools stats \
+			--threads ~{threads - 1} \
+			~{stats_params} \
+			~{"--fasta-ref " + reference} \
+			~{vcf} \
+		> ~{vcf_basename}.vcf.stats.txt
+
 		bcftools roh \
 			--threads ~{threads - 1} \
 			--AF-dflt 0.4 \
@@ -363,6 +368,7 @@ task bcftools_roh {
 	>>>
 
 	output {
+		File stats = "~{vcf_basename}.vcf.stats.txt"
 		File roh_out = "~{vcf_basename}.bcftools_roh.out"
 		File roh_bed = "~{vcf_basename}.roh.bed"
 	}