Skip to content

Commit

Permalink
Merge pull request #76 from PacificBiosciences/feature/combine-short-tasks
Browse files Browse the repository at this point in the history

Combine short tasks.
  • Loading branch information
williamrowell authored Oct 5, 2023
2 parents d8d539e + 57dc6b0 commit 86d268c
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 464 deletions.
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -264,12 +264,11 @@ The Docker image used by a particular step of the workflow can be identified by
| htslib | <ul><li>[htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/htslib) |
| mosdepth | <ul><li>[mosdepth 0.2.9](https://github.com/brentp/mosdepth/releases/tag/v0.2.9)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/mosdepth) |
| paraphase | <ul><li>[minimap2 2.17](https://github.com/lh3/minimap2/releases/tag/v2.17)</li><li>[samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)</li><li>[paraphase 2.2.3](https://github.com/PacificBiosciences/paraphase/releases/tag/v2.2.3)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/paraphase) |
| parse-cohort | <ul><li>python 3.8.10; custom scripts</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/5b3e15e5da2963bb81a51170f82e37209407d5fc/docker/parse-cohort) |
| pb-cpg-tools | <ul><li>[pb-CpG-tools v2.3.2](https://github.com/PacificBiosciences/pb-CpG-tools/releases/tag/v2.3.2)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/7481837d3b0f539adf4f64209a65cf28eebf3dba/docker/pb-cpg-tools) |
| pbmm2 | <ul><li>[pbmm2 1.10.0](https://github.com/PacificBiosciences/pbmm2/releases/tag/v1.10.0)</li><li>[datamash 1.1.0](https://ftp.gnu.org/gnu/datamash/)</li><li>[pysam 0.16.0.1](https://github.com/pysam-developers/pysam/releases/tag/v0.16.0.1)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/pbmm2) |
| pbsv | <ul><li>[pbsv 2.9.0](https://github.com/PacificBiosciences/pbsv/releases/tag/v2.9.0)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/pbsv) |
| pyyaml | <ul><li>[pyyaml 5.3.1](https://github.com/yaml/pyyaml/releases/tag/5.3.1)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/pyyaml) |
| pyyaml | <ul><li>[pyyaml 5.3.1](https://github.com/yaml/pyyaml/releases/tag/5.3.1)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/f72e862bca2f209b9909e6043ef0197975762f27/docker/pyyaml) |
| samtools | <ul><li>[samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/samtools) |
| slivar | <ul><li>[slivar 0.2.2](https://github.com/brentp/slivar/releases/tag/v0.2.2)</li><li>[bcftools 1.14](https://github.com/samtools/bcftools/releases/tag/1.14)</li><li>[vcfpy 0.13.3](https://github.com/bihealth/vcfpy/releases/tag/v0.13.3)</li><li>[pysam 0.19.1](https://github.com/pysam-developers/pysam/releases/tag/v0.19.1)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/slivar) |
| svpack | <ul><li>[svpack 36180ae6](https://github.com/PacificBiosciences/svpack/tree/a82598ebc4013bf32e70295b83b380ada6302c4a)</li><li>[pysam 0.16.0.1](https://github.com/pysam-developers/pysam/releases/tag/v0.16.0.1)</li> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/svpack) |
| svpack | <ul><li>[svpack 36180ae6](https://github.com/PacificBiosciences/svpack/tree/a82598ebc4013bf32e70295b83b380ada6302c4a)</li><li>[htslib 1.18](https://github.com/samtools/htslib/releases/tag/1.18)</li><li>[pysam 0.21.0](https://github.com/pysam-developers/pysam/releases/tag/v0.21.0)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/8edbc516abc0ff43ac279b48018003923721b054/docker/svpack) |
| trgt | <ul><li>[trgt 0.5.0](https://github.com/PacificBiosciences/trgt/releases/tag/v0.5.0)</li><li>[samtools 1.18](https://github.com/samtools/samtools/releases/tag/1.18)</li><li>[bcftools 1.18](https://github.com/samtools/bcftools/releases/tag/1.18)</li><li>[pysam 0.21.0](https://github.com/pysam-developers/pysam/releases/tag/v0.21.0)</li></ul> | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/d2a45e0213ac3fa631a51a48757c442d3ed550b6/docker/trgt) |
169 changes: 31 additions & 138 deletions wdl-ci.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,15 @@
}
]
},
"bcftools_roh": {
"key": "bcftools_roh",
"digest": "wyp43tacw5ovlm24ypisltgmgilpudcp",
"bcftools": {
"key": "bcftools",
"digest": "cbfxlhk575vhxbh6spw7ceyhn2ljf7vu",
"tests": [
{
"inputs": {
"vcf": "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.gz",
"stats_params": "--apply-filters PASS --samples HG005",
"reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
"runtime_attributes": "${default_runtime_attributes}"
},
"output_tests": {
Expand All @@ -101,6 +103,13 @@
"count_bed_columns",
"check_tab_delimited"
]
},
"stats": {
"value": "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.stats.txt",
"test_tasks": [
"compare_file_basename",
"check_empty_lines"
]
}
}
}
Expand Down Expand Up @@ -332,36 +341,6 @@
"description": "",
"tasks": {}
},
"workflows/wdl-common/wdl/tasks/bcftools_stats.wdl": {
"key": "workflows/wdl-common/wdl/tasks/bcftools_stats.wdl",
"name": "",
"description": "",
"tasks": {
"bcftools_stats": {
"key": "bcftools_stats",
"digest": "cu73ojtpnhesxaa2jh7a7l23vlieds3i",
"tests": [
{
"inputs": {
"vcf": "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.gz",
"params": "--apply-filters PASS --samples ${sample_id}",
"reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
"runtime_attributes": "${default_runtime_attributes}"
},
"output_tests": {
"stats": {
"value": "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.stats.txt",
"test_tasks": [
"compare_file_basename",
"check_empty_lines"
]
}
}
}
]
}
}
},
"workflows/wdl-common/wdl/tasks/pbsv_discover.wdl": {
"key": "workflows/wdl-common/wdl/tasks/pbsv_discover.wdl",
"name": "",
Expand Down Expand Up @@ -728,14 +707,18 @@
"name": "",
"description": "",
"tasks": {
"write_cohort_yaml": {
"key": "write_cohort_yaml",
"digest": "sqxqqo3fiojgj6t5ldw5druizzqwh2v5",
"write_yaml_ped_phrank": {
"key": "write_yaml_ped_phrank",
"digest": "e4yxyjj6vw35pxz434pgfalxpa4xh72n",
"tests": [
{
"inputs": {
"cohort_id": "hg005-small-cohort",
"cohort_json": "${resources_file_path}/cohort.json",
"hpo_terms": "${datasets_file_path}/hpo/hpoTerms.txt",
"hpo_dag": "${datasets_file_path}/hpo/hpoDag.txt",
"hpo_annotations": "${datasets_file_path}/hpo/ensembl.hpoPhenotype.tsv",
"ensembl_to_hgnc": "${datasets_file_path}/hpo/ensembl.hgncSymbol.tsv",
"runtime_attributes": "${default_runtime_attributes}"
},
"output_tests": {
Expand All @@ -746,22 +729,7 @@
"compare_file_basename",
"check_yaml"
]
}
}
}
]
},
"write_ped": {
"key": "write_ped",
"digest": "opte5yq6pvlotpxywpqd33km3j3jb6y3",
"tests": [
{
"inputs": {
"cohort_id": "hg005-small-cohort",
"cohort_yaml": "${resources_file_path}/hg005-small-cohort.yml",
"runtime_attributes": "${default_runtime_attributes}"
},
"output_tests": {
},
"pedigree": {
"value": "${resources_file_path}/hg005-small-cohort.ped",
"test_tasks": [
Expand All @@ -770,26 +738,7 @@
"check_tab_delimited",
"count_columns"
]
}
}
}
]
},
"calculate_phrank": {
"key": "calculate_phrank",
"digest": "jpck2axvvdp6dxrf2msxn3z2p3lkcoip",
"tests": [
{
"inputs": {
"cohort_id": "hg005-small-cohort",
"cohort_yaml": "${resources_file_path}/hg005-small-cohort.yml",
"hpo_terms": "${datasets_file_path}/hpo/hpoTerms.txt",
"hpo_dag": "${datasets_file_path}/hpo/hpoDag.txt",
"hpo_annotations": "${datasets_file_path}/hpo/ensembl.hpoPhenotype.tsv",
"ensembl_to_hgnc": "${datasets_file_path}/hpo/ensembl.hgncSymbol.tsv",
"runtime_attributes": "${default_runtime_attributes}"
},
"output_tests": {
},
"phrank_lookup": {
"value": "${resources_file_path}/hg005-small-cohort_phrank.tsv",
"test_tasks": [
Expand All @@ -802,44 +751,24 @@
}
]
},
"bcftools_norm": {
"key": "bcftools_norm",
"digest": "5nl66yjctlih3vcwe6gio7upi6zm6st5",
"tests": [
{
"inputs": {
"vcf": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.vcf.gz",
"vcf_index": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.vcf.gz.tbi",
"reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
"runtime_attributes": "${default_runtime_attributes}"
},
"output_tests": {
"normalized_bcf": {
"value": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.bcf",
"test_tasks": [
"compare_file_basename",
"check_sorted_vcf_bcf"
]
}
}
}
]
},
"slivar_small_variant": {
"key": "slivar_small_variant",
"digest": "3olxcrbpuemodr32rtp5reu7hkxpvh3n",
"digest": "rrak4b2uphyuonanbjtyjnub2vu5mkkl",
"tests": [
{
"inputs": {
"bcf": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.bcf",
"bcf_index": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.bcf.csi",
"vcf": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.vcf.gz",
"vcf_index": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.vcf.gz.tbi",
"pedigree": "${resources_file_path}/hg005-small-cohort.ped",
"reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
"reference_index": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai",
"slivar_js": "${datasets_file_path}/slivar/slivar-functions.v0.2.8.js",
"gnomad_af": "${datasets_file_path}/GRCh38/slivar_gnotate/gnomad.hg38.v3.custom.v1.zip",
"hprc_af": "${datasets_file_path}/GRCh38/slivar_gnotate/hprc.deepvariant.glnexus.hg38.v1.zip",
"gff": "${datasets_file_path}/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz",
"lof_lookup": "${datasets_file_path}/slivar/lof_lookup.v2.1.1.txt",
"clinvar_lookup": "${datasets_file_path}/slivar/clinvar_gene_desc.20221214T183140.txt",
"phrank_lookup": "${resources_file_path}/hg005-small-cohort_phrank.tsv",
"runtime_attributes": "${default_runtime_attributes}"
},
"output_tests": {
Expand All @@ -850,51 +779,15 @@
"vcftools_validator",
"check_gzip"
]
}
}
}
]
},
"slivar_compound_hets": {
"key": "slivar_compound_hets",
"digest": "cj5kbogcwjmjpyeb7qwpg5pomygx2vil",
"tests": [
{
"inputs": {
"filtered_vcf": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.vcf.gz",
"filtered_vcf_index": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.vcf.gz.tbi",
"pedigree": "${resources_file_path}/hg005-small-cohort.ped",
"runtime_attributes": "${default_runtime_attributes}"
},
"output_tests": {
},
"compound_het_vcf": {
"value": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.compound_hets.vcf.gz",
"test_tasks": [
"calculate_md5sum",
"compare_file_basename",
"vcftools_validator",
"check_gzip"
]
}
}
}
]
},
"slivar_tsv": {
"key": "slivar_tsv",
"digest": "tz264zjvupikaa74waq76wa3vrduejx7",
"tests": [
{
"inputs": {
"filtered_vcf": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.vcf.gz",
"compound_het_vcf": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.compound_hets.vcf.gz",
"pedigree": "${resources_file_path}/hg005-small-cohort.ped",
"lof_lookup": "${datasets_file_path}/slivar/lof_lookup.v2.1.1.txt",
"clinvar_lookup": "${datasets_file_path}/slivar/clinvar_gene_desc.20221214T183140.txt",
"phrank_lookup": "${resources_file_path}/hg005-small-cohort_phrank.tsv",
"runtime_attributes": "${default_runtime_attributes}"
},
"output_tests": {
},
"filtered_tsv": {
"value": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.tsv",
"test_tasks": [
Expand All @@ -919,7 +812,7 @@
},
"svpack_filter_annotated": {
"key": "svpack_filter_annotated",
"digest": "picjo4pk7b7gy2nkcae4ssak6pztevqv",
"digest": "iyov6j7rcjp3llujj37q3clgpcbfduzh",
"tests": [
{
"inputs": {
Expand All @@ -941,7 +834,7 @@
},
"output_tests": {
"svpack_vcf": {
"value": "${resources_file_path}/hg005-small-cohort.GRCh38.pbsv.svpack.vcf",
"value": "${resources_file_path}/hg005-small-cohort.GRCh38.pbsv.svpack.vcf.gz",
"test_tasks": [
"calculate_md5sum",
"compare_file_basename",
Expand Down
34 changes: 20 additions & 14 deletions workflows/sample_analysis/sample_analysis.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ version 1.0
import "../humanwgs_structs.wdl"
import "../wdl-common/wdl/tasks/pbsv_discover.wdl" as PbsvDiscover
import "../wdl-common/wdl/workflows/deepvariant/deepvariant.wdl" as DeepVariant
import "../wdl-common/wdl/tasks/bcftools_stats.wdl" as BcftoolsStats
import "../wdl-common/wdl/tasks/mosdepth.wdl" as Mosdepth
import "../wdl-common/wdl/tasks/pbsv_call.wdl" as PbsvCall
import "../wdl-common/wdl/tasks/concat_vcf.wdl" as ConcatVcf
Expand Down Expand Up @@ -61,20 +60,14 @@ workflow sample_analysis {
default_runtime_attributes = default_runtime_attributes
}

call BcftoolsStats.bcftools_stats {
call bcftools {
input:
vcf = deepvariant.vcf.data,
params = "--apply-filters PASS --samples ~{sample.sample_id}",
stats_params = "--apply-filters PASS --samples ~{sample.sample_id}",
reference = reference.fasta.data,
runtime_attributes = default_runtime_attributes
}
call bcftools_roh {
input:
vcf = deepvariant.vcf.data,
runtime_attributes = default_runtime_attributes
}
scatter (region_set in pbsv_splits) {
call PbsvCall.pbsv_call {
input:
Expand Down Expand Up @@ -208,9 +201,9 @@ workflow sample_analysis {

# per sample small variant calls
IndexData small_variant_gvcf = deepvariant.gvcf
File small_variant_vcf_stats = bcftools_stats.stats
File small_variant_roh_out = bcftools_roh.roh_out
File small_variant_roh_bed = bcftools_roh.roh_bed
File small_variant_vcf_stats = bcftools.stats
File small_variant_roh_out = bcftools.roh_out
File small_variant_roh_bed = bcftools.roh_bed

# per sample final phased variant calls and haplotagged alignments
# phased_vcfs order: small variants, SVs
Expand Down Expand Up @@ -334,22 +327,34 @@ task pbmm2_align {
}
}
task bcftools_roh {
task bcftools {
input {
File vcf

String? stats_params
File reference

RuntimeAttributes runtime_attributes
}
String vcf_basename = basename(vcf, ".vcf.gz")
Int threads = 2
Int disk_size = ceil(size(vcf, "GB") * 2 + 20)
Int reference_size = if (defined(reference)) then ceil(size(reference, "GB")) else 0
Int disk_size = ceil((size(vcf, "GB") + reference_size) * 2 + 20)
command <<<
set -euo pipefail
bcftools --version
bcftools stats \
--threads ~{threads - 1} \
~{stats_params} \
~{"--fasta-ref " + reference} \
~{vcf} \
> ~{vcf_basename}.vcf.stats.txt
bcftools roh \
--threads ~{threads - 1} \
--AF-dflt 0.4 \
Expand All @@ -363,6 +368,7 @@ task bcftools_roh {
>>>
output {
File stats = "~{vcf_basename}.vcf.stats.txt"
File roh_out = "~{vcf_basename}.bcftools_roh.out"
File roh_bed = "~{vcf_basename}.roh.bed"
}
Expand Down
Loading

0 comments on commit 86d268c

Please sign in to comment.