From edee46dc4af12f1d36e14116bf2598730feb092e Mon Sep 17 00:00:00 2001 From: William Rowell Date: Fri, 19 May 2023 17:24:29 -0700 Subject: [PATCH 01/62] Added version print statements to all tasks. - Also updated to parse_cohort:1.0.2 with an option to print version. --- workflows/assemble_genome/assemble_genome.wdl | 12 ++++++++++++ .../de_novo_assembly_sample.wdl | 2 ++ .../de_novo_assembly_trio/de_novo_assembly_trio.wdl | 6 +++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index 96a733d..bdf7568 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -93,6 +93,8 @@ task hifiasm_assemble { command <<< set -euo pipefail + echo "hifiasm version: " $(hifiasm --version) + hifiasm \ -o ~{prefix} \ -t ~{threads} \ @@ -144,16 +146,22 @@ task gfa2fa { command <<< set -euo pipefail + echo "gfatools version: " $(gfatools version) + gfatools gfa2fa \ ~{gfa} \ > ~{gfa_basename}.fasta + bgzip --version + bgzip \ --threads ~{threads} \ --stdout \ ~{gfa_basename}.fasta \ > ~{gfa_basename}.fasta.gz + echo "calN50.js version: " $(k8 /opt/calN50/calN50.js -v) + # Calculate assembly stats k8 \ /opt/calN50/calN50.js \ @@ -198,6 +206,10 @@ task align_hifiasm { command <<< set -euo pipefail + echo "minimap2 version: " $(minimap2 --version) + + samtools --version + minimap2 \ -t ~{threads - 4} \ -L \ diff --git a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl index 72f7957..361eb84 100644 --- a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl +++ b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl @@ -98,6 +98,8 @@ task htsbox { # Ensure the sample is named based on the bam basename (not the full path) cp ~{bam} . + htsbox 2>&1 | grep -Eo 'Version: htslib [0-9a-z-]+, htsbox [0-9a-z-]+' + htsbox pileup \ -q20 \ -c \ diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 38431c8..d9774a6 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -122,6 +122,8 @@ task parse_families { command <<< set -euo pipefail + parse_cohort.py --version + parse_cohort.py \ --cohort_json ~{cohort_json} \ --parse_families @@ -132,7 +134,7 @@ task parse_families { } runtime { - docker: "~{runtime_attributes.container_registry}/parse-cohort@sha256:94444e7e3fd151936c9bbcb8a64b6a5e7d8c59de53b256a83f15c4ea203977b4" + docker: "~{runtime_attributes.container_registry}/parse-cohort@sha256:e6a8ac24ada706644e62878178790a0006db9a6abec7a312232052bb0666fe8f" cpu: 2 memory: "4 GB" disk: "20 GB" @@ -162,6 +164,8 @@ task yak_count { command <<< set -euo pipefail + echo "yak version: " $(yak version) + yak count \ -t ~{threads} \ -o ~{sample_id}.yak \ From 74d14fa824b25621d2de3d4b93946c8e6aa3d625 Mon Sep 17 00:00:00 2001 From: William Rowell Date: Fri, 19 May 2023 17:49:23 -0700 Subject: [PATCH 02/62] CommandShellCheck, SC2046 Quote this to prevent word splitting. 
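For reference: ShellCheck's SC2046 fires on unquoted command substitutions, whose output the shell word-splits and glob-expands before `echo` sees it. A minimal sketch of the before/after pattern applied in the diff below:

```bash
# Before: unquoted substitution; multi-word version output reaches echo as
# several separate arguments and is subject to globbing (SC2046).
echo "hifiasm version: " $(hifiasm --version)

# After: quoting the substitution passes its output through as one argument.
echo "hifiasm version: $(hifiasm --version)"
```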
--- workflows/assemble_genome/assemble_genome.wdl | 8 ++++---- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index bdf7568..8d19de6 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -93,7 +93,7 @@ task hifiasm_assemble { command <<< set -euo pipefail - echo "hifiasm version: " $(hifiasm --version) + echo "hifiasm version: $(hifiasm --version)" hifiasm \ -o ~{prefix} \ @@ -146,7 +146,7 @@ task gfa2fa { command <<< set -euo pipefail - echo "gfatools version: " $(gfatools version) + echo "gfatools version: $(gfatools version)" gfatools gfa2fa \ ~{gfa} \ @@ -160,7 +160,7 @@ task gfa2fa { ~{gfa_basename}.fasta \ > ~{gfa_basename}.fasta.gz - echo "calN50.js version: " $(k8 /opt/calN50/calN50.js -v) + echo "calN50.js version: $(k8 /opt/calN50/calN50.js -v)" # Calculate assembly stats k8 \ @@ -206,7 +206,7 @@ task align_hifiasm { command <<< set -euo pipefail - echo "minimap2 version: " $(minimap2 --version) + echo "minimap2 version: $(minimap2 --version)" samtools --version diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index d9774a6..ea8779a 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -164,7 +164,7 @@ task yak_count { command <<< set -euo pipefail - echo "yak version: " $(yak version) + echo "yak version: $(yak version)" yak count \ -t ~{threads} \ From bc86fa69d5918499db91c7cd1c3b8db4336da5f3 Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 20 May 2023 01:10:27 +0000 Subject: [PATCH 03/62] update wdl-ci config file after successful tests --- wdl-ci.config.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index e29a369..d44772b 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -115,7 +115,7 @@ "tasks": { "parse_families": { "key": "parse_families", - "digest": "fczc72mlu6iw3glecpbme5jj4ceqyqtw", + "digest": "rprxafsnidgno35awynatngwbnuw6suo", "tests": [ { "inputs": { @@ -137,7 +137,7 @@ }, "yak_count": { "key": "yak_count", - "digest": "32uc62rlhwfjfatcb4qfpr44ohg2wdlb", + "digest": "2e7nclylibk6b77wjef66o55uzs4kyyc", "tests": [ { "inputs": { @@ -173,7 +173,7 @@ "tasks": { "hifiasm_assemble": { "key": "hifiasm_assemble", - "digest": "r4ikydzmdaed4hzsmc3t7efh6mz5e4mx", + "digest": "byghh3pc6vhxsm46fw4m46wuk6rimuuy", "tests": [ { "inputs": { @@ -230,7 +230,7 @@ }, "gfa2fa": { "key": "gfa2fa", - "digest": "liyb2m4cbkovxctcgaxwunqkn5az77ev", + "digest": "kxv2q6xr3x5u2f63tch6rtw2frtvqg7n", "tests": [ { "inputs": { @@ -262,7 +262,7 @@ }, "align_hifiasm": { "key": "align_hifiasm", - "digest": "77gs34t4c2i6epsg2epukfoaign2fmnt", + "digest": "2q7e6w6bxx2vue2rzwrhpoxkbuii25fi", "tests": [ { "inputs": { From c6311029aa232e6b677310638436a73994e87755 Mon Sep 17 00:00:00 2001 From: William Rowell Date: Fri, 19 May 2023 20:33:08 -0700 Subject: [PATCH 04/62] Fix htsbox version command. 
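Background for this fix (a sketch; the task's command block runs under `set -euo pipefail`): `htsbox` has no version option, so the version string is grepped out of its usage text, and without a guard any nonzero exit in that pipeline (htsbox run with no arguments, or grep finding no match) would fail the whole task.

```bash
set -euo pipefail

# `|| true` keeps the version print from aborting the task if htsbox exits
# nonzero or the grep pattern finds no match.
htsbox 2>&1 | grep -Eo 'Version: htslib [0-9a-z-]+, htsbox [0-9a-z-]+' || true
```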
--- workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl index 361eb84..734731d 100644 --- a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl +++ b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl @@ -98,7 +98,8 @@ task htsbox { # Ensure the sample is named based on the bam basename (not the full path) cp ~{bam} . - htsbox 2>&1 | grep -Eo 'Version: htslib [0-9a-z-]+, htsbox [0-9a-z-]+' + # htsbox has no version option; grep the version from the help output; ignore errors + htsbox 2>&1 | grep -Eo 'Version: htslib [0-9a-z-]+, htsbox [0-9a-z-]+' || true htsbox pileup \ -q20 \ From 28d0011b1facf1b9eab87e666dd7fa319b146f4e Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 20 May 2023 03:42:06 +0000 Subject: [PATCH 05/62] update wdl-ci config file after successful tests --- wdl-ci.config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index d44772b..2d86da3 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -297,7 +297,7 @@ "tasks": { "htsbox": { "key": "htsbox", - "digest": "hgv6puzttllbwzgmunnigqiopcq3gl7x", + "digest": "wzaxerbnwe327lejeyudsposw4ywor7t", "tests": [ { "inputs": { From 052365fae3b5feb9d5159c2518f878b5a8ad9de0 Mon Sep 17 00:00:00 2001 From: William Rowell Date: Tue, 23 May 2023 18:33:00 -0700 Subject: [PATCH 06/62] update submodule --- workflows/wdl-common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/wdl-common b/workflows/wdl-common index 0b034ff..7fdb52c 160000 --- a/workflows/wdl-common +++ b/workflows/wdl-common @@ -1 +1 @@ -Subproject commit 0b034ff68b995b8667ff711fdbb40ce803e9892a +Subproject commit 7fdb52c9d82861e1cae403f3e6fbed1d1f6aeaf6 From fcbd54c10e36fc4e0d0f8563a7e201c4c542fc9f Mon Sep 17 00:00:00 2001 From: Billy Rowell Date: Fri, 2 Jun 2023 13:27:45 -0700 Subject: [PATCH 07/62] Create LICENSE --- LICENSE | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..aaea0c1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,34 @@ +Copyright (c) 2023, Pacific Biosciences of California, Inc. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted (subject to the limitations in the +disclaimer below) provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Pacific Biosciences nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. From de2e2187c5a74fef3a9941af5741429dd2273dd2 Mon Sep 17 00:00:00 2001 From: Heather Ward Date: Tue, 6 Jun 2023 10:05:09 -0400 Subject: [PATCH 08/62] Set max retries to 3 for HPC backend configuration --- workflows/wdl-common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/wdl-common b/workflows/wdl-common index 7fdb52c..101b444 160000 --- a/workflows/wdl-common +++ b/workflows/wdl-common @@ -1 +1 @@ -Subproject commit 7fdb52c9d82861e1cae403f3e6fbed1d1f6aeaf6 +Subproject commit 101b444a3b2bd76ea014a53da80c0e800576ebbe From 0ba71f4a44dc6f1978e2f2288506d64e5ae4cc96 Mon Sep 17 00:00:00 2001 From: Heather Ward Date: Tue, 27 Jun 2023 15:57:58 -0400 Subject: [PATCH 09/62] Format docs; add backend-specific docs --- README.md | 195 ++++++++++++------ backends/README.md | 5 + backends/aws/.gitignore | 1 + backends/aws/README.md | 129 ++++++++++++ backends/aws/agc-project.template.yaml | 167 +++++++++++++++ {workflows => backends/aws}/inputs.aws.json | 0 backends/azure/README.md | 30 +++ .../azure}/inputs.azure.json | 0 backends/gcp/README.md | 34 +++ {workflows => backends/gcp}/inputs.gcp.json | 0 backends/hpc/README.md | 53 +++++ {workflows => backends/hpc}/inputs.hpc.json | 0 backends/hpc/miniwdl.cfg | 50 +++++ .../{inputs.json => input_template.json} | 0 14 files changed, 600 insertions(+), 64 deletions(-) create mode 100644 backends/README.md create mode 100644 backends/aws/.gitignore create mode 100644 backends/aws/README.md create mode 100644 backends/aws/agc-project.template.yaml rename {workflows => backends/aws}/inputs.aws.json (100%) create mode 100644 backends/azure/README.md rename {workflows => backends/azure}/inputs.azure.json (100%) create mode 100644 backends/gcp/README.md rename {workflows => backends/gcp}/inputs.gcp.json (100%) create mode 100644 backends/hpc/README.md rename {workflows => backends/hpc}/inputs.hpc.json (100%) create mode 100644 backends/hpc/miniwdl.cfg rename workflows/{inputs.json => input_template.json} (100%) diff --git a/README.md b/README.md index e28cd5a..a333a44 100644 --- a/README.md +++ b/README.md @@ -3,54 +3,155 @@ Workflow for running de novo assembly using human PacBio whole genome sequencing (WGS) data. Written using [Workflow Description Language (WDL)](https://openwdl.org/). - Docker images used by these workflows are defined [here](https://github.com/PacificBiosciences/wdl-dockerfiles). - - Common tasks that may be reused within or between workflows are defined [here](https://github.com/PacificBiosciences/wdl-common). # Workflow -The assembly workflow performs _de novo_ assembly on samples and trios. - **Workflow entrypoint**: [workflows/main.wdl](workflows/main.wdl) -- [Blank input template file](workflows/inputs.json) -- [Azure-based inputs](workflows/inputs.azure.json) -- [AWS-based inputs](workflows/inputs.aws.json) -- [GCP-based inputs]((workflows/inputs.gcp.json)) +The assembly workflow performs _de novo_ assembly on samples and trios. 
![De novo assembly workflow diagram](workflows/main.graphviz.svg "De novo assembly workflow diagram") -# Reference datasets and associated workflow files +## Setup + +Some tasks and workflows are pulled in from other repositories. Ensure you have initialized submodules following cloning by running `git submodule update --init --recursive`. + +## Resource requirements + +The workflow requires at minimum 48 cores and 288 GB of RAM. Ensure that the backend environment you're using has enough quota to run the workflow. + +## Reference datasets and associated workflow files + +Reference datasets are hosted publicly for use in the pipeline. For data locations, see the [backend-specific documentation](backends/) and template inputs files for each backend with paths to publicly hosted reference files filled out. + +# Running the workflow + +1. [Select a backend environment](#selecting-a-backend) +2. [Configure a workflow execution engine in the chosen environment](#configuring-a-workflow-engine) +3. [Optional] [Register the engine in Workbench](#registering-a-workflow-engine-in-workbench) +4. [Fill out the inputs JSON file for your cohort](#filling-out-the-inputs-json) +5. [Run the workflow](#running-the-workflow-1) + +## Selecting a backend + +The workflow can be run on Azure, AWS, GCP, or HPC. Your choice of backend will largely be determined by the location of your data. + +For backend-specific configuration, see the relevant documentation: + +- [Azure](backends/azure) +- [AWS](backends/aws) +- [GCP](backends/gcp) +- [HPC](backends/hpc) + +## Configuring a workflow engine + +An execution engine is required to run workflows. Two popular engines for running WDL-based workflows are [`miniwdl`](https://miniwdl.readthedocs.io/en/latest/getting_started.html) and [`Cromwell`](https://cromwell.readthedocs.io/en/stable/tutorials/FiveMinuteIntro/). + +See [Workbench's documentation](https://docs.dnastack.com/docs/introduction-to-engines-and-backends) as well as the [backend-specific documentation](backends) for details on setting up an engine. + +| Engine | Azure | AWS | GCP | HPC | +| :- | :- | :- | :- | :- | +| [**miniwdl**](https://github.com/chanzuckerberg/miniwdl#scaling-up) | _Unsupported_ | Supported via the [Amazon Genomics CLI](https://aws.amazon.com/genomics-cli/) | _Unsupported_ | (SLURM only) Supported via the [`miniwdl-slurm`](https://github.com/miniwdl-ext/miniwdl-slurm) plugin | +| [**Cromwell**](https://cromwell.readthedocs.io/en/stable/backends/Backends/) | Supported via [Cromwell on Azure](https://github.com/microsoft/CromwellOnAzure) | Supported via the [Amazon Genomics CLI](https://aws.amazon.com/genomics-cli/) | Supported via Google's [Pipelines API](https://cromwell.readthedocs.io/en/stable/backends/Google/) | Supported - [Configuration varies depending on HPC infrastructure](https://cromwell.readthedocs.io/en/stable/tutorials/HPCIntro/) | + +## Registering a workflow engine in Workbench + +Once an engine has been configured, it can optionally be registered in [Workbench](https://workbench.dnastack.com/) to enable a unified interface for workflow submission, monitoring, and statistics. Once configured, workflow runs may be submitted either [via the browser](https://docs.dnastack.com/docs/accessing-the-workbench-gui) or [via the Workbench CLI](#run-using-workbench). + +See [Workbench's documentation](https://docs.dnastack.com/docs/connecting-to-a-workflow-engine) for details on how to register an engine in Workbench. 
Backend-specific resources and default configurations that may be required as part of engine setup may also be found in the [backends](backends) directory. + +Workbench requires a license to use. For information on obtaining a license or to set up a demo, please contact [support@dnastack.com](mailto:support@dnastack.com). + +## Filling out the inputs JSON + +The input to a workflow run is defined in JSON format. Template input files with reference dataset information filled out are available for each backend: + +- [Azure](backends/azure/inputs.azure.json) +- [AWS](backends/aws/inputs.aws.json) +- [GCP](backends/gcp/inputs.gcp.json) +- [HPC](backends/hpc/inputs.hpc.json) + +Using the appropriate inputs template file, fill in the cohort and sample information (see [Workflow Inputs](#workflow-inputs) for more information on the input structure). + +If using an HPC backend, you will need to download the reference bundle and replace the `` in the input template file with the local path to the reference datasets on your HPC. + +## Running the workflow + +Run the workflow using the engine and backend that you have configured ([miniwdl](#run-directly-using-miniwdl), [Cromwell](#run-directly-using-cromwell), [Workbench](#run-using-workbench)). + +Note that the calls to `miniwdl` and `Cromwell` assume you are accessing the engine directly on the machine on which it has been deployed. Depending on the backend you have configured, you may be able to submit workflows using different methods (e.g. using trigger files in Azure, or using the Amazon Genomics CLI in AWS). Calls to the Workbench CLI will be the same regardless of the engine/backend combination. + +### Run directly using miniwdl + +`miniwdl run workflows/main.wdl -i ` -Reference datasets are hosted publicly for use in the pipeline. For data locations, see `workflows/inputs.${backend}.json`. +### Run directly using Cromwell -## Reference data hosted in Azure +`java -jar run workflows/main.wdl -i ` -To use Azure reference data, add the following line to your `containers-to-mount` file in your Cromwell on Azure installation ([more info here](https://github.com/microsoft/CromwellOnAzure/blob/develop/docs/troubleshooting-guide.md#use-input-data-files-from-an-existing-azure-storage-account-that-my-lab-or-team-is-currently-using)): +### Run using Workbench -`https://datasetpbrarediseases.blob.core.windows.net/dataset?si=public&spr=https&sv=2021-06-08&sr=c&sig=o6OkcqWWlGcGOOr8I8gCA%2BJwlpA%2FYsRz0DMB8CCtCJk%3D` +Rather than running a workflow directly using an engine, engines can be configured using [Workbench](https://workbench.dnastack.com/). Workbench presents a unified interface to the respective backends and engines. Workflow runs may be submitted and monitored either [directly in-browser](https://docs.dnastack.com/docs/accessing-the-workbench-gui) or using the command-line interface (CLI) (see below). -The [Azure input file template](workflows/inputs.azure.json) has paths to the reference files in this blob storage prefilled. +Note that these steps assume you have already [set up and registered an engine in Workbench](https://docs.dnastack.com/docs/workbench-settings). -## Reference data hosted in AWS +1. [Install and configure the DNAstack CLI](#installing-and-configuring-the-dnastack-cli) +2. [Register the workflow on Workbench](#registering-the-workflow-on-workbench) +3. [Submit a workflow run](#submitting-workflow-runs-via-workbench) -AWS reference data is hosted in the `us-west-2` region in the bucket `s3://dnastack-resources`. 
+Steps (1) and (2) are one-time setup, following which any number of workflow runs may be submitted. -To use AWS reference data, add the following line to the data section of your [`agc-project.yaml`](https://aws.github.io/amazon-genomics-cli/docs/concepts/projects/): +For assistance and licensing, please contact [support@dnastack.com](mailto:support@dnastack.com). -```yaml -data: - - location: s3://dnastack-resources - readOnly: true +#### Installing and configuring the DNAstack CLI + +1. Install the DNAstack CLI + +`python3 -m pip install --user dnastack-client-library` + +Confirm that the CLI is installed and available by running `dnastack --version`. + +2. Authenticate using the CLI + +`dnastack auth login` + +3. Configure the CLI to use workbench + +`dnastack use workbench.dnastack.com` + +You can now use the DNAstack CLI to interact with Workbench. + +#### Registering the workflow on Workbench + +From the root of this repository, run: + +```bash +dnastack alpha workbench workflows create \ + --name "PacBio Human Assembly" \ + --description =@README.md \ + workflows/main.wdl ``` +Note the `internalId` field of the returned JSON. This will be used as the `--url` value when submitting workflow runs. -The [AWS input file template](workflows/inputs.aws.json) has paths to the reference files in the blob storage prefilled. +This step only needs to be completed once, when initially registering the workflow. Following this initial setup, additional runs may be submitted by using the same `internalId` recorded here. -## Reference data hosted in GCP +#### Submitting workflow runs via Workbench - +In the following command, replace `` with the path to your filled out inputs file, and `` with the ID you noted in step (1). If no engine is provided, the default engine you have configured will be used. + +```bash +dnastack workbench runs submit \ + --workflow-params @ \ + --url \ + [--tags ] \ + [--engine ] +``` # Workflow inputs +This section describes the inputs required for a run of the workflow. Typically, only the `de_novo_assembly.cohort` and potentially [run/backend-specific sections](#other-inputs) will be filled out by the user for each run of the workflow. Input templates with reference file locations filled out are provided [for each backend](backends). + ## [Cohort](workflows/humanwgs_structs.wdl) A cohort can include one or more samples. Samples need not be related. @@ -78,7 +179,7 @@ Sample information for each sample in the workflow run. Files associated with the reference genome. -These files are hosted publicly in each of the cloud backends; see `workflows/inputs.${backend}.json`. +These files are hosted publicly in each of the cloud backends; see `backends/${backend}/inputs.${backend}.json`. | Type | Name | Description | Notes | | :- | :- | :- | :- | @@ -89,45 +190,11 @@ These files are hosted publicly in each of the cloud backends; see `workflows/in | Type | Name | Description | Notes | | :- | :- | :- | :- | -| String | backend | Backend where the workflow will be executed | \["Azure", "AWS", "GCP"\] | -| String? | zones | Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'. | [Determining available zones in AWS and GCP](#determining-available-zones-in-aws-and-gcp). | -| String? | aws_spot_queue_arn | Queue ARN for the spot batch queue; required if backend is set to 'AWS' and `preemptible` is set to `true` | [Determining the AWS queue ARN](#determining-the-aws-batch-queue-arn) | -| String? 
| aws_on_demand_queue_arn | Queue ARN for the on demand batch queue; required if backend is set to 'AWS' and `preemptible` is set to `false` | [Determining the AWS queue ARN](#determining-the-aws-batch-queue-arn) | -| Boolean | preemptible | If set to `true`, run tasks preemptibly where possible. On-demand VMs will be used only for tasks that run for >24 hours if the backend is set to GCP. If set to `false`, on-demand VMs will be used for every task. | \[true, false\] | - -### Determining available zones in AWS and GCP - -#### AWS - -To determine available zones in AWS, look for the ZoneName attributes output by the following command: - -```bash -aws ec2 describe-availability-zones --region -``` -For example, the zones in region us-east-2 are `"us-east-2a us-east-2b us-east-2c"`. - -#### GCP - -To determine available zones in GCP, run the following; available zones within a region can be found in the first column of the output: - -```bash -gcloud compute zones list | grep -``` - -For example, the zones in region us-central1 are `"us-central1-a us-central1-b us-central1c us-central1f"`. - -### Determining the AWS batch queue ARN - -**Note that if you are using a `miniwdl` engine, you can skip these steps; workflows run via miniwdl will run exclusively in the job queue to which they are submitted.** - -1. Visit [the AWS console](https://console.aws.amazon.com/). -2. Navigate to the Batch service. -3. In the lefthand sidebar, select "Compute environments". Note the name of the compute environment with the provisioning model SPOT (if you have deployed a context using spot instances) and the name of the compute environment with provisioning model "EC2" (if you have deployed a context that does not use spot instances). -4. In the lefthand sidebar, select "Job queues". -5. Clicking into an individual queue will show information about the compute environment ("Compute environment order"). Identify the job queue with the Compute environment name that matches the name you identified for the SPOT compute environment; copy the Amazon Resource Name (ARN) for this job queue. This is the value that should be used for the `aws_spot_queue_arn`. Repeat this process to find the ARN for the `aws_on_demand_queue_arn`. - -- If `preemptible = true`, only the `aws_spot_queue_arn` is required. -- If `preemptible = false`, only the `aws_on_demand_queue_arn` is required. +| String | backend | Backend where the workflow will be executed | \["Azure", "AWS", "GCP", "HPC"\] | +| String? | zones | Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'. |
• [Determining available zones in AWS](backends/aws/README.md#determining-available-zones)<br>• [Determining available zones in GCP](backends/gcp/README.md#determining-available-zones)
| +| String? | aws_spot_queue_arn | Queue ARN for the spot batch queue; required if backend is set to 'AWS' and `preemptible` is set to `true` | [Determining the AWS queue ARN](backends/aws/README.md#determining-the-aws-batch-queue-arn) | +| String? | aws_on_demand_queue_arn | Queue ARN for the on demand batch queue; required if backend is set to 'AWS' and `preemptible` is set to `false` | [Determining the AWS queue ARN](backends/aws/README.md#determining-the-aws-batch-queue-arn) | +| Boolean | preemptible | If set to `true`, run tasks preemptibly where possible. On-demand VMs will be used only for tasks that run for >24 hours if the backend is set to GCP. If set to `false`, on-demand VMs will be used for every task. Ignored if backend is set to HPC. | \[true, false\] | # Workflow outputs @@ -160,7 +227,7 @@ These files will be output if `cohort.de_novo_assembly_trio` is set to `true` an # Tool versions and Docker images -Docker images definitions used by the human WGS workflow can be found in [the wdl-dockerfiles repository](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a). Images are hosted in PacBio's [quay.io](https://quay.io/organization/pacbio). Docker images used in the workflow are pegged to specific versions by referring to their digests rather than tags. +Docker images definitions used by this workflow can be found in [the wdl-dockerfiles repository](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a). Images are hosted in PacBio's [quay.io](https://quay.io/organization/pacbio). Docker images used in the workflow are pegged to specific versions by referring to their digests rather than tags. The Docker image used by a particular step of the workflow can be identified by looking at the `docker` key in the `runtime` block for the given task. Images can be referenced in the following table by looking for the name after the final `/` character and before the `@sha256:...`. For example, the image referred to here is "align_hifiasm": > ~{runtime_attributes.container_registry}/**align_hifiasm**@sha256:3968cb<...>b01f80fe diff --git a/backends/README.md b/backends/README.md new file mode 100644 index 0000000..17505f4 --- /dev/null +++ b/backends/README.md @@ -0,0 +1,5 @@ +# Backend and engine-specific configuration + +Example configuration specific to each backend is provided here. + +For detailed instructions on configuring engines in different backends, see the [Workbench documentation for configuring engines](https://docs.dnastack.com/docs/workbench-settings). diff --git a/backends/aws/.gitignore b/backends/aws/.gitignore new file mode 100644 index 0000000..10ef663 --- /dev/null +++ b/backends/aws/.gitignore @@ -0,0 +1 @@ +agc-project.yaml diff --git a/backends/aws/README.md b/backends/aws/README.md new file mode 100644 index 0000000..ca609a9 --- /dev/null +++ b/backends/aws/README.md @@ -0,0 +1,129 @@ +# Configuring the Amazon Genomics CLI + +The Amazon Genomics CLI (`agc`) allows users to orchestrate workflow execution using AWS Batch. See the [Workbench documentation](https://docs.dnastack.com/docs/cromwell-on-aws-amazon-genomics-cli) for information on installing and using the `agc` to configure and run workflows. The following section provides additional information on deploying a project using the `agc`. + +## Deploying a context with `agc` + +Once you have installed and authenticated with the `agc`, you can deploy a context using an agc project YAML file. 
This file must be named `agc-project.yaml`.
+
+An [example agc-project.yaml file](agc-project.template.yaml) that has the workflow, reference data source, and both on-demand and spot contexts configured using Cromwell as the engine is provided here. This will create an agc project named `humanassemblyAgc`, with either (or both) a `spotContext` or an `onDemandContext`. The `spotContext` will allow you to run workflows using [AWS spot instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-instances.html), which can result in substantial cost savings relative to using [on-demand instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-on-demand-instances.html).
+
+Note that deploying a context **will incur costs** even if you are not actively running workflows; ensure that [contexts that are not in use are destroyed](https://aws.github.io/amazon-genomics-cli/docs/reference/agc_context_destroy/) to avoid incurring ongoing costs.
+
+To deploy the agc project using the template file, first copy the template file to a file named `agc-project.yaml` (`cp agc-project.template.yaml agc-project.yaml`).
+
+In the `data` section of the `agc-project.yaml` file, add any additional s3 buckets that the workflow will require access to, for example the bucket containing sample input data. Make sure that you do not remove the section granting access to the s3://dnastack-resources bucket; this is where [reference datasets are hosted](#reference-data-hosted-in-aws).
+
+```
+data:
+  - location: s3://dnastack-resources
+    readOnly: true
+  - location: s3://<sample_data_bucket_name>
+    readOnly: true
+```
+
+Then from the directory containing the `agc-project.yaml` file, run:
+
+```bash
+agc context deploy --context ${context}
+```
+
+Where `${context}` is either `spotContext` or `onDemandContext`.
+
+If you want both spot and on-demand contexts, all contexts can be deployed at once by running:
+
+```
+agc context deploy --all
+```
+
+Note that the `miniwdl` engine run via AWS is currently not supported for this workflow.
+
+# Checking and requesting quota in AWS
+
+See [resource requirements](../../README.md#resource-requirements) for information on the minimum requirements for running the workflow. Typically in a new AWS environment, additional vCPU quota will be required.
+
+## Checking current quota
+
+1. Navigate to [the AWS console](https://console.aws.amazon.com/).
+2. In the top right corner, select the region where your `agc` deployment is located.
+3. Navigate to EC2.
+4. In the menu on the left, select 'Limits'.
+5. Filter the limits by searching for "Standard". The current limit field indicates the number of vCPUs that you currently have access to.
+- Spot instance limit: `All Standard (A, C, D, H, I, M, R, T, Z) Spot Instance Requests`
+- On-demand instance limit: `Running On-Demand All Standard (A, C, D, H, I, M, R, T, Z) instances`
+
+If the number of vCPUs in the context you plan to run the workflow in is less than the limits specified in [the resource requirements](../../README.md#resource-requirements) section, you will need to request additional quota before you can run the workflow.
+
+## Requesting additional quota
+
+5. Continuing from the steps outlined in [checking the current quota](#checking-current-quota), select the service you want to request an increase for.
+6. In the top right corner, select 'Request limit increase'.
+7. 
Fill out the appropriate fields in the request form, ensuring that the region you select is the region where you have deployed your `agc` and where your data is located. 256 vCPUs are recommended for running trio data.
+
+Low quota increase requests are typically fulfilled within 1-2 hours.
+
+# Configuring and running the workflow
+
+## Filling out workflow inputs
+
+Fill out any information missing in [the inputs file](inputs.aws.json). Ensure that all data files used by the workflow are at locations that have been configured in the agc-project.yaml file; see the [granting access to other data files](#granting-access-to-other-data-files) section for more information.
+
+See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file.
+
+Note that you only need to fill out the queueArn corresponding to the context you are submitting the workflow to (spot or on-demand).
+
+### Determining available zones
+
+To determine available zones in AWS, look for the `ZoneName` attribute output by the following command:
+
+```bash
+aws ec2 describe-availability-zones --region <region>
+```
+
+For example, the zones in region us-east-2 are `"us-east-2a us-east-2b us-east-2c"`.
+
+### Determining the AWS batch queue ARN
+
+**Note that if you are using a `miniwdl` engine, you can skip these steps; workflows run via miniwdl will run exclusively in the job queue to which they are submitted.**
+
+1. Visit [the AWS console](https://console.aws.amazon.com/).
+2. Navigate to the Batch service.
+3. In the lefthand sidebar, select "Compute environments". Note the name of the compute environment with the provisioning model SPOT (if you have deployed a context using spot instances) and the name of the compute environment with provisioning model "EC2" (if you have deployed a context that does not use spot instances).
+4. In the lefthand sidebar, select "Job queues".
+5. Clicking into an individual queue will show information about the compute environment ("Compute environment order"). Identify the job queue with the Compute environment name that matches the name you identified for the SPOT compute environment; copy the Amazon Resource Name (ARN) for this job queue. This is the value that should be used for the `aws_spot_queue_arn`. Repeat this process to find the ARN for the `aws_on_demand_queue_arn`.
+
+- If `preemptible = true`, only the `aws_spot_queue_arn` is required.
+- If `preemptible = false`, only the `aws_on_demand_queue_arn` is required.
+
+## Running the workflow
+
+### Running via `agc`
+
+From the directory where your `agc-project.yaml` is located, run:
+
+`agc workflow run humanassembly --context <context> --inputsFile <input_file_path.json>`
+
+The running workflow can be monitored via [`agc workflow` commands](https://aws.github.io/amazon-genomics-cli/docs/reference/agc_workflow/), or via the AWS console.
+
+### Running via Workbench
+
+1. [Register the engine in Workbench](https://docs.dnastack.com/docs/connecting-to-a-workflow-engine)
+2. [Follow the instructions in the README to run the workflow via Workbench](../../README.md#run-using-workbench)
+
+# Reference data hosted in AWS
+
+AWS reference data is hosted in the `us-west-2` region in the bucket `s3://dnastack-resources`. 
+ +To use AWS reference data, add the following line to the data section of your [`agc-project.yaml`](https://aws.github.io/amazon-genomics-cli/docs/concepts/projects/): + +```yaml +data: + - location: s3://dnastack-resources + readOnly: true +``` + +The [AWS input file template](inputs.aws.json) has paths to the reference files in s3 prefilled. The template [agc-project.template.yaml file](agc-project.template.yaml) has this section filled out already. + +### Granting access to other data files + +S3 buckets outside of the reference files can be accessed by adding additional data blocks to the agc-project.yaml file. See the [agc documentation](https://aws.github.io/amazon-genomics-cli/docs/concepts/data/) for more details on adding additional data sources. All inputs referenced in the inputs.json file will need to be at locations that have been configured in the agc-project.yaml. diff --git a/backends/aws/agc-project.template.yaml b/backends/aws/agc-project.template.yaml new file mode 100644 index 0000000..ef58c89 --- /dev/null +++ b/backends/aws/agc-project.template.yaml @@ -0,0 +1,167 @@ +name: humanassemblyAgc +schemaVersion: 1 +data: + - location: s3://dnastack-resources + readOnly: true +workflows: + humanassembly: + type: + language: wdl + version: 1.0 + sourceURL: ../../workflows +contexts: + onDemandContext: + instanceTypes: [ + "c5.large", + "c5.xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.metal", + "c5a.large", + "c5a.xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5n.large", + "c5n.xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.18xlarge", + "m5.large", + "m5.xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5a.large", + "m5a.xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5n.large", + "m5n.xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "r5.large", + "r5.xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5a.large", + "r5a.xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5n.large", + "r5n.xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + ] + engines: + - type: wdl + engine: cromwell + spotContext: + requestSpotInstances: true + instanceTypes: [ + "c5.large", + "c5.xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.metal", + "c5a.large", + "c5a.xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5n.large", + "c5n.xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.18xlarge", + "m5.large", + "m5.xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5a.large", + "m5a.xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5n.large", + "m5n.xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "r5.large", + "r5.xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + 
"r5a.large", + "r5a.xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5n.large", + "r5n.xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + ] + engines: + - type: wdl + engine: cromwell diff --git a/workflows/inputs.aws.json b/backends/aws/inputs.aws.json similarity index 100% rename from workflows/inputs.aws.json rename to backends/aws/inputs.aws.json diff --git a/backends/azure/README.md b/backends/azure/README.md new file mode 100644 index 0000000..05c9ca6 --- /dev/null +++ b/backends/azure/README.md @@ -0,0 +1,30 @@ +# Configuring Cromwell on Azure + +Workflows can be run in Azure by setting up [Cromwell on Azure (CoA)](https://github.com/microsoft/CromwellOnAzure). Documentation on deploying and configuring an instance of CoA can be found [here](https://github.com/microsoft/CromwellOnAzure/wiki/Deploy-your-instance-of-Cromwell-on-Azure). + +## Requirements + +- [Cromwell on Azure](https://github.com/microsoft/CromwellOnAzure) version 3.2+; version 4.0+ is recommended + +# Configuring and running the workflow + +## Filling out workflow inputs + +Fill out any information missing in [the inputs file](inputs.azure.json). + +See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file. + +## Running the workflow + +### Running via Workbench + +1. [Register the engine in Workbench](https://docs.dnastack.com/docs/connecting-to-a-workflow-engine) +2. [Follow the instructions in the README to run the workflow via Workbench](../../README.md#run-using-workbench) + +# Reference data hosted in Azure + +To use Azure reference data, add the following line to your `containers-to-mount` file in your Cromwell on Azure installation ([more info here](https://github.com/microsoft/CromwellOnAzure/blob/develop/docs/troubleshooting-guide.md#use-input-data-files-from-an-existing-azure-storage-account-that-my-lab-or-team-is-currently-using)): + +`https://datasetpbrarediseases.blob.core.windows.net/dataset?si=public&spr=https&sv=2021-06-08&sr=c&sig=o6OkcqWWlGcGOOr8I8gCA%2BJwlpA%2FYsRz0DMB8CCtCJk%3D` + +The [Azure input file template](inputs.azure.json) has paths to the reference files in this blob storage prefilled. diff --git a/workflows/inputs.azure.json b/backends/azure/inputs.azure.json similarity index 100% rename from workflows/inputs.azure.json rename to backends/azure/inputs.azure.json diff --git a/backends/gcp/README.md b/backends/gcp/README.md new file mode 100644 index 0000000..7b8ffb7 --- /dev/null +++ b/backends/gcp/README.md @@ -0,0 +1,34 @@ +# Configuring Cromwell on GCP + +[Cromwell's documentation](https://cromwell.readthedocs.io/en/stable/tutorials/PipelinesApi101/) on getting started with Google's genomics Pipelines API can be used to set up the resources needed to run the workflow. + +# Configuring and running the workflow + +## Filling out workflow inputs + +Fill out any information missing in [the inputs file](inputs.gcp.json). + +See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file. 
+
+### Determining available zones
+
+To determine available zones in GCP, run the following; available zones within a region can be found in the first column of the output:
+
+```bash
+gcloud compute zones list | grep <region>
+```
+
+For example, the zones in region us-central1 are `"us-central1-a us-central1-b us-central1-c us-central1-f"`.
+
+## Running the workflow
+
+### Running via Workbench
+
+1. [Register the engine in Workbench](https://docs.dnastack.com/docs/connecting-to-a-workflow-engine)
+2. [Follow the instructions in the README to run the workflow via Workbench](../../README.md#run-using-workbench)
+
+# Reference data hosted in GCP
+
+GCP reference data is hosted in the `us-west1` region in the bucket `gs://pacbio-wdl`. This bucket is requester-pays, meaning that users will need to [provide a billing project in their Cromwell configuration](https://cromwell.readthedocs.io/en/stable/filesystems/GoogleCloudStorage/) in order to use files located in this bucket.
+
+To avoid egress charges, Cromwell should be set up to spin up compute resources in the same region in which the data is located. If possible, add cohort data to the same region as the reference dataset, or consider mirroring this dataset in the region where your data is located. See [Google's information about data storage and egress charges for more information](https://cloud.google.com/storage/pricing).
diff --git a/workflows/inputs.gcp.json b/backends/gcp/inputs.gcp.json
similarity index 100%
rename from workflows/inputs.gcp.json
rename to backends/gcp/inputs.gcp.json
diff --git a/backends/hpc/README.md b/backends/hpc/README.md
new file mode 100644
index 0000000..d3ab650
--- /dev/null
+++ b/backends/hpc/README.md
@@ -0,0 +1,53 @@
+Either `miniwdl` or `Cromwell` can be used to run workflows on the HPC.
+
+# Installing and configuring `miniwdl`
+
+## Requirements
+
+- [`miniwdl`](https://github.com/chanzuckerberg/miniwdl) >= 1.9.0
+- [`miniwdl-slurm`](https://github.com/miniwdl-ext/miniwdl-slurm)
+
+## Configuring
+
+An [example miniwdl.cfg file](miniwdl.cfg) is provided here. This should be placed at `~/.config/miniwdl.cfg` and edited to match your slurm configuration. This allows running workflows using a basic SLURM setup.
+
+# Installing and configuring `Cromwell`
+
+Cromwell supports a number of different HPC backends; see [Cromwell's documentation](https://cromwell.readthedocs.io/en/stable/backends/HPC/) for more information on configuring each of the backends.
+
+# Configuring and running the workflow
+
+## Filling out workflow inputs
+
+Fill out any information missing in [the inputs file](inputs.hpc.json). Once you have downloaded the reference data bundle, ensure that you have replaced the `<local_path_prefix>` in the input template file with the local path to the reference datasets on your HPC.
+
+See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file.
+
+## Running the workflow
+
+### Running via miniwdl
+
+`miniwdl run workflows/main.wdl -i <input_file_path.json>`
+
+### Running via Cromwell
+
+`cromwell run workflows/main.wdl -i <input_file_path.json>`
+
+### Running via Workbench
+
+1. [Register the engine in Workbench](https://docs.dnastack.com/docs/connecting-to-a-workflow-engine)
+2. 
[Follow the instructions in the README to run the workflow via Workbench](../../README.md#run-using-workbench)
+
+# Reference data bundle
+
+![https://doi.org/10.5281/zenodo.7922357](https://zenodo.org/badge/DOI/10.5281/zenodo.7922357.svg)
+
+Reference data is hosted on Zenodo at [10.5281/zenodo.7922357](https://zenodo.org/record/7922357). Download the reference data bundle and extract it to a location on your HPC, then update the input template file with the path to the reference data.
+
+```bash
+# download the reference data bundle
+wget https://zenodo.org/record/7922357/files/wdl-humanwgs.v1.0.0.resources.tgz
+
+# extract the reference data bundle and rename as dataset
+tar -xzf wdl-humanwgs.v1.0.0.resources.tgz && mv static_resources dataset
+```
diff --git a/workflows/inputs.hpc.json b/backends/hpc/inputs.hpc.json
similarity index 100%
rename from workflows/inputs.hpc.json
rename to backends/hpc/inputs.hpc.json
diff --git a/backends/hpc/miniwdl.cfg b/backends/hpc/miniwdl.cfg
new file mode 100644
index 0000000..3bdd33d
--- /dev/null
+++ b/backends/hpc/miniwdl.cfg
@@ -0,0 +1,50 @@
+[scheduler]
+container_backend = slurm_singularity
+# task_concurrency defaults to the number of processors on the system.
+# since we submit the jobs to SLURM this is not necessary.
+# higher numbers means miniwdl has to monitor more processes simultaneously
+# which might impact performance.
+task_concurrency=200
+
+# This setting allows running tasks to continue, even if another task fails.
+# Useful in combination with call caching. Prevents wasting resources by
+# cancelling jobs half-way that would probably succeed.
+fail_fast = false
+
+[call_cache]
+# The following settings create a call cache under the current directory.
+# This prevents wasting unnecessary resources on the cluster by rerunning
+# jobs that have already succeeded.
+put = true
+get = true
+dir = "$PWD/miniwdl_call_cache"
+
+[task_runtime]
+# Setting a 'maxRetries' default allows jobs that fail due to intermittent
+# errors on the cluster to be retried.
+## Requires miniwdl >= 1.9.0
+command_shell = /bin/bash
+defaults = {
+        "maxRetries": 2,
+        "docker": "ubuntu:20.04"
+    }
+
+[singularity]
+# This plugin wraps the singularity backend. Make sure the settings are
+# appropriate for your cluster.
+exe = ["/usr/bin/singularity"]
+
+# the miniwdl default options contain options to run as a fake root, which
+# is not available on most clusters.
+run_options = [
+        "--containall"
+    ]
+
+# Location of the singularity images (optional). The miniwdl-slurm plugin
+# will set it to a directory inside $PWD. This location must be reachable
+# for the submit nodes.
+image_cache = "$PWD/miniwdl_singularity_cache"
+
+[slurm]
+# extra arguments passed to the srun command (optional). 
+extra_args="--partition compute --comment 'run with miniwdl'" diff --git a/workflows/inputs.json b/workflows/input_template.json similarity index 100% rename from workflows/inputs.json rename to workflows/input_template.json From e1b4074f6e77787c7427fe60485827289f2b5f41 Mon Sep 17 00:00:00 2001 From: Heather Ward Date: Tue, 27 Jun 2023 16:06:19 -0400 Subject: [PATCH 10/62] Update paths for AWS reference files --- backends/aws/inputs.aws.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/aws/inputs.aws.json b/backends/aws/inputs.aws.json index 457ce7a..bd89ebf 100644 --- a/backends/aws/inputs.aws.json +++ b/backends/aws/inputs.aws.json @@ -16,8 +16,8 @@ "de_novo_assembly.reference": { "name": "GRCh38", "fasta": { - "data": "s3://dnastack-resources/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "s3://dnastack-resources/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + "data": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "data_index": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" } }, "de_novo_assembly.backend": "AWS", From c7714fe64b9f9e9a9d6f2c5d241ec34469d6662a Mon Sep 17 00:00:00 2001 From: Heather Ward Date: Tue, 4 Jul 2023 11:28:11 -0400 Subject: [PATCH 11/62] Allow user to override container registry --- README.md | 1 + workflows/main.wdl | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a333a44..c031286 100644 --- a/README.md +++ b/README.md @@ -194,6 +194,7 @@ These files are hosted publicly in each of the cloud backends; see `backends/${b | String? | zones | Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'. |
• [Determining available zones in AWS](backends/aws/README.md#determining-available-zones)<br>• [Determining available zones in GCP](backends/gcp/README.md#determining-available-zones)
| | String? | aws_spot_queue_arn | Queue ARN for the spot batch queue; required if backend is set to 'AWS' and `preemptible` is set to `true` | [Determining the AWS queue ARN](backends/aws/README.md#determining-the-aws-batch-queue-arn) | | String? | aws_on_demand_queue_arn | Queue ARN for the on demand batch queue; required if backend is set to 'AWS' and `preemptible` is set to `false` | [Determining the AWS queue ARN](backends/aws/README.md#determining-the-aws-batch-queue-arn) | +| String? | container_registry | Container registry where workflow images are hosted. If left blank, [PacBio's public Quay.io registry](https://quay.io/organization/pacbio) will be used. | | | Boolean | preemptible | If set to `true`, run tasks preemptibly where possible. On-demand VMs will be used only for tasks that run for >24 hours if the backend is set to GCP. If set to `false`, on-demand VMs will be used for every task. Ignored if backend is set to HPC. | \[true, false\] | # Workflow outputs diff --git a/workflows/main.wdl b/workflows/main.wdl index 03ff494..bafe340 100644 --- a/workflows/main.wdl +++ b/workflows/main.wdl @@ -17,6 +17,7 @@ workflow de_novo_assembly { String? zones String? aws_spot_queue_arn String? aws_on_demand_queue_arn + String? container_registry Boolean preemptible } @@ -26,7 +27,8 @@ workflow de_novo_assembly { backend = backend, zones = zones, aws_spot_queue_arn = aws_spot_queue_arn, - aws_on_demand_queue_arn = aws_on_demand_queue_arn + aws_on_demand_queue_arn = aws_on_demand_queue_arn, + container_registry = container_registry } RuntimeAttributes default_runtime_attributes = if preemptible then backend_configuration.spot_runtime_attributes else backend_configuration.on_demand_runtime_attributes @@ -83,6 +85,7 @@ workflow de_novo_assembly { zones: {help: "Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'"} aws_spot_queue_arn: {help: "Queue ARN for the spot batch queue; required if backend is set to 'AWS'"} aws_on_demand_queue_arn: {help: "Queue ARN for the on demand batch queue; required if backend is set to 'AWS'"} + container_registry: {help: "Container registry where workflow images are hosted. If left blank, PacBio's public Quay.io registry will be used."} preemptible: {help: "Where possible, run tasks preemptibly"} } } From 5d1299b51d393fc8768bb4f51bb23ef950f58855 Mon Sep 17 00:00:00 2001 From: William Rowell Date: Fri, 11 Aug 2023 11:33:52 -0700 Subject: [PATCH 12/62] Avoid underestimating disk size based on first element of array. 
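One note on the fix: per the WDL 1.0 spec, `size()` accepts an `Array[File]` as well as a single `File`, returning the combined size of all elements, so no per-element arithmetic is needed. A sketch using the declaration from the diff below:

```wdl
# removed: sizes only the first FASTA and multiplies by the array length,
# underestimating disk whenever the first file is smaller than the rest:
#   Int disk_size = ceil(size(reads_fastas[0], "GB") * length(reads_fastas) * 2 + 20)
# added: size() over the whole array sums every element:
Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20)
```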
--- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 38431c8..b5c9bcc 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -157,7 +157,7 @@ task yak_count { # Usage up to 140 GB @ 10 threads for Revio samples Int mem_gb = 16 * threads - Int disk_size = ceil(size(reads_fastas[0], "GB") * length(reads_fastas) * 2 + 20) + Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) command <<< set -euo pipefail From cb9f17fa60d0e181e546ac58be04daf33ce2a5cb Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 11 Aug 2023 18:43:55 +0000 Subject: [PATCH 13/62] update wdl-ci config file after successful tests --- wdl-ci.config.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index e29a369..97c0734 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -19,7 +19,7 @@ "tasks": { "bcftools_stats": { "key": "bcftools_stats", - "digest": "zgvmaa6wtldmojg3gixeiah5aak7khb5", + "digest": "hnaklilhvfhzokyogil55ymtufpzugdc", "tests": [ { "inputs": { @@ -49,7 +49,7 @@ "tasks": { "zip_index_vcf": { "key": "zip_index_vcf", - "digest": "yxnm7toivkmsrrs4h3x72wdrtxd2lo72", + "digest": "zglkxnubs7arukywr6dtr2rmlrs4l6si", "tests": [ { "inputs": { @@ -79,7 +79,7 @@ "tasks": { "samtools_fasta": { "key": "samtools_fasta", - "digest": "i2tzlr7ni5gglbe7regxody2bttg35na", + "digest": "fzvpxhpi2a5nyyys7ktoirf4ww2exbe3", "tests": [ { "inputs": { @@ -137,7 +137,7 @@ }, "yak_count": { "key": "yak_count", - "digest": "32uc62rlhwfjfatcb4qfpr44ohg2wdlb", + "digest": "2ovi7jh4btl4sb7xr23ga6mxtd7nlq4s", "tests": [ { "inputs": { From 9c6c96e7d3c9f7295b6dde427f48141a53624801 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Wed, 16 Aug 2023 11:53:14 -0700 Subject: [PATCH 14/62] Calculate total bases input for each parent to set yak params on the fly --- .../de_novo_assembly_trio.wdl | 121 +++++++++++++++++- 1 file changed, 120 insertions(+), 1 deletion(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index b5c9bcc..d223029 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -41,10 +41,28 @@ workflow de_novo_assembly_trio { } } + # For yak, we need to know the total input coverage so we can set cloud memory resources accordingly + scatter (fasta in samtools_fasta_father.reads_fasta) { + call fasta_basecount as fasta_bc_father { + input: + reads_fasta = fasta, + runtime_attributes = default_runtime_attributes + } + } + + call get_total_bp as get_total_bp_father { + input: + sample_id = father.sample_id, + fasta_totals = fasta_bc_father.read_total_bp, + runtime_attributes = default_runtime_attributes + } + call yak_count as yak_count_father { input: sample_id = father.sample_id, reads_fastas = samtools_fasta_father.reads_fasta, + sample_total_bp = get_total_bp_father.sample_total_bp, + runtime_attributes = default_runtime_attributes } @@ -56,10 +74,29 @@ workflow de_novo_assembly_trio { } } + # For yak, we need to know the total input coverage so we can set cloud memory resources accordingly + scatter (fasta in samtools_fasta_mother.reads_fasta) { + call 
fasta_basecount as fasta_bc_mother { + input: + reads_fasta = fasta, + runtime_attributes = default_runtime_attributes + } + } + + call get_total_bp as get_total_bp_mother { + input: + sample_id = mother.sample_id, + fasta_totals = fasta_bc_mother.read_total_bp, + runtime_attributes = default_runtime_attributes + } + + call yak_count as yak_count_mother { input: sample_id = mother.sample_id, reads_fastas = samtools_fasta_mother.reads_fasta, + sample_total_bp = get_total_bp_father.sample_total_bp, + runtime_attributes = default_runtime_attributes } @@ -149,15 +186,18 @@ task yak_count { input { String sample_id Array[File] reads_fastas + Int sample_total_bp RuntimeAttributes runtime_attributes } - Int threads = 10 # Usage up to 140 GB @ 10 threads for Revio samples Int mem_gb = 16 * threads Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) + + # if sample is less than 15X (3.2Gb * 15) use -b37 bloom filter parameter + String yak_options = if sample_total_bp > 48 then "-b37" else "" command <<< set -euo pipefail @@ -165,6 +205,7 @@ task yak_count { yak count \ -t ~{threads} \ -o ~{sample_id}.yak \ + ~{yak_options} \ ~{sep=' ' reads_fastas} >>> @@ -185,3 +226,81 @@ task yak_count { zones: runtime_attributes.zones } } + +task fasta_basecount { + input { + File reads_fasta + String reads_fasta_basename = basename(reads_fasta) + + RuntimeAttributes runtime_attributes + } + + Int threads = 1 + Int mem_gb = 4 * threads + + Int disk_size = ceil(size(reads_fasta, "GB") * 2 + 20) + + command <<< + set -euo pipefail + + grep -v "^>" ~{reads_fasta} | tr -d '\n' | wc -c > ~{reads_fasta_basename}.total + >>> + + output { + File read_total_bp = "~{reads_fasta_basename}.total" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2" + cpu: threads + memory: mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} + +task get_total_bp { + input { + String sample_id + Array[File] fasta_totals + + RuntimeAttributes runtime_attributes + } + + Int threads = 1 + Int mem_gb = 4 * threads + + Int disk_size = ceil(size(fasta_totals[0], "GB") * 2 + 20) + + command <<< + set -euo pipefail + + cat ~{sep=' ' fasta_totals} | awk '{sum+=$1}END{print sum/1000000000}' > ~{sample_id}.total + + >>> + + output { + Int sample_total_bp = round(read_float("~{sample_id}.total")) +# File sample_total_bp = "~{sample_id}.total" + + } + + runtime { + docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2" + cpu: threads + memory: mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } + +} From c9f709291d7b94716555cdbec2b9b5741a989014 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Wed, 16 Aug 2023 11:55:54 -0700 Subject: [PATCH 15/62] less than not greater than --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl 
b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index d223029..2d62021 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -197,7 +197,7 @@ task yak_count { Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) # if sample is less than 15X (3.2Gb * 15) use -b37 bloom filter parameter - String yak_options = if sample_total_bp > 48 then "-b37" else "" + String yak_options = if sample_total_bp < 48 then "-b37" else "" command <<< set -euo pipefail From e873f208e9296cc7ddd2883c851abf19f96838a7 Mon Sep 17 00:00:00 2001 From: Heather Ward Date: Wed, 23 Aug 2023 13:40:02 -0400 Subject: [PATCH 16/62] Document submitting the workflow directly to Cromwell using cURL --- README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/README.md b/README.md index c031286..42da455 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,26 @@ Note that the calls to `miniwdl` and `Cromwell` assume you are accessing the eng `java -jar run workflows/main.wdl -i ` +If Cromwell is running in server mode, the workflow can be submitted using cURL. Fill in the values of CROMWELL_URL and INPUTS_JSON below, then from the root of the repository, run: + +```bash +# The base URL (and port, if applicable) of your Cromwell server +CROMWELL_URL= +# The path to your inputs JSON file +INPUTS_JSON= + +(cd workflows && zip -r dependencies.zip assembly_structs.wdl assemble_genome/ de_novo_assembly_sample/ de_novo_assembly_trio/ wdl-common/) +curl -X "POST" \ + "${CROMWELL_URL}/api/workflows/v1" \ + -H "accept: application/json" \ + -H "Content-Type: multipart/form-data" \ + -F "workflowSource=@workflows/main.wdl" \ + -F "workflowInputs=@${INPUTS_JSON};type=application/json" \ + -F "workflowDependencies=@workflows/dependencies.zip;type=application/zip" +``` + +To specify [workflow options](https://cromwell.readthedocs.io/en/latest/wf_options/Overview/), add the following to the request (assuming your options file is a file called `options.json` located in the `pwd`): `-F "workflowOptions=@options.json;type=application/json"`. + ### Run using Workbench Rather than running a workflow directly using an engine, engines can be configured using [Workbench](https://workbench.dnastack.com/). Workbench presents a unified interface to the respective backends and engines. Workflow runs may be submitted and monitored either [directly in-browser](https://docs.dnastack.com/docs/accessing-the-workbench-gui) or using the command-line interface (CLI) (see below). 
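As a possible follow-on to the Cromwell submission shown earlier in this section (a sketch, not part of this patch set): the POST returns a JSON body containing the workflow ID, which can then be polled against Cromwell's standard REST endpoints. `jq` is assumed to be available for parsing the response.

```bash
# Capture the workflow ID from the submission response,
# which is shaped like {"id": "<uuid>", "status": "Submitted"}
WORKFLOW_ID=$(curl -s -X "POST" \
  "${CROMWELL_URL}/api/workflows/v1" \
  -H "accept: application/json" \
  -H "Content-Type: multipart/form-data" \
  -F "workflowSource=@workflows/main.wdl" \
  -F "workflowInputs=@${INPUTS_JSON};type=application/json" \
  -F "workflowDependencies=@workflows/dependencies.zip;type=application/zip" \
  | jq -r '.id')

# Poll run state (Submitted/Running/Succeeded/Failed), then fetch outputs on success
curl -s "${CROMWELL_URL}/api/workflows/v1/${WORKFLOW_ID}/status"
curl -s "${CROMWELL_URL}/api/workflows/v1/${WORKFLOW_ID}/outputs"
```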
From 902f92605f49fceb755caed7a65a33237f0c3b62 Mon Sep 17 00:00:00 2001 From: Heather Ward Date: Wed, 23 Aug 2023 13:45:48 -0400 Subject: [PATCH 17/62] Add dependencies.zip to gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..13d7372 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +inputs.test_data*.json +.wdltest* +dependencies.zip From 56833d4077f502bcadea5a77b87088cda0f2047e Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Fri, 22 Sep 2023 15:18:47 -0700 Subject: [PATCH 18/62] Adding multi-reference alignment option --- backends/aws/inputs.aws.json | 11 ++-- backends/azure/inputs.azure.json | 11 ++-- backends/gcp/inputs.gcp.json | 11 ++-- backends/hpc/inputs.hpc.json | 11 ++-- wdl-ci.config.json | 2 +- workflows/assemble_genome/assemble_genome.wdl | 37 +++++++----- .../de_novo_assembly_sample.wdl | 58 +++++++++++-------- .../de_novo_assembly_trio.wdl | 9 +-- workflows/input_template.json | 17 +++--- workflows/main.wdl | 15 ++--- 10 files changed, 105 insertions(+), 77 deletions(-) diff --git a/backends/aws/inputs.aws.json b/backends/aws/inputs.aws.json index bd89ebf..44c4a52 100644 --- a/backends/aws/inputs.aws.json +++ b/backends/aws/inputs.aws.json @@ -13,13 +13,14 @@ ], "run_de_novo_assembly_trio": "Boolean" }, - "de_novo_assembly.reference": { - "name": "GRCh38", + "de_novo_assembly.references": [ + { + "name": "String", "fasta": { - "data": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + "data": "File", + "data_index": "File" } - }, + ], "de_novo_assembly.backend": "AWS", "de_novo_assembly.zones": "us-east-2a us-east-2b us-east-2c", "de_novo_assembly.aws_spot_queue_arn": "", diff --git a/backends/azure/inputs.azure.json b/backends/azure/inputs.azure.json index 64a1911..5d63fa6 100644 --- a/backends/azure/inputs.azure.json +++ b/backends/azure/inputs.azure.json @@ -13,13 +13,14 @@ ], "run_de_novo_assembly_trio": "Boolean" }, - "de_novo_assembly.reference": { - "name": "GRCh38", + "de_novo_assembly.references": [ + { + "name": "String", "fasta": { - "data": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + "data": "File", + "data_index": "File" } - }, + ], "de_novo_assembly.backend": "Azure", "de_novo_assembly.preemptible": "Boolean" } diff --git a/backends/gcp/inputs.gcp.json b/backends/gcp/inputs.gcp.json index f4cc3c5..723742a 100644 --- a/backends/gcp/inputs.gcp.json +++ b/backends/gcp/inputs.gcp.json @@ -13,13 +13,14 @@ ], "run_de_novo_assembly_trio": "Boolean" }, - "de_novo_assembly.reference": { - "name": "GRCh38", + "de_novo_assembly.references": [ + { + "name": "String", "fasta": { - "data": "gs:///dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "gs:///dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + "data": "File", + "data_index": "File" } - }, + ], "de_novo_assembly.backend": "GCP", "de_novo_assembly.zones": "String", "de_novo_assembly.preemptible": "Boolean" diff --git a/backends/hpc/inputs.hpc.json b/backends/hpc/inputs.hpc.json index 338e58d..e0979de 100644 --- a/backends/hpc/inputs.hpc.json +++ b/backends/hpc/inputs.hpc.json @@ -13,13 +13,14 @@ ], "run_de_novo_assembly_trio": "Boolean" }, - "de_novo_assembly.reference": { - "name": "GRCh38", + 
"de_novo_assembly.references": [ + { + "name": "String", "fasta": { - "data": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + "data": "File", + "data_index": "File" } - }, + ], "de_novo_assembly.backend": "HPC", "de_novo_assembly.preemptible": false } diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 97c0734..2e5bd10 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -361,4 +361,4 @@ } } } -} \ No newline at end of file +} diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index 96a733d..66acedf 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -9,7 +9,7 @@ workflow assemble_genome { String sample_id Array[File] reads_fastas - ReferenceData reference + Array[ReferenceData] references String? hifiasm_extra_params File? father_yak @@ -38,26 +38,36 @@ workflow assemble_genome { call gfa2fa { input: gfa = gfa, - reference_index = reference.fasta.data_index, - runtime_attributes = default_runtime_attributes + runtime_attributes = default_runtime_attributes } } + + scatter (ref in references) { + call align_hifiasm { + input: + sample_id = sample_id, + query_sequences = gfa2fa.zipped_fasta, + reference = ref.fasta.data, + reference_name = ref.name, + runtime_attributes = default_runtime_attributes + } - call align_hifiasm { - input: - sample_id = sample_id, - query_sequences = gfa2fa.zipped_fasta, - reference = reference.fasta.data, - reference_name = reference.name, - runtime_attributes = default_runtime_attributes + IndexData sample_aligned_bam = { + "data": align_hifiasm.asm_bam, + "data_index": align_hifiasm.asm_bam_index + } + + Pair[ReferenceData,IndexData] align_data = (ref, sample_aligned_bam) } + output { Array[File] assembly_noseq_gfas = hifiasm_assemble.assembly_noseq_gfas Array[File] assembly_lowQ_beds = hifiasm_assemble.assembly_lowQ_beds Array[File] zipped_assembly_fastas = gfa2fa.zipped_fasta Array[File] assembly_stats = gfa2fa.assembly_stats - IndexData asm_bam = {"data": align_hifiasm.asm_bam, "data_index": align_hifiasm.asm_bam_index} + Array[IndexData] asm_bams = sample_aligned_bam + Array[Pair[ReferenceData,IndexData]] alignments = align_data } parameter_meta { @@ -132,8 +142,6 @@ task gfa2fa { input { File gfa - File reference_index - RuntimeAttributes runtime_attributes } @@ -157,11 +165,12 @@ task gfa2fa { # Calculate assembly stats k8 \ /opt/calN50/calN50.js \ - -f ~{reference_index} \ ~{gfa_basename}.fasta.gz \ > ~{gfa_basename}.fasta.stats.txt >>> + + output { File zipped_fasta = "~{gfa_basename}.fasta.gz" File assembly_stats = "~{gfa_basename}.fasta.stats.txt" diff --git a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl index 72f7957..6481d5f 100644 --- a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl +++ b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl @@ -12,7 +12,7 @@ workflow de_novo_assembly_sample { input { Sample sample - ReferenceData reference + Array[ReferenceData] references String backend RuntimeAttributes default_runtime_attributes @@ -31,43 +31,53 @@ workflow de_novo_assembly_sample { input: sample_id = sample.sample_id, reads_fastas = samtools_fasta.reads_fasta, - reference = reference, + references = references, hifiasm_extra_params = "", backend = backend, default_runtime_attributes = default_runtime_attributes, 
on_demand_runtime_attributes = on_demand_runtime_attributes } - call htsbox { - input: - bam = assemble_genome.asm_bam.data, - bam_index = assemble_genome.asm_bam.data_index, - reference = reference.fasta.data, - runtime_attributes = default_runtime_attributes - } + scatter (aln in assemble_genome.alignments) { + ReferenceData ref = aln.left + IndexData bam = aln.right + call htsbox { + input: + bam = bam.data, + bam_index = bam.data_index, + reference = ref.fasta.data, + runtime_attributes = default_runtime_attributes + } - call ZipIndexVcf.zip_index_vcf { - input: - vcf = htsbox.htsbox_vcf, - runtime_attributes = default_runtime_attributes - } + call ZipIndexVcf.zip_index_vcf { + input: + vcf = htsbox.htsbox_vcf, + runtime_attributes = default_runtime_attributes + } - call BcftoolsStats.bcftools_stats { - input: - vcf = zip_index_vcf.zipped_vcf, - params = "--samples ~{basename(assemble_genome.asm_bam.data)}", - reference = reference.fasta.data, - runtime_attributes = default_runtime_attributes - } + IndexData htsbox_vcf = { + "data": zip_index_vcf.zipped_vcf, + "data_index": zip_index_vcf.zipped_vcf_index + } + call BcftoolsStats.bcftools_stats { + input: + vcf = zip_index_vcf.zipped_vcf, + params = "--samples ~{basename(bam.data)}", + reference = ref.fasta.data, + runtime_attributes = default_runtime_attributes + } + + } output { Array[File] assembly_noseq_gfas = assemble_genome.assembly_noseq_gfas Array[File] assembly_lowQ_beds = assemble_genome.assembly_lowQ_beds Array[File] zipped_assembly_fastas = assemble_genome.zipped_assembly_fastas Array[File] assembly_stats = assemble_genome.assembly_stats - IndexData asm_bam = assemble_genome.asm_bam - IndexData htsbox_vcf = {"data": zip_index_vcf.zipped_vcf, "data_index": zip_index_vcf.zipped_vcf_index} - File htsbox_vcf_stats = bcftools_stats.stats + Array[IndexData] asm_bams = assemble_genome.asm_bams + + Array[IndexData] htsbox_vcfs = htsbox_vcf + Array[File] htsbox_vcf_stats = bcftools_stats.stats } parameter_meta { diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 2d62021..b9b6502 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -11,7 +11,7 @@ workflow de_novo_assembly_trio { input { Cohort cohort - ReferenceData reference + Array[ReferenceData] references String backend RuntimeAttributes default_runtime_attributes @@ -121,7 +121,7 @@ workflow de_novo_assembly_trio { input: sample_id = "~{cohort.cohort_id}.~{child.sample_id}", reads_fastas = samtools_fasta_child.reads_fasta, - reference = reference, + references = references, hifiasm_extra_params = "-c1 -d1", father_yak = yak_count_father.yak, mother_yak = yak_count_mother.yak, @@ -138,12 +138,13 @@ workflow de_novo_assembly_trio { Array[Array[File]] assembly_lowQ_beds = flatten(assemble_genome.assembly_lowQ_beds) Array[Array[File]] zipped_assembly_fastas = flatten(assemble_genome.zipped_assembly_fastas) Array[Array[File]] assembly_stats = flatten(assemble_genome.assembly_stats) - Array[IndexData] asm_bams = flatten(assemble_genome.asm_bam) + Array[Array[IndexData]] asm_bams = flatten(assemble_genome.asm_bams) + } parameter_meta { cohort: {help: "Sample information for the cohort"} - reference: {help: "Reference genome data"} + references: {help: "List of reference genome data"} default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} 
on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"} } diff --git a/workflows/input_template.json b/workflows/input_template.json index 148b817..e97ef8d 100644 --- a/workflows/input_template.json +++ b/workflows/input_template.json @@ -13,16 +13,19 @@ ], "run_de_novo_assembly_trio": "Boolean" }, - "de_novo_assembly.reference": { + "de_novo_assembly.references": [ + { "name": "String", "fasta": { "data": "File", "data_index": "File" } - }, - "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", - "de_novo_assembly.zones": "String? (optional); required if backend is set to 'GCP' or 'AWS'", + ], + "de_novo_assembly.zones": "String? (optional); required if backend is set to 'AWS'", "de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'", - "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional); required if backend is set to 'AWS'", - "de_novo_assembly.preemptible": "Boolean" -} \ No newline at end of file + "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional)", + "de_novo_assembly.preemptible": "Boolean", + "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", + "de_novo_assembly.container_registry": "String? (optional)", + } +} diff --git a/workflows/main.wdl b/workflows/main.wdl index bafe340..7be4319 100644 --- a/workflows/main.wdl +++ b/workflows/main.wdl @@ -10,7 +10,7 @@ workflow de_novo_assembly { input { Cohort cohort - ReferenceData reference + Array[ReferenceData] references # Backend configuration String backend @@ -38,7 +38,7 @@ workflow de_novo_assembly { call DeNovoAssemblySample.de_novo_assembly_sample { input: sample = sample, - reference = reference, + references = references, backend = backend, default_runtime_attributes = default_runtime_attributes, on_demand_runtime_attributes = backend_configuration.on_demand_runtime_attributes @@ -51,7 +51,7 @@ workflow de_novo_assembly { call DeNovoAssemblyTrio.de_novo_assembly_trio { input: cohort = cohort, - reference = reference, + references = references, backend = backend, default_runtime_attributes = default_runtime_attributes, on_demand_runtime_attributes = backend_configuration.on_demand_runtime_attributes @@ -65,9 +65,9 @@ workflow de_novo_assembly { Array[Array[File]?] assembly_lowQ_beds = de_novo_assembly_sample.assembly_lowQ_beds Array[Array[File]?] zipped_assembly_fastas = de_novo_assembly_sample.zipped_assembly_fastas Array[Array[File]?] assembly_stats = de_novo_assembly_sample.assembly_stats - Array[IndexData?] asm_bam = de_novo_assembly_sample.asm_bam - Array[IndexData?] htsbox_vcf = de_novo_assembly_sample.htsbox_vcf - Array[File?] htsbox_vcf_stats = de_novo_assembly_sample.htsbox_vcf_stats + Array[Array[IndexData]?] asm_bam = de_novo_assembly_sample.asm_bams + Array[Array[IndexData]?] htsbox_vcf = de_novo_assembly_sample.htsbox_vcfs + Array[Array[File]?] htsbox_vcf_stats = de_novo_assembly_sample.htsbox_vcf_stats # de_novo_assembly_trio output Array[Map[String, String]]? haplotype_key = de_novo_assembly_trio.haplotype_key @@ -75,7 +75,8 @@ workflow de_novo_assembly { Array[Array[File]]? trio_assembly_lowQ_beds = de_novo_assembly_trio.assembly_lowQ_beds Array[Array[File]]? trio_zipped_assembly_fastas = de_novo_assembly_trio.zipped_assembly_fastas Array[Array[File]]? trio_assembly_stats = de_novo_assembly_trio.assembly_stats - Array[IndexData]? trio_asm_bams = de_novo_assembly_trio.asm_bams +## Array[IndexData]? 
trio_asm_bams = de_novo_assembly_trio.asm_bams + Array[Array[IndexData]]? trio_asm_bams = de_novo_assembly_trio.asm_bams } parameter_meta { From a40d2a2ca8cb01b77fb6072a95aebd6fff59842c Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Fri, 22 Sep 2023 15:48:11 -0700 Subject: [PATCH 19/62] add yak bloom filter condition --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index b9b6502..fcade9e 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -197,8 +197,8 @@ task yak_count { Int mem_gb = 16 * threads Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) - # if sample is less than 15X (3.2Gb * 15) use -b37 bloom filter parameter - String yak_options = if sample_total_bp < 48 then "-b37" else "" + # Use bloom filter (-b37) to conserve on resources unless input coverage is low (<15X) + String yak_options = if sample_total_bp < 48000000000 then "" else "-b37" command <<< set -euo pipefail @@ -287,7 +287,6 @@ task get_total_bp { output { Int sample_total_bp = round(read_float("~{sample_id}.total")) -# File sample_total_bp = "~{sample_id}.total" } From fcf9ffe23d40eeaf6dfc8d07a4f6b8c02688257d Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Fri, 22 Sep 2023 16:11:34 -0700 Subject: [PATCH 20/62] fix coverage --- wdl-ci.config.json | 162 +++++++++++++++++- .../de_novo_assembly_trio.wdl | 21 +-- 2 files changed, 172 insertions(+), 11 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 2e5bd10..f90b9a4 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -163,6 +163,16 @@ } } ] + }, + "fasta_basecount": { + "key": "fasta_basecount", + "digest": "", + "tests": [] + }, + "get_total_gbp": { + "key": "get_total_gbp", + "digest": "", + "tests": [] } } }, @@ -325,6 +335,156 @@ "name": "", "description": "", "tasks": {} + }, + "workflows/wdl-common/wdl/tasks/glnexus.wdl": { + "key": "workflows/wdl-common/wdl/tasks/glnexus.wdl", + "name": "", + "description": "", + "tasks": { + "glnexus": { + "key": "glnexus", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/mosdepth.wdl": { + "key": "workflows/wdl-common/wdl/tasks/mosdepth.wdl", + "name": "", + "description": "", + "tasks": { + "mosdepth": { + "key": "mosdepth", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/pbsv_call.wdl": { + "key": "workflows/wdl-common/wdl/tasks/pbsv_call.wdl", + "name": "", + "description": "", + "tasks": { + "pbsv_call": { + "key": "pbsv_call", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/pbsv_discover.wdl": { + "key": "workflows/wdl-common/wdl/tasks/pbsv_discover.wdl", + "name": "", + "description": "", + "tasks": { + "pbsv_discover": { + "key": "pbsv_discover", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/pharmcat.wdl": { + "key": "workflows/wdl-common/wdl/tasks/pharmcat.wdl", + "name": "", + "description": "", + "tasks": { + "pangu_cyp2d6": { + "key": "pangu_cyp2d6", + "digest": "", + "tests": [] + }, + "pharmcat_preprocess": { + "key": "pharmcat_preprocess", + "digest": "", + "tests": [] + }, + "filter_preprocessed_vcf": { + "key": "filter_preprocessed_vcf", + "digest": "", + "tests": [] + }, + "run_pharmcat": { + "key": "run_pharmcat", + "digest": "", + "tests": [] + } + } + }, + 
"workflows/wdl-common/wdl/tasks/whatshap_haplotag.wdl": { + "key": "workflows/wdl-common/wdl/tasks/whatshap_haplotag.wdl", + "name": "", + "description": "", + "tasks": { + "whatshap_haplotag": { + "key": "whatshap_haplotag", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/whatshap_phase.wdl": { + "key": "workflows/wdl-common/wdl/tasks/whatshap_phase.wdl", + "name": "", + "description": "", + "tasks": { + "whatshap_phase": { + "key": "whatshap_phase", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/whatshap_stats.wdl": { + "key": "workflows/wdl-common/wdl/tasks/whatshap_stats.wdl", + "name": "", + "description": "", + "tasks": { + "whatshap_stats": { + "key": "whatshap_stats", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/workflows/deepvariant/deepvariant.wdl": { + "key": "workflows/wdl-common/wdl/workflows/deepvariant/deepvariant.wdl", + "name": "", + "description": "", + "tasks": { + "deepvariant_make_examples": { + "key": "deepvariant_make_examples", + "digest": "", + "tests": [] + }, + "deepvariant_call_variants": { + "key": "deepvariant_call_variants", + "digest": "", + "tests": [] + }, + "deepvariant_postprocess_variants": { + "key": "deepvariant_postprocess_variants", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/workflows/phase_vcf/phase_vcf.wdl": { + "key": "workflows/wdl-common/wdl/workflows/phase_vcf/phase_vcf.wdl", + "name": "", + "description": "", + "tasks": { + "split_vcf": { + "key": "split_vcf", + "digest": "", + "tests": [] + }, + "bcftools_concat": { + "key": "bcftools_concat", + "digest": "", + "tests": [] + } + } } }, "engines": { @@ -361,4 +521,4 @@ } } } -} +} \ No newline at end of file diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index fcade9e..15a1807 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -50,7 +50,7 @@ workflow de_novo_assembly_trio { } } - call get_total_bp as get_total_bp_father { + call get_total_gbp as get_total_bp_father { input: sample_id = father.sample_id, fasta_totals = fasta_bc_father.read_total_bp, @@ -61,7 +61,7 @@ workflow de_novo_assembly_trio { input: sample_id = father.sample_id, reads_fastas = samtools_fasta_father.reads_fasta, - sample_total_bp = get_total_bp_father.sample_total_bp, + sample_total_gbp = get_total_bp_father.sample_total_gbp, runtime_attributes = default_runtime_attributes } @@ -83,7 +83,7 @@ workflow de_novo_assembly_trio { } } - call get_total_bp as get_total_bp_mother { + call get_total_gbp as get_total_bp_mother { input: sample_id = mother.sample_id, fasta_totals = fasta_bc_mother.read_total_bp, @@ -95,7 +95,7 @@ workflow de_novo_assembly_trio { input: sample_id = mother.sample_id, reads_fastas = samtools_fasta_mother.reads_fasta, - sample_total_bp = get_total_bp_father.sample_total_bp, + sample_total_gbp = get_total_bp_father.sample_total_gbp, runtime_attributes = default_runtime_attributes } @@ -187,7 +187,7 @@ task yak_count { input { String sample_id Array[File] reads_fastas - Int sample_total_bp + Int sample_total_gbp RuntimeAttributes runtime_attributes } @@ -197,8 +197,9 @@ task yak_count { Int mem_gb = 16 * threads Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) - # Use bloom filter (-b37) to conserve on resources unless input coverage is low (<15X) - String yak_options = if sample_total_bp < 48000000000 then "" else 
"-b37" + # Use bloom filter (-b37) to conserve resources unless input coverage + # is low ( <15X; (3.2Gb*15=48)) + String yak_options = if sample_total_gbp < 48 then "" else "-b37" command <<< set -euo pipefail @@ -265,7 +266,7 @@ task fasta_basecount { } } -task get_total_bp { +task get_total_gbp { input { String sample_id Array[File] fasta_totals @@ -286,8 +287,8 @@ task get_total_bp { >>> output { - Int sample_total_bp = round(read_float("~{sample_id}.total")) - + Int sample_total_gbp = round(read_float("~{sample_id}.total")) + #Int sample_total_cov = round(sample_total_bp / 3200000000) } runtime { From d46292426d939977eef43ad8f78b9ac47f1c715e Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Mon, 25 Sep 2023 11:03:44 -0700 Subject: [PATCH 21/62] determine yak settings for both parents rather than independently --- wdl-ci.config.json | 5 ++ .../de_novo_assembly_trio.wdl | 61 ++++++++++++++----- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index f90b9a4..59817e7 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -173,6 +173,11 @@ "key": "get_total_gbp", "digest": "", "tests": [] + }, + "determine_yak_options": { + "key": "determine_yak_options", + "digest": "", + "tests": [] } } }, diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 15a1807..0b3c178 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -50,22 +50,13 @@ workflow de_novo_assembly_trio { } } - call get_total_gbp as get_total_bp_father { + call get_total_gbp as get_total_gbp_father { input: sample_id = father.sample_id, fasta_totals = fasta_bc_father.read_total_bp, runtime_attributes = default_runtime_attributes } - call yak_count as yak_count_father { - input: - sample_id = father.sample_id, - reads_fastas = samtools_fasta_father.reads_fasta, - sample_total_gbp = get_total_bp_father.sample_total_gbp, - - runtime_attributes = default_runtime_attributes - } - scatter (movie_bam in mother.movie_bams) { call SamtoolsFasta.samtools_fasta as samtools_fasta_mother { input: @@ -83,19 +74,35 @@ workflow de_novo_assembly_trio { } } - call get_total_gbp as get_total_bp_mother { + call get_total_gbp as get_total_gbp_mother { input: sample_id = mother.sample_id, fasta_totals = fasta_bc_mother.read_total_bp, runtime_attributes = default_runtime_attributes } + call determine_yak_options { + input: + father_total_gbp = get_total_gbp_father.sample_total_gbp, + mother_total_gbp = get_total_gbp_mother.sample_total_gbp, + } + + call yak_count as yak_count_father { + input: + sample_id = father.sample_id, + reads_fastas = samtools_fasta_father.reads_fasta, + yak_options = determine_yak_options.yak_options, +# sample_total_gbp = get_total_gbp_father.sample_total_gbp, + + runtime_attributes = default_runtime_attributes + } call yak_count as yak_count_mother { input: sample_id = mother.sample_id, reads_fastas = samtools_fasta_mother.reads_fasta, - sample_total_gbp = get_total_bp_father.sample_total_gbp, + yak_options = determine_yak_options.yak_options, +# sample_total_gbp = get_total_gbp_mother.sample_total_gbp, runtime_attributes = default_runtime_attributes } @@ -183,11 +190,32 @@ task parse_families { } } +task determine_yak_options { + input { + Int mother_total_gbp + Int father_total_gbp + } + + command { + set -e + if [ ~{father_total_gbp} -lt 48 ] && [ ~{mother_total_gbp} -lt 48 ]; then + options="" + else + 
options="-b37" + fi + echo $options + } + output { + String yak_options = read_string(stdout()) + } +} + task yak_count { input { String sample_id Array[File] reads_fastas - Int sample_total_gbp + #Int sample_total_gbp + String yak_options RuntimeAttributes runtime_attributes } @@ -199,7 +227,7 @@ task yak_count { # Use bloom filter (-b37) to conserve resources unless input coverage # is low ( <15X; (3.2Gb*15=48)) - String yak_options = if sample_total_gbp < 48 then "" else "-b37" + #String yak_options = if sample_total_gbp < 48 then "" else "-b37" command <<< set -euo pipefail @@ -282,7 +310,7 @@ task get_total_gbp { command <<< set -euo pipefail - cat ~{sep=' ' fasta_totals} | awk '{sum+=$1}END{print sum/1000000000}' > ~{sample_id}.total + awk '{sum+=$1}END{print sum/1000000000}' ~{sep=' ' fasta_totals} > ~{sample_id}.total >>> @@ -305,3 +333,6 @@ task get_total_gbp { } } + +# cat ~{sep=' ' fasta_totals} | awk '{sum+=$1}END{print sum/1000000000}' > ~{sample_id}.total + From af88a08171ce731758aac2b921dcd1d95aec822a Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Mon, 25 Sep 2023 11:29:33 -0700 Subject: [PATCH 22/62] fix tests and remove some debug comments I missed --- wdl-ci.config.json | 1 + .../de_novo_assembly_trio/de_novo_assembly_trio.wdl | 13 ------------- workflows/main.wdl | 1 - 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 59817e7..86c24dd 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -150,6 +150,7 @@ "${resources_file_path}/m64017_200108_232219.hifi_reads.fasta", "${resources_file_path}/m64017_200112_090459.hifi_reads.fasta" ], + "yak_options": "-b37", "runtime_attributes": "${default_runtime_attributes}" }, "output_tests": { diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 0b3c178..0f155ef 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -92,8 +92,6 @@ workflow de_novo_assembly_trio { sample_id = father.sample_id, reads_fastas = samtools_fasta_father.reads_fasta, yak_options = determine_yak_options.yak_options, -# sample_total_gbp = get_total_gbp_father.sample_total_gbp, - runtime_attributes = default_runtime_attributes } @@ -102,8 +100,6 @@ workflow de_novo_assembly_trio { sample_id = mother.sample_id, reads_fastas = samtools_fasta_mother.reads_fasta, yak_options = determine_yak_options.yak_options, -# sample_total_gbp = get_total_gbp_mother.sample_total_gbp, - runtime_attributes = default_runtime_attributes } @@ -214,7 +210,6 @@ task yak_count { input { String sample_id Array[File] reads_fastas - #Int sample_total_gbp String yak_options RuntimeAttributes runtime_attributes @@ -225,10 +220,6 @@ task yak_count { Int mem_gb = 16 * threads Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) - # Use bloom filter (-b37) to conserve resources unless input coverage - # is low ( <15X; (3.2Gb*15=48)) - #String yak_options = if sample_total_gbp < 48 then "" else "-b37" - command <<< set -euo pipefail @@ -316,7 +307,6 @@ task get_total_gbp { output { Int sample_total_gbp = round(read_float("~{sample_id}.total")) - #Int sample_total_cov = round(sample_total_bp / 3200000000) } runtime { @@ -333,6 +323,3 @@ task get_total_gbp { } } - -# cat ~{sep=' ' fasta_totals} | awk '{sum+=$1}END{print sum/1000000000}' > ~{sample_id}.total - diff --git a/workflows/main.wdl b/workflows/main.wdl index 7be4319..7647f34 100644 --- a/workflows/main.wdl 
+++ b/workflows/main.wdl @@ -75,7 +75,6 @@ workflow de_novo_assembly { Array[Array[File]]? trio_assembly_lowQ_beds = de_novo_assembly_trio.assembly_lowQ_beds Array[Array[File]]? trio_zipped_assembly_fastas = de_novo_assembly_trio.zipped_assembly_fastas Array[Array[File]]? trio_assembly_stats = de_novo_assembly_trio.assembly_stats -## Array[IndexData]? trio_asm_bams = de_novo_assembly_trio.asm_bams Array[Array[IndexData]]? trio_asm_bams = de_novo_assembly_trio.asm_bams } From 137d6a94b002236f627619651cd2092d500d927a Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 25 Sep 2023 18:43:32 +0000 Subject: [PATCH 23/62] update wdl-ci config file after successful tests --- wdl-ci.config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 86c24dd..fffe292 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -246,7 +246,7 @@ }, "gfa2fa": { "key": "gfa2fa", - "digest": "liyb2m4cbkovxctcgaxwunqkn5az77ev", + "digest": "es7l5kyje3fiy5vxjnnsqg4fw6sitmdo", "tests": [ { "inputs": { From 6c0232749f49dc4451c67a1911b2b5166e958b51 Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 25 Sep 2023 21:26:56 +0000 Subject: [PATCH 24/62] update wdl-ci config file after successful tests --- wdl-ci.config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index fffe292..b2e5d4f 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -137,7 +137,7 @@ }, "yak_count": { "key": "yak_count", - "digest": "2ovi7jh4btl4sb7xr23ga6mxtd7nlq4s", + "digest": "qysjdjudeldfcf6pm2unping3zkh4qve", "tests": [ { "inputs": { From 49acf5ccc5c2c1b42d8827b690a6a4e98ebcdecf Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 11:10:47 -0700 Subject: [PATCH 25/62] Take a stab at estimating depth based on filesize. - updated parameter_meta - updated inputs.json - cleaned up some whitespace - added comments - using fasta filesize to estimate depth rather than a separate task; based on Greg's experiments, an uncompressed 10x FASTA is ~60GB --- workflows/assemble_genome/assemble_genome.wdl | 11 +- .../de_novo_assembly_sample.wdl | 3 +- .../de_novo_assembly_trio.wdl | 164 +++--------------- workflows/input_template.json | 58 +++---- 4 files changed, 57 insertions(+), 179 deletions(-) diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index 66acedf..1e6807f 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -73,7 +73,7 @@ workflow assemble_genome { parameter_meta { sample_id: {help: "Sample ID; used for naming files"} reads_fastas: {help: "Reads in fasta format to be used for assembly; one for each movie bam to be used in assembly. 
Reads fastas from one or more sample may be combined to use in the assembly"} - reference: {help: "Reference genome data"} + references: {help: "Array of Reference genomes data"} hiiasm_extra_params: {help: "[OPTIONAL] Additional parameters to pass to hifiasm assembly"} father_yak: {help: "[OPTIONAL] kmer counts for the father; required if running trio-based assembly"} mother_yak: {help: "[OPTIONAL] kmer counts for the mother; required if running trio-based assembly"} @@ -98,7 +98,7 @@ task hifiasm_assemble { String prefix = "~{sample_id}.asm" Int threads = 48 Int mem_gb = threads * 6 - Int disk_size = ceil((size(reads_fastas[0], "GB") * length(reads_fastas)) * 4 + 20) + Int disk_size = ceil(size(reads_fastas, "GB") * 4 + 20) command <<< set -euo pipefail @@ -202,7 +202,8 @@ task align_hifiasm { } Int threads = 16 - Int disk_size = ceil((size(query_sequences[0], "GB") * length(query_sequences) + size(reference, "GB")) * 2 + 20) + Int mem_gb = threads * 8 + Int disk_size = ceil((size(query_sequences, "GB") + size(reference, "GB")) * 2 + 20) command <<< set -euo pipefail @@ -218,7 +219,7 @@ task align_hifiasm { ~{reference} \ ~{sep=' ' query_sequences} \ | samtools sort \ - -@ 4 \ + -@ 3 \ -T ./TMP \ -m 8G \ -O BAM \ @@ -235,7 +236,7 @@ task align_hifiasm { runtime { docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:3968cb152a65163005ffed46297127536701ec5af4c44e8f3e7051f7b01f80fe" cpu: threads - memory: "128 GB" + memory: mem_gb + " GB" disk: disk_size + " GB" disks: "local-disk " + disk_size + " HDD" preemptible: runtime_attributes.preemptible_tries diff --git a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl index 6481d5f..43354fc 100644 --- a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl +++ b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl @@ -32,7 +32,6 @@ workflow de_novo_assembly_sample { sample_id = sample.sample_id, reads_fastas = samtools_fasta.reads_fasta, references = references, - hifiasm_extra_params = "", backend = backend, default_runtime_attributes = default_runtime_attributes, on_demand_runtime_attributes = on_demand_runtime_attributes @@ -82,7 +81,7 @@ workflow de_novo_assembly_sample { parameter_meta { sample: {help: "Sample information and associated data files"} - reference: {help: "Reference genome data"} + references: {help: "Array of Reference genomes data"} default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"} } diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 0f155ef..f06513c 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -41,22 +41,6 @@ workflow de_novo_assembly_trio { } } - # For yak, we need to know the total input coverage so we can set cloud memory resources accordingly - scatter (fasta in samtools_fasta_father.reads_fasta) { - call fasta_basecount as fasta_bc_father { - input: - reads_fasta = fasta, - runtime_attributes = default_runtime_attributes - } - } - - call get_total_gbp as get_total_gbp_father { - input: - sample_id = father.sample_id, - fasta_totals = fasta_bc_father.read_total_bp, - runtime_attributes = default_runtime_attributes - } - scatter (movie_bam in mother.movie_bams) { 
call SamtoolsFasta.samtools_fasta as samtools_fasta_mother { input: @@ -65,33 +49,23 @@ workflow de_novo_assembly_trio { } } - # For yak, we need to know the total input coverage so we can set cloud memory resources accordingly - scatter (fasta in samtools_fasta_mother.reads_fasta) { - call fasta_basecount as fasta_bc_mother { - input: - reads_fasta = fasta, - runtime_attributes = default_runtime_attributes - } - } - - call get_total_gbp as get_total_gbp_mother { - input: - sample_id = mother.sample_id, - fasta_totals = fasta_bc_mother.read_total_bp, - runtime_attributes = default_runtime_attributes - } + # if parental coverage is low (<15x), keep singleton kmers from parents and use them to bin child reads + # if parental coverage is high (>=15x), use bloom filter and require that a kmer occur >= 5 times in + # one parent and <2 times in the other parent to be used for binning + # 60GB uncompressed FASTA ~= 10x coverage + # memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage) + Boolean bloom_filter = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false - call determine_yak_options { - input: - father_total_gbp = get_total_gbp_father.sample_total_gbp, - mother_total_gbp = get_total_gbp_mother.sample_total_gbp, - } + String yak_params = if (bloom_filter) then "-b37" else "" + Int yak_mem_gb = if (bloom_filter) then 50 else 70 + String hifiasm_extra_params = if (bloom_filter) then "" else "-c1 -d1" call yak_count as yak_count_father { input: sample_id = father.sample_id, reads_fastas = samtools_fasta_father.reads_fasta, - yak_options = determine_yak_options.yak_options, + yak_params = yak_params, + mem_gb = yak_mem_gb, runtime_attributes = default_runtime_attributes } @@ -99,7 +73,8 @@ workflow de_novo_assembly_trio { input: sample_id = mother.sample_id, reads_fastas = samtools_fasta_mother.reads_fasta, - yak_options = determine_yak_options.yak_options, + yak_params = yak_params, + mem_gb = yak_mem_gb, runtime_attributes = default_runtime_attributes } @@ -125,7 +100,7 @@ workflow de_novo_assembly_trio { sample_id = "~{cohort.cohort_id}.~{child.sample_id}", reads_fastas = samtools_fasta_child.reads_fasta, references = references, - hifiasm_extra_params = "-c1 -d1", + hifiasm_extra_params = hifiasm_extra_params, father_yak = yak_count_father.yak, mother_yak = yak_count_mother.yak, backend = backend, @@ -142,12 +117,11 @@ workflow de_novo_assembly_trio { Array[Array[File]] zipped_assembly_fastas = flatten(assemble_genome.zipped_assembly_fastas) Array[Array[File]] assembly_stats = flatten(assemble_genome.assembly_stats) Array[Array[IndexData]] asm_bams = flatten(assemble_genome.asm_bams) - } parameter_meta { cohort: {help: "Sample information for the cohort"} - references: {help: "List of reference genome data"} + references: {help: "Array of Reference genomes data"} default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"} } @@ -186,47 +160,27 @@ task parse_families { } } -task determine_yak_options { - input { - Int mother_total_gbp - Int father_total_gbp - } - - command { - set -e - if [ ~{father_total_gbp} -lt 48 ] && [ ~{mother_total_gbp} -lt 48 ]; then - options="" - else - options="-b37" - fi - echo $options - } - output { - String yak_options = read_string(stdout()) - } -} - task 
yak_count { input { String sample_id Array[File] reads_fastas - String yak_options + + String yak_params + String mem_gb RuntimeAttributes runtime_attributes } - Int threads = 10 - # Usage up to 140 GB @ 10 threads for Revio samples - Int mem_gb = 16 * threads + Int threads = 24 Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) - + command <<< set -euo pipefail yak count \ -t ~{threads} \ -o ~{sample_id}.yak \ - ~{yak_options} \ + ~{yak_params} \ ~{sep=' ' reads_fastas} >>> @@ -247,79 +201,3 @@ task yak_count { zones: runtime_attributes.zones } } - -task fasta_basecount { - input { - File reads_fasta - String reads_fasta_basename = basename(reads_fasta) - - RuntimeAttributes runtime_attributes - } - - Int threads = 1 - Int mem_gb = 4 * threads - - Int disk_size = ceil(size(reads_fasta, "GB") * 2 + 20) - - command <<< - set -euo pipefail - - grep -v "^>" ~{reads_fasta} | tr -d '\n' | wc -c > ~{reads_fasta_basename}.total - >>> - - output { - File read_total_bp = "~{reads_fasta_basename}.total" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2" - cpu: threads - memory: mem_gb + " GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - -task get_total_gbp { - input { - String sample_id - Array[File] fasta_totals - - RuntimeAttributes runtime_attributes - } - - Int threads = 1 - Int mem_gb = 4 * threads - - Int disk_size = ceil(size(fasta_totals[0], "GB") * 2 + 20) - - command <<< - set -euo pipefail - - awk '{sum+=$1}END{print sum/1000000000}' ~{sep=' ' fasta_totals} > ~{sample_id}.total - - >>> - - output { - Int sample_total_gbp = round(read_float("~{sample_id}.total")) - } - - runtime { - docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2" - cpu: threads - memory: mem_gb + " GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } - -} diff --git a/workflows/input_template.json b/workflows/input_template.json index e97ef8d..64e5d62 100644 --- a/workflows/input_template.json +++ b/workflows/input_template.json @@ -1,31 +1,31 @@ { - "de_novo_assembly.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": "Array[File]", - "sex": "String?", - "father_id": "String?", - "mother_id": "String?", - "run_de_novo_assembly": "Boolean" - } + "de_novo_assembly.cohort": { + "cohort_id": "String", + "samples": [ + { + "sample_id": "String", + "movie_bams": "Array[File]", + "sex": "String?", + "father_id": "String?", + "mother_id": "String?", + "run_de_novo_assembly": "Boolean" + } + ], + "run_de_novo_assembly_trio": "Boolean" + }, + "de_novo_assembly.references": [ + { + "name": "String", + "fasta": { + "data": "File", + "data_index": "File" + } + } ], - "run_de_novo_assembly_trio": "Boolean" - }, - "de_novo_assembly.references": [ - { - "name": "String", - "fasta": { - "data": "File", - "data_index": "File" - } - ], - "de_novo_assembly.zones": "String? 
(optional); required if backend is set to 'AWS'", - "de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'", - "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional)", - "de_novo_assembly.preemptible": "Boolean", - "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", - "de_novo_assembly.container_registry": "String? (optional)", - } -} + "de_novo_assembly.zones": "String? (optional); required if backend is set to 'AWS'", + "de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'", + "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional)", + "de_novo_assembly.preemptible": "Boolean", + "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", + "de_novo_assembly.container_registry": "String? (optional)" +} \ No newline at end of file From f24e1e6e95ab4329744252a4dcd6b98d3a3aae80 Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 11:14:50 -0700 Subject: [PATCH 26/62] changed flag name and fixed my flipped logic --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index f06513c..d3aa695 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -54,11 +54,11 @@ workflow de_novo_assembly_trio { # one parent and <2 times in the other parent to be used for binning # 60GB uncompressed FASTA ~= 10x coverage # memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage) - Boolean bloom_filter = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false + Boolean low_depth = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false - String yak_params = if (bloom_filter) then "-b37" else "" - Int yak_mem_gb = if (bloom_filter) then 50 else 70 - String hifiasm_extra_params = if (bloom_filter) then "" else "-c1 -d1" + String yak_params = if (low_depth) then "" else "-b37" + Int yak_mem_gb = if (low_depth) then 70 else 50 + String hifiasm_extra_params = if (low_depth) then "-c1 -d1" else "" call yak_count as yak_count_father { input: From 42d0d847d5c06f4ea16fbc8d9890f29a408ef45a Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 11:23:58 -0700 Subject: [PATCH 27/62] Adjusted tests. 
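For context on the depth heuristic adjusted in the patches above: the thresholds line up with the rules of thumb quoted in these diffs (~60 GB of uncompressed FASTA ≈ 10x human coverage, and 3.2 Gb × 15x = 48 Gbp). A rough sketch of the arithmetic behind the 90 GB `low_depth` cutoff, assuming those rules of thumb hold:

```bash
# Estimate coverage from uncompressed FASTA size, assuming ~60 GB ≈ 10x
# (per the diff comments; not robust to large changes in mean read length)
fasta_gb=90                          # the low_depth size cutoff used in the workflow
coverage=$(( fasta_gb * 10 / 60 ))   # 90 GB -> ~15x
total_gbp=$(( coverage * 32 / 10 ))  # ~15x over a ~3.2 Gb genome -> ~48 Gbp
echo "~${coverage}x (~${total_gbp} Gbp)"  # consistent with the earlier sample_total_gbp < 48 cutoff
```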
--- wdl-ci.config.json | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index b2e5d4f..792f307 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -150,7 +150,8 @@ "${resources_file_path}/m64017_200108_232219.hifi_reads.fasta", "${resources_file_path}/m64017_200112_090459.hifi_reads.fasta" ], - "yak_options": "-b37", + "yak_params": "-b37", + "mem_gb": 70, "runtime_attributes": "${default_runtime_attributes}" }, "output_tests": { @@ -164,21 +165,6 @@ } } ] - }, - "fasta_basecount": { - "key": "fasta_basecount", - "digest": "", - "tests": [] - }, - "get_total_gbp": { - "key": "get_total_gbp", - "digest": "", - "tests": [] - }, - "determine_yak_options": { - "key": "determine_yak_options", - "digest": "", - "tests": [] } } }, From c8a9b8d9d3a628d10e3355608acabb355cf78225 Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 11:31:20 -0700 Subject: [PATCH 28/62] Memory is an int. --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index d3aa695..2d93adb 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -166,7 +166,7 @@ task yak_count { Array[File] reads_fastas String yak_params - String mem_gb + Int mem_gb RuntimeAttributes runtime_attributes } From 42735154d3c3c8736697dbb89001c4ced04d61f0 Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 28 Sep 2023 18:51:21 +0000 Subject: [PATCH 29/62] update wdl-ci config file after successful tests --- wdl-ci.config.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 792f307..081474a 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -137,7 +137,7 @@ }, "yak_count": { "key": "yak_count", - "digest": "qysjdjudeldfcf6pm2unping3zkh4qve", + "digest": "6hlh6n3b3cqohtmjweg57of626he4c4v", "tests": [ { "inputs": { @@ -175,7 +175,7 @@ "tasks": { "hifiasm_assemble": { "key": "hifiasm_assemble", - "digest": "r4ikydzmdaed4hzsmc3t7efh6mz5e4mx", + "digest": "vhkzwee3f754jcjksog22uyps3j6myow", "tests": [ { "inputs": { @@ -264,7 +264,7 @@ }, "align_hifiasm": { "key": "align_hifiasm", - "digest": "77gs34t4c2i6epsg2epukfoaign2fmnt", + "digest": "4qf5jeepfn3jv3g2socql6xh7vmd4b7s", "tests": [ { "inputs": { From 2dc27601fd4183bcbead838d8359c5db9af59668 Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 12:16:53 -0700 Subject: [PATCH 30/62] added warning comment about estimating depth by filesize --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 2d93adb..5db4c29 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -52,7 +52,7 @@ workflow de_novo_assembly_trio { # if parental coverage is low (<15x), keep singleton kmers from parents and use them to bin child reads # if parental coverage is high (>=15x), use bloom filter and require that a kmer occur >= 5 times in # one parent and <2 times in the other parent to be used for binning - # 60GB uncompressed FASTA ~= 
10x coverage + # 60GB uncompressed FASTA ~= 10x coverage (this is not robust to big changes in mean read length) # memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage) Boolean low_depth = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false From ebe6afeb0d5ec2409d6b31aaf5aa838976672bdc Mon Sep 17 00:00:00 2001 From: William Rowell Date: Fri, 29 Sep 2023 13:35:14 -0700 Subject: [PATCH 31/62] Explicitly pass default `yak count` and `hifiasm` params. --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 5db4c29..aa07ffc 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -56,9 +56,9 @@ workflow de_novo_assembly_trio { # memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage) Boolean low_depth = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false - String yak_params = if (low_depth) then "" else "-b37" + String yak_params = if (low_depth) then "-b0" else "-b37" Int yak_mem_gb = if (low_depth) then 70 else 50 - String hifiasm_extra_params = if (low_depth) then "-c1 -d1" else "" + String hifiasm_extra_params = if (low_depth) then "-c1 -d1" else "-c2 -d5" call yak_count as yak_count_father { input: From 05312fe686c22bee0c2106d3ab5087d1b7480079 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Thu, 26 Oct 2023 11:02:26 -0700 Subject: [PATCH 32/62] update README to reflect array of reference inputs as well as arrays of aligned bam outputs --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c031286..bb97dbf 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,7 @@ These files are hosted publicly in each of the cloud backends; see `backends/${b | Type | Name | Description | Notes | | :- | :- | :- | :- | | String | name | Reference name; used to name outputs (e.g., "GRCh38") | | -| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl) | fasta | Reference genome and index | | +| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | fastas | Reference genomes and associatedindex | | ## Other inputs @@ -209,9 +209,9 @@ These files will be output if `cohort.samples[sample]` is set to `true` for any | Array[Array[File]?] | assembly_noseq_gfas | Assembly graphs in [GFA format](https://github.com/chhylp123/hifiasm/blob/master/docs/source/interpreting-output.rst). | | | Array[Array[File]?] | assembly_lowQ_beds | Coordinates of low quality regions in BED format. | | | Array[Array[File]?] | assembly_stats | Assembly size and NG50 stats generated by [calN50](https://github.com/lh3/calN50). | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?] | asm_bam | [minimap2](https://github.com/lh3/minimap2) alignment of assembly to reference. | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?] 
| htsbox_vcf | Naive pileup variant calling of assembly against reference with [`htsbox`](https://github.com/lh3/htsbox) | | -| Array[File?] | htsbox_vcf_stats | [`bcftools stats`](https://samtools.github.io/bcftools/bcftools.html#stats) summary statistics for `htsbox` variant calls | | +| Array[Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?]] | asm_bam | [minimap2](https://github.com/lh3/minimap2) alignment of assembly to reference. | | +| Array[Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?]] | htsbox_vcf | Naive pileup variant calling of assembly against reference with [`htsbox`](https://github.com/lh3/htsbox) | | +| Array[Array[File?]] | htsbox_vcf_stats | [`bcftools stats`](https://samtools.github.io/bcftools/bcftools.html#stats) summary statistics for `htsbox` variant calls | | ## De novo assembly - trio @@ -223,7 +223,7 @@ These files will be output if `cohort.de_novo_assembly_trio` is set to `true` an | Array[Array[File]]? | trio_assembly_noseq_gfas | Assembly graphs in [GFA format](https://github.com/chhylp123/hifiasm/blob/master/docs/source/interpreting-output.rst). | | | Array[Array[File]]? | trio_assembly_lowQ_beds | Coordinates of low quality regions in BED format. | | | Array[Array[File]]? | trio_assembly_stats | Assembly size and NG50 stats generated by [calN50](https://github.com/lh3/calN50). | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)]? | trio_asm_bams | [minimap2](https://github.com/lh3/minimap2) alignment of assembly to reference. | | +| Array[Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)]?] | trio_asm_bams | [minimap2](https://github.com/lh3/minimap2) alignment of assembly to reference. | | | Array[Map[String, String]]? | haplotype_key | Indication of which haplotype (`hap1`/`hap2`) corresponds to which parent. | | # Tool versions and Docker images From 8a84fa67d464b57cd2b14a7a3951534997ca87fd Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Thu, 26 Oct 2023 15:08:46 -0700 Subject: [PATCH 33/62] disclaimer and dockstore.yml --- .dockstore.yml | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 4 +++ 2 files changed, 72 insertions(+) create mode 100644 .dockstore.yml diff --git a/.dockstore.yml b/.dockstore.yml new file mode 100644 index 0000000..81717f9 --- /dev/null +++ b/.dockstore.yml @@ -0,0 +1,68 @@ +# The first line refers to the version 1.2 of the .dockstore.yml schema +version: 1.2 + +# An array of workflows. Each element corresponds to a workflow on Dockstore. +workflows: + + # The optional workflow name for a workflow, which may only consist of alphanumerics + # and internal underscores and hyphens, but no spaces or other characters. Names may not exceed 256 characters. + # If using a .dockstore.yml with multiple workflows, this field is required + # to uniquely identify workflows in the repository. + # + # It should be noted that having the name come first is an arbitrary decision. + # You could use subclass instead, for instance. Provided arrays are not broken + # up, the order of fields within a .dockstore.yml is not important. + - name: wdl-humanassembly + + # The descriptor language used for the workflow. CWL, WDL, NFL (Nextflow), or GALAXY. + # This cannot be changed once the workflow is registered. + subclass: WDL + + # Workflow-wide setting that will affect ALL branches/tags; only set this as needed in a main branch. 
+    # Set to true to publish an unpublished workflow, or false to unpublish a published workflow.
+    # Omitting the publish setting leaves the publish-state unchanged (recommended for all non-primary branches).
+    # publish:
+
+    # The absolute path to the primary descriptor file in the Git repository.
+    # - For CWL, the primary descriptor is a .cwl file.
+    # - For WDL, the primary descriptor is a .wdl file.
+    # - For Galaxy, the primary descriptor is a .ga file.
+    # - Nextflow differs from these as the primary descriptor is a nextflow.config file.
+    primaryDescriptorPath: /workflows/main.wdl
+
+    # An optional array of absolute paths to test parameter files in the Git repository.
+    # For example...
+    # testParameterFiles:
+    #  - /null-model/null-model.json
+    #  - /null-model/null-model-binary.json
+    # testParameterFiles:
+
+    # An optional path to a workflow-specific readme in the Git repository. If not provided, Dockstore will show
+    # the readme.md present at the root of the Git repository if it is present.
+    # If you have multiple workflows in a single Git repository, it is recommended to give each one a readme.
+    readMePath: /README.md
+
+    # An optional array of authorship information.
+    # Note that if orcid is present, then all other fields will be ignored, as information will be taken from orcid.
+    # If orcid is not present, make sure to at a minimum include the name field for each author.
+    authors:
+      - orcid: 0000-0001-5921-2022 # Juniper Lake
+      - orcid: 0000-0001-7628-5645 # Gregory Concepcion
+      - orcid: 0000-0002-7422-1194 # William Rowell
+      - orcid: 0000-0002-5507-0896 # Heather Ward
+      - orcid: 0009-0001-0205-4614 # Karen Fang
+
+    # A boolean that will change the default version to be displayed on Dockstore. Default: False.
+    # A value of true will automatically display the latest tag updated as default.
+    # A value of false will retain the default version that has been specified via the Dockstore UI.
+    latestTagAsDefault: False
+
+    # The optional filters section allows specifying sets of Git branches and tags to include for the workflow.
+    # If no filters are given, all branches and tags are included.
+    # Branches and tags are arrays of pattern-strings.
+    # Pattern-strings use Unix-style Glob syntax by default (Ex: `develop`, `myworkflow/**`)
+    # https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/nio/file/FileSystem.html#getPathMatcher(java.lang.String)
+    # or RegEx when the string is surrounded by / (Ex: `/develop/`, `/myworkflow\/.*/`).
+    filters:
+      branches: [ /develop/ ]
+      tags: [ /v.*/ ]
diff --git a/README.md b/README.md
index dcddd6d..3b29196 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,7 @@
+# DISCLAIMER
+
+TO THE GREATEST EXTENT PERMITTED BY APPLICABLE LAW, THIS WEBSITE AND ITS CONTENT, INCLUDING ALL SOFTWARE, SOFTWARE CODE, SITE-RELATED SERVICES, AND DATA, ARE PROVIDED "AS IS," WITH ALL FAULTS, WITH NO REPRESENTATIONS OR WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, SATISFACTORY QUALITY, NON-INFRINGEMENT OR FITNESS FOR A PARTICULAR PURPOSE. ALL WARRANTIES ARE REJECTED AND DISCLAIMED. YOU ASSUME TOTAL RESPONSIBILITY AND RISK FOR YOUR USE OF THE FOREGOING. PACBIO IS NOT OBLIGATED TO PROVIDE ANY SUPPORT FOR ANY OF THE FOREGOING, AND ANY SUPPORT PACBIO DOES PROVIDE IS SIMILARLY PROVIDED WITHOUT REPRESENTATION OR WARRANTY OF ANY KIND. NO ORAL OR WRITTEN INFORMATION OR ADVICE SHALL CREATE A REPRESENTATION OR WARRANTY OF ANY KIND.
ANY REFERENCES TO SPECIFIC PRODUCTS OR SERVICES ON THE WEBSITES DO NOT CONSTITUTE OR IMPLY A RECOMMENDATION OR ENDORSEMENT BY PACBIO. + # wdl-humanassembly Workflow for running de novo assembly using human PacBio whole genome sequencing (WGS) data. Written using [Workflow Description Language (WDL)](https://openwdl.org/). From 894094f0ee29794187ac164fe85d04f12f6e6c1b Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 26 Oct 2023 22:30:03 +0000 Subject: [PATCH 34/62] update wdl-ci config file after successful tests --- wdl-ci.config.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index ae9fcbf..24c8b08 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -137,7 +137,7 @@ }, "yak_count": { "key": "yak_count", - "digest": "6hlh6n3b3cqohtmjweg57of626he4c4v", + "digest": "i4jt54vu25mhikalp47febetx7mn6xmo", "tests": [ { "inputs": { @@ -175,7 +175,7 @@ "tasks": { "hifiasm_assemble": { "key": "hifiasm_assemble", - "digest": "vhkzwee3f754jcjksog22uyps3j6myow", + "digest": "yt7mrvhlur5xzn5sxbhe52kvvu6r4ejr", "tests": [ { "inputs": { @@ -232,7 +232,7 @@ }, "gfa2fa": { "key": "gfa2fa", - "digest": "es7l5kyje3fiy5vxjnnsqg4fw6sitmdo", + "digest": "drs64xxuazexpb6n6glhbkmartzdorbj", "tests": [ { "inputs": { @@ -264,7 +264,7 @@ }, "align_hifiasm": { "key": "align_hifiasm", - "digest": "4qf5jeepfn3jv3g2socql6xh7vmd4b7s", + "digest": "ctgtjbeekxz2xcq42jnuyxhsfnrb52xg", "tests": [ { "inputs": { From b9f0bcaa12ea767b9bec33c9a0c2ede4f391f17d Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Thu, 26 Oct 2023 16:12:34 -0700 Subject: [PATCH 35/62] clean up README --- README.md | 72 +++---------------------------------------------------- 1 file changed, 3 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index 3b29196..a453820 100644 --- a/README.md +++ b/README.md @@ -52,21 +52,13 @@ For backend-specific configuration, see the relevant documentation: An execution engine is required to run workflows. Two popular engines for running WDL-based workflows are [`miniwdl`](https://miniwdl.readthedocs.io/en/latest/getting_started.html) and [`Cromwell`](https://cromwell.readthedocs.io/en/stable/tutorials/FiveMinuteIntro/). -See [Workbench's documentation](https://docs.dnastack.com/docs/introduction-to-engines-and-backends) as well as the [backend-specific documentation](backends) for details on setting up an engine. +See [backend-specific documentation](backends) for details on setting up an engine. 
| Engine | Azure | AWS | GCP | HPC | | :- | :- | :- | :- | :- | | [**miniwdl**](https://github.com/chanzuckerberg/miniwdl#scaling-up) | _Unsupported_ | Supported via the [Amazon Genomics CLI](https://aws.amazon.com/genomics-cli/) | _Unsupported_ | (SLURM only) Supported via the [`miniwdl-slurm`](https://github.com/miniwdl-ext/miniwdl-slurm) plugin | | [**Cromwell**](https://cromwell.readthedocs.io/en/stable/backends/Backends/) | Supported via [Cromwell on Azure](https://github.com/microsoft/CromwellOnAzure) | Supported via the [Amazon Genomics CLI](https://aws.amazon.com/genomics-cli/) | Supported via Google's [Pipelines API](https://cromwell.readthedocs.io/en/stable/backends/Google/) | Supported - [Configuration varies depending on HPC infrastructure](https://cromwell.readthedocs.io/en/stable/tutorials/HPCIntro/) | -## Registering a workflow engine in Workbench - -Once an engine has been configured, it can optionally be registered in [Workbench](https://workbench.dnastack.com/) to enable a unified interface for workflow submission, monitoring, and statistics. Once configured, workflow runs may be submitted either [via the browser](https://docs.dnastack.com/docs/accessing-the-workbench-gui) or [via the Workbench CLI](#run-using-workbench). - -See [Workbench's documentation](https://docs.dnastack.com/docs/connecting-to-a-workflow-engine) for details on how to register an engine in Workbench. Backend-specific resources and default configurations that may be required as part of engine setup may also be found in the [backends](backends) directory. - -Workbench requires a license to use. For information on obtaining a license or to set up a demo, please contact [support@dnastack.com](mailto:support@dnastack.com). - ## Filling out the inputs JSON The input to a workflow run is defined in JSON format. Template input files with reference dataset information filled out are available for each backend: @@ -82,9 +74,9 @@ If using an HPC backend, you will need to download the reference bundle and repl ## Running the workflow -Run the workflow using the engine and backend that you have configured ([miniwdl](#run-directly-using-miniwdl), [Cromwell](#run-directly-using-cromwell), [Workbench](#run-using-workbench)). +Run the workflow using the engine and backend that you have configured ([miniwdl](#run-directly-using-miniwdl), [Cromwell](#run-directly-using-cromwell)). -Note that the calls to `miniwdl` and `Cromwell` assume you are accessing the engine directly on the machine on which it has been deployed. Depending on the backend you have configured, you may be able to submit workflows using different methods (e.g. using trigger files in Azure, or using the Amazon Genomics CLI in AWS). Calls to the Workbench CLI will be the same regardless of the engine/backend combination. +Note that the calls to `miniwdl` and `Cromwell` assume you are accessing the engine directly on the machine on which it has been deployed. Depending on the backend you have configured, you may be able to submit workflows using different methods (e.g. using trigger files in Azure, or using the Amazon Genomics CLI in AWS). ### Run directly using miniwdl @@ -114,64 +106,6 @@ curl -X "POST" \ To specify [workflow options](https://cromwell.readthedocs.io/en/latest/wf_options/Overview/), add the following to the request (assuming your options file is a file called `options.json` located in the `pwd`): `-F "workflowOptions=@options.json;type=application/json"`. 
-### Run using Workbench - -Rather than running a workflow directly using an engine, engines can be configured using [Workbench](https://workbench.dnastack.com/). Workbench presents a unified interface to the respective backends and engines. Workflow runs may be submitted and monitored either [directly in-browser](https://docs.dnastack.com/docs/accessing-the-workbench-gui) or using the command-line interface (CLI) (see below). - -Note that these steps assume you have already [set up and registered an engine in Workbench](https://docs.dnastack.com/docs/workbench-settings). - -1. [Install and configure the DNAstack CLI](#installing-and-configuring-the-dnastack-cli) -2. [Register the workflow on Workbench](#registering-the-workflow-on-workbench) -3. [Submit a workflow run](#submitting-workflow-runs-via-workbench) - -Steps (1) and (2) are one-time setup, following which any number of workflow runs may be submitted. - -For assistance and licensing, please contact [support@dnastack.com](mailto:support@dnastack.com). - -#### Installing and configuring the DNAstack CLI - -1. Install the DNAstack CLI - -`python3 -m pip install --user dnastack-client-library` - -Confirm that the CLI is installed and available by running `dnastack --version`. - -2. Authenticate using the CLI - -`dnastack auth login` - -3. Configure the CLI to use workbench - -`dnastack use workbench.dnastack.com` - -You can now use the DNAstack CLI to interact with Workbench. - -#### Registering the workflow on Workbench - -From the root of this repository, run: - -```bash -dnastack alpha workbench workflows create \ - --name "PacBio Human Assembly" \ - --description =@README.md \ - workflows/main.wdl -``` -Note the `internalId` field of the returned JSON. This will be used as the `--url` value when submitting workflow runs. - -This step only needs to be completed once, when initially registering the workflow. Following this initial setup, additional runs may be submitted by using the same `internalId` recorded here. - -#### Submitting workflow runs via Workbench - -In the following command, replace `` with the path to your filled out inputs file, and `` with the ID you noted in step (1). If no engine is provided, the default engine you have configured will be used. - -```bash -dnastack workbench runs submit \ - --workflow-params @ \ - --url \ - [--tags ] \ - [--engine ] -``` - # Workflow inputs This section describes the inputs required for a run of the workflow. Typically, only the `de_novo_assembly.cohort` and potentially [run/backend-specific sections](#other-inputs) will be filled out by the user for each run of the workflow. Input templates with reference file locations filled out are provided [for each backend](backends). 
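For reference, the `workflowOptions` flag described in the README change above slots into the same multipart request as the workflow source and inputs. A minimal sketch, assuming a Cromwell server listening on `localhost:8000` and files named `inputs.json` and `options.json` in the working directory (both filenames are illustrative):

```bash
# Submit workflows/main.wdl to a local Cromwell server, attaching inputs and workflow options
curl -X "POST" "http://localhost:8000/api/workflows/v1" \
	-H "accept: application/json" \
	-F "workflowSource=@workflows/main.wdl" \
	-F "workflowInputs=@inputs.json;type=application/json" \
	-F "workflowOptions=@options.json;type=application/json"
```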
From 461a6f821a6034ca00727b93a07654b88feb691f Mon Sep 17 00:00:00 2001
From: gconcepcion
Date: Tue, 31 Oct 2023 16:24:17 -0700
Subject: [PATCH 36/62] update input jsons

---
 backends/aws/inputs.aws.json     | 16 ++++++++++++----
 backends/azure/inputs.azure.json | 16 ++++++++++++----
 backends/gcp/inputs.gcp.json     | 16 ++++++++++++----
 backends/hpc/inputs.hpc.json     | 16 ++++++++++++----
 4 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/backends/aws/inputs.aws.json b/backends/aws/inputs.aws.json
index 44c4a52..c120e44 100644
--- a/backends/aws/inputs.aws.json
+++ b/backends/aws/inputs.aws.json
@@ -15,10 +15,18 @@
 	},
 	"de_novo_assembly.references": [
 		{
-			"name": "String",
-			"fasta": {
-				"data": "File",
-				"data_index": "File"
+			"name": "GRCh38",
+			"fasta": {
+				"data": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
+				"data_index": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai"
+			}
+		},
+		{
+			"name": "chm13v2.0",
+			"fasta": {
+				"data": "s3://dnastack-resources/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta",
+				"data_index": "s3://dnastack-resources/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta.fai"
+			}
 		}
 	],
 	"de_novo_assembly.backend": "AWS",
diff --git a/backends/azure/inputs.azure.json b/backends/azure/inputs.azure.json
index 5d63fa6..7dfd2f6 100644
--- a/backends/azure/inputs.azure.json
+++ b/backends/azure/inputs.azure.json
@@ -15,10 +15,18 @@
 	},
 	"de_novo_assembly.references": [
 		{
-			"name": "String",
-			"fasta": {
-				"data": "File",
-				"data_index": "File"
+			"name": "GRCh38",
+			"fasta": {
+				"data": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
+				"data_index": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai"
+			}
+		},
+		{
+			"name": "chm13v2.0",
+			"fasta": {
+				"data": "/datasetpbrarediseases/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta",
+				"data_index": "/datasetpbrarediseases/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta.fai"
+			}
 		}
 	],
 	"de_novo_assembly.backend": "Azure",
diff --git a/backends/gcp/inputs.gcp.json b/backends/gcp/inputs.gcp.json
index 723742a..7969ec7 100644
--- a/backends/gcp/inputs.gcp.json
+++ b/backends/gcp/inputs.gcp.json
@@ -15,10 +15,18 @@
 	},
 	"de_novo_assembly.references": [
 		{
-			"name": "String",
-			"fasta": {
-				"data": "File",
-				"data_index": "File"
+			"name": "GRCh38",
+			"fasta": {
+				"data": "gs://pacbio-wdl/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
+				"data_index": "gs://pacbio-wdl/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai"
+			}
+		},
+		{
+			"name": "chm13v2.0",
+			"fasta": {
+				"data": "gs://pacbio-wdl/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta",
+				"data_index": "gs://pacbio-wdl/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta.fai"
+			}
 		}
 	],
 	"de_novo_assembly.backend": "GCP",
diff --git a/backends/hpc/inputs.hpc.json b/backends/hpc/inputs.hpc.json
index e0979de..be3118f 100644
--- a/backends/hpc/inputs.hpc.json
+++ b/backends/hpc/inputs.hpc.json
@@ -15,10 +15,18 @@
 	},
 	"de_novo_assembly.references": [
 		{
-			"name": "String",
-			"fasta": {
-				"data": "File",
-				"data_index": "File"
+			"name": "GRCh38",
+			"fasta": {
+				"data": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
+				"data_index": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai"
+			}
+		},
+		{
+			"name": "chm13v2.0",
+			"fasta": {
+				"data": "/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta",
+				"data_index": "/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta.fai"
+			}
 		}
 	],

Date: Wed, 1 Nov 2023 11:53:32 -0700
Subject: [PATCH 37/62] add -L3.1g for NGx calculation default for human genomes
--- workflows/assemble_genome/assemble_genome.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index 40debd1..34cd50f 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -173,6 +173,7 @@ task gfa2fa { # Calculate assembly stats k8 \ /opt/calN50/calN50.js \ + -L3.1g \ ~{gfa_basename}.fasta.gz \ > ~{gfa_basename}.fasta.stats.txt >>> From deba05b34b2a1bf4520d22b4ce6b445b0fe03040 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Fri, 3 Nov 2023 14:34:05 -0700 Subject: [PATCH 38/62] substitute paftools for htsbox --- workflows/assemble_genome/assemble_genome.wdl | 129 +++++++++++++++--- .../de_novo_assembly_sample.wdl | 97 ++++++++++--- workflows/main.wdl | 9 +- 3 files changed, 191 insertions(+), 44 deletions(-) diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index 34cd50f..14c1c74 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -42,32 +42,63 @@ workflow assemble_genome { } } + #scatter (ref in references) { + # call align_hifiasm { +# input: +# sample_id = sample_id, +# query_sequences = gfa2fa.zipped_fasta, +# reference = ref.fasta.data, +# reference_name = ref.name, +# runtime_attributes = default_runtime_attributes +# } +# +# IndexData sample_aligned_bam = { +# "data": align_hifiasm.asm_bam, +# "data_index": align_hifiasm.asm_bam_index +# } + +# Pair[ReferenceData,IndexData] align_data = (ref, sample_aligned_bam) + #} + scatter (ref in references) { - call align_hifiasm { + scatter (hap in gfa2fa.zipped_fasta) { + call align_hifiasm { + input: + sample_id = sample_id, + query_sequences = hap, + reference = ref.fasta.data, + reference_name = ref.name, + runtime_attributes = default_runtime_attributes + } + + IndexData sample_aligned_bam = { + "data": align_hifiasm.asm_bam, + "data_index": align_hifiasm.asm_bam_index + } + + Pair[ReferenceData,IndexData] align_data = (ref, sample_aligned_bam) + } + + Array[File] bamlist = align_hifiasm.asm_bam + + call merge_haps { input: sample_id = sample_id, - query_sequences = gfa2fa.zipped_fasta, - reference = ref.fasta.data, - reference_name = ref.name, + bams = bamlist, + refname = ref.name, runtime_attributes = default_runtime_attributes } - IndexData sample_aligned_bam = { - "data": align_hifiasm.asm_bam, - "data_index": align_hifiasm.asm_bam_index - } - Pair[ReferenceData,IndexData] align_data = (ref, sample_aligned_bam) } - - output { Array[File] assembly_noseq_gfas = hifiasm_assemble.assembly_noseq_gfas Array[File] assembly_lowQ_beds = hifiasm_assemble.assembly_lowQ_beds Array[File] zipped_assembly_fastas = gfa2fa.zipped_fasta Array[File] assembly_stats = gfa2fa.assembly_stats - Array[IndexData] asm_bams = sample_aligned_bam - Array[Pair[ReferenceData,IndexData]] alignments = align_data + Array[IndexData] asm_bams = flatten(sample_aligned_bam) + Array[IndexData] merged_bams = merge_haps.merged_bam + Array[Pair[ReferenceData,IndexData]] alignments = flatten(align_data) } parameter_meta { @@ -202,7 +233,7 @@ task gfa2fa { task align_hifiasm { input { String sample_id - Array[File] query_sequences + File query_sequences File reference String reference_name @@ -215,10 +246,12 @@ task align_hifiasm { Int disk_size = ceil((size(query_sequences, "GB") + size(reference, "GB")) * 2 + 20) command <<< - set -euo pipefail + echo "minimap2 version: $(minimap2 --version)" - + 
haplotype=$(basename ~{query_sequences} | sed -n 's/.*\(hap.\).*/\1/p') + echo $haplotype > hap.txt + samtools --version minimap2 \ @@ -226,6 +259,7 @@ task align_hifiasm { -L \ --secondary=no \ --eqx \ + --cs \ -a \ -x asm5 \ -R "@RG\\tID:~{sample_id}_hifiasm\\tSM:~{sample_id}" \ @@ -236,18 +270,71 @@ task align_hifiasm { -T ./TMP \ -m 8G \ -O BAM \ - -o ~{sample_id}.asm.~{reference_name}.bam + -o ~{sample_id}.$haplotype.asm.~{reference_name}.bam + + samtools index ~{sample_id}.$haplotype.asm.~{reference_name}.bam + >>> + + output { +# String haplotype = read_string("hap.txt") +# File asm_bam = "~{sample_id}.~{haplotype}.asm.~{reference_name}.bam" +# File asm_bam_index = "~{sample_id}.~{haplotype}.asm.~{reference_name}.bam.bai" +# File asm_bam = "~{sample_id}.asm.~{reference_name}.bam" +# File asm_bam_index = "~{sample_id}.asm.~{reference_name}.bam.bai" + + File asm_bam = glob("*.bam")[0] + File asm_bam_index = glob("*.bam.bai")[0] + + + } + + runtime { + docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:0e8ad680b0e89376eb94fa8daa1a0269a4abe695ba39523a5c56a59d5c0e3953" + cpu: threads + memory: mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} + +task merge_haps { + input { + Array[File] bams + String sample_id + String refname + + RuntimeAttributes runtime_attributes + } + + Int threads = 3 + Int disk_size = 20 + Int mem_gb = threads * 8 + + command <<< + + samtools merge \ + -@3 \ + -b \ + -o ~{sample_id}.asm.~{refname}.bam \ + ~{sep=' ' bams} + + samtools index ~{sample_id}.asm.~{refname}.bam + - samtools index ~{sample_id}.asm.~{reference_name}.bam >>> output { - File asm_bam = "~{sample_id}.asm.~{reference_name}.bam" - File asm_bam_index = "~{sample_id}.asm.~{reference_name}.bam.bai" + IndexData merged_bam = {"data": "~{sample_id}.asm.~{refname}.bam", + "data_index": "~{sample_id}.asm.~{refname}.bam.bai"} } runtime { - docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:3968cb152a65163005ffed46297127536701ec5af4c44e8f3e7051f7b01f80fe" + docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:0e8ad680b0e89376eb94fa8daa1a0269a4abe695ba39523a5c56a59d5c0e3953" cpu: threads memory: mem_gb + " GB" disk: disk_size + " GB" diff --git a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl index 76f8fe4..dd53438 100644 --- a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl +++ b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl @@ -40,21 +40,31 @@ workflow de_novo_assembly_sample { scatter (aln in assemble_genome.alignments) { ReferenceData ref = aln.left IndexData bam = aln.right - call htsbox { +# call htsbox { +# input: +# bam = bam.data, +# bam_index = bam.data_index, +# reference = ref.fasta.data, +# runtime_attributes = default_runtime_attributes +# } + + call paftools { input: bam = bam.data, + sample = sample.sample_id, bam_index = bam.data_index, reference = ref.fasta.data, runtime_attributes = default_runtime_attributes } + call ZipIndexVcf.zip_index_vcf { input: - vcf = htsbox.htsbox_vcf, + vcf = paftools.paftools_vcf, runtime_attributes = default_runtime_attributes } - IndexData htsbox_vcf = { + IndexData paftools_vcf = { "data": zip_index_vcf.zipped_vcf, 
"data_index": zip_index_vcf.zipped_vcf_index } @@ -62,21 +72,27 @@ workflow de_novo_assembly_sample { call BcftoolsStats.bcftools_stats { input: vcf = zip_index_vcf.zipped_vcf, - params = "--samples ~{basename(bam.data)}", + params = "--samples ~{sample.sample_id}", +# params = "--samples ~{basename(bam.data)}", reference = ref.fasta.data, runtime_attributes = default_runtime_attributes } } + + output { Array[File] assembly_noseq_gfas = assemble_genome.assembly_noseq_gfas Array[File] assembly_lowQ_beds = assemble_genome.assembly_lowQ_beds Array[File] zipped_assembly_fastas = assemble_genome.zipped_assembly_fastas Array[File] assembly_stats = assemble_genome.assembly_stats + Array[IndexData] merged_bams = assemble_genome.merged_bams Array[IndexData] asm_bams = assemble_genome.asm_bams + # Array[File] paftools_vcfs = paftools.paftools_vcf - Array[IndexData] htsbox_vcfs = htsbox_vcf - Array[File] htsbox_vcf_stats = bcftools_stats.stats + + Array[IndexData] paftools_vcfs = paftools_vcf + Array[File] paftools_vcf_stats = bcftools_stats.stats } parameter_meta { @@ -87,45 +103,83 @@ workflow de_novo_assembly_sample { } } -task htsbox { +task paftools { input { File bam File bam_index File reference + String sample + RuntimeAttributes runtime_attributes } String bam_basename = basename(bam, ".bam") Int threads = 2 - Int disk_size = ceil((size(bam, "GB") + size(reference, "GB")) * 3 + 200) + Int disk_size = ceil((size(bam, "GB") + size(reference, "GB")) * 3 + 20) + Int mem_gb = threads * 8 command <<< set -euo pipefail - # Ensure the sample is named based on the bam basename (not the full path) - cp ~{bam} . + samtools view -h ~{bam} | \ + k8 /opt/minimap2-2.17/misc/paftools.js sam2paf - | \ + sort -k6,6 -k8,8n | \ + k8 /opt/minimap2-2.17/misc/paftools.js call \ + -L5000 \ + -f ~{reference} \ + -s ~{sample} \ + - \ + > ~{bam_basename}.paftools.vcf - # htsbox has no version option; grep the version from the help output; ignore errors - htsbox 2>&1 | grep -Eo 'Version: htslib [0-9a-z-]+, htsbox [0-9a-z-]+' || true + >>> - htsbox pileup \ - -q20 \ - -c \ - -f ~{reference} \ - ~{basename(bam)} \ - > ~{bam_basename}.htsbox.vcf + output { + File paftools_vcf = "~{bam_basename}.paftools.vcf" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:0e8ad680b0e89376eb94fa8daa1a0269a4abe695ba39523a5c56a59d5c0e3953" + cpu: threads + memory: mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} + +task ref_group { + input { + Array[IndexData] bams + String sample_id + String refname + + RuntimeAttributes runtime_attributes + } + + Int threads = 3 + Int disk_size = 20 + Int mem_gb = threads * 8 + + command <<< + + echo "test" >>> output { - File htsbox_vcf = "~{bam_basename}.htsbox.vcf" + Array[File] bamlist = glob("*.bam") + String ref = refname } runtime { - docker: "~{runtime_attributes.container_registry}/htsbox@sha256:740b7962584a582757ee9601719fa98403517db669037bc3946e9ecc5f970654" + docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:0e8ad680b0e89376eb94fa8daa1a0269a4abe695ba39523a5c56a59d5c0e3953" cpu: threads - memory: "14 GB" + memory: mem_gb + " GB" disk: disk_size + " GB" disks: "local-disk " + disk_size + " HDD" preemptible: runtime_attributes.preemptible_tries @@ -135,3 +189,4 @@ task 
htsbox { zones: runtime_attributes.zones } } + diff --git a/workflows/main.wdl b/workflows/main.wdl index 7647f34..31b2c58 100644 --- a/workflows/main.wdl +++ b/workflows/main.wdl @@ -65,9 +65,14 @@ workflow de_novo_assembly { Array[Array[File]?] assembly_lowQ_beds = de_novo_assembly_sample.assembly_lowQ_beds Array[Array[File]?] zipped_assembly_fastas = de_novo_assembly_sample.zipped_assembly_fastas Array[Array[File]?] assembly_stats = de_novo_assembly_sample.assembly_stats + + #ORIGINAL - UNSURE OF THIS ONE + #Array[Array[IndexData]?] asm_bam = de_novo_assembly_sample.asm_bams Array[Array[IndexData]?] asm_bam = de_novo_assembly_sample.asm_bams - Array[Array[IndexData]?] htsbox_vcf = de_novo_assembly_sample.htsbox_vcfs - Array[Array[File]?] htsbox_vcf_stats = de_novo_assembly_sample.htsbox_vcf_stats + Array[Array[IndexData]?] merged_bams = de_novo_assembly_sample.merged_bams + + Array[Array[IndexData]?] paftools_vcf = de_novo_assembly_sample.paftools_vcfs + Array[Array[File]?] paftools_vcf_stats = de_novo_assembly_sample.paftools_vcf_stats # de_novo_assembly_trio output Array[Map[String, String]]? haplotype_key = de_novo_assembly_trio.haplotype_key From 8b9a61dc66e70e6b04d6f7834d2c99b38cfecc00 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Fri, 3 Nov 2023 14:49:26 -0700 Subject: [PATCH 39/62] clean up comments --- workflows/assemble_genome/assemble_genome.wdl | 27 +---------- .../de_novo_assembly_sample.wdl | 48 +------------------ 2 files changed, 2 insertions(+), 73 deletions(-) diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index 14c1c74..e0a9b1e 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -41,24 +41,7 @@ workflow assemble_genome { runtime_attributes = default_runtime_attributes } } - - #scatter (ref in references) { - # call align_hifiasm { -# input: -# sample_id = sample_id, -# query_sequences = gfa2fa.zipped_fasta, -# reference = ref.fasta.data, -# reference_name = ref.name, -# runtime_attributes = default_runtime_attributes -# } -# -# IndexData sample_aligned_bam = { -# "data": align_hifiasm.asm_bam, -# "data_index": align_hifiasm.asm_bam_index -# } - -# Pair[ReferenceData,IndexData] align_data = (ref, sample_aligned_bam) - #} + scatter (ref in references) { scatter (hap in gfa2fa.zipped_fasta) { @@ -276,16 +259,8 @@ task align_hifiasm { >>> output { -# String haplotype = read_string("hap.txt") -# File asm_bam = "~{sample_id}.~{haplotype}.asm.~{reference_name}.bam" -# File asm_bam_index = "~{sample_id}.~{haplotype}.asm.~{reference_name}.bam.bai" -# File asm_bam = "~{sample_id}.asm.~{reference_name}.bam" -# File asm_bam_index = "~{sample_id}.asm.~{reference_name}.bam.bai" - File asm_bam = glob("*.bam")[0] File asm_bam_index = glob("*.bam.bai")[0] - - } runtime { diff --git a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl index dd53438..b5ecf84 100644 --- a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl +++ b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl @@ -40,13 +40,6 @@ workflow de_novo_assembly_sample { scatter (aln in assemble_genome.alignments) { ReferenceData ref = aln.left IndexData bam = aln.right -# call htsbox { -# input: -# bam = bam.data, -# bam_index = bam.data_index, -# reference = ref.fasta.data, -# runtime_attributes = default_runtime_attributes -# } call paftools { input: @@ -73,7 +66,6 @@ workflow de_novo_assembly_sample { 
input: vcf = zip_index_vcf.zipped_vcf, params = "--samples ~{sample.sample_id}", -# params = "--samples ~{basename(bam.data)}", reference = ref.fasta.data, runtime_attributes = default_runtime_attributes } @@ -151,42 +143,4 @@ task paftools { queueArn: runtime_attributes.queue_arn zones: runtime_attributes.zones } -} - -task ref_group { - input { - Array[IndexData] bams - String sample_id - String refname - - RuntimeAttributes runtime_attributes - } - - Int threads = 3 - Int disk_size = 20 - Int mem_gb = threads * 8 - - command <<< - - echo "test" - >>> - - output { - Array[File] bamlist = glob("*.bam") - String ref = refname - } - - runtime { - docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:0e8ad680b0e89376eb94fa8daa1a0269a4abe695ba39523a5c56a59d5c0e3953" - cpu: threads - memory: mem_gb + " GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - +} \ No newline at end of file From 150b4a4b8d5809fe31bcd79a765da074c870e9fe Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Tue, 7 Nov 2023 15:52:42 -0800 Subject: [PATCH 40/62] now that we are aligning haps separately, there is a single assembly input instead of a list --- workflows/assemble_genome/assemble_genome.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index e0a9b1e..d1710a6 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -247,7 +247,7 @@ task align_hifiasm { -x asm5 \ -R "@RG\\tID:~{sample_id}_hifiasm\\tSM:~{sample_id}" \ ~{reference} \ - ~{sep=' ' query_sequences} \ + ~{query_sequences} \ | samtools sort \ -@ 3 \ -T ./TMP \ From 58bdf47837af84de10f939576c94d09ec92949ad Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Wed, 15 Nov 2023 10:24:53 -0800 Subject: [PATCH 41/62] add double quotes and fix tests --- workflows/assemble_genome/assemble_genome.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index d1710a6..ce134b1 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -233,7 +233,7 @@ task align_hifiasm { echo "minimap2 version: $(minimap2 --version)" haplotype=$(basename ~{query_sequences} | sed -n 's/.*\(hap.\).*/\1/p') - echo $haplotype > hap.txt + echo "$haplotype" > hap.txt samtools --version @@ -253,9 +253,9 @@ task align_hifiasm { -T ./TMP \ -m 8G \ -O BAM \ - -o ~{sample_id}.$haplotype.asm.~{reference_name}.bam + -o "~{sample_id}.$haplotype.asm.~{reference_name}.bam" - samtools index ~{sample_id}.$haplotype.asm.~{reference_name}.bam + samtools index "~{sample_id}.$haplotype.asm.~{reference_name}.bam" >>> output { From de6bf2180b812e39704030f17dd5a126f4fea85f Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 18:35:53 +0000 Subject: [PATCH 42/62] update wdl-ci config file after successful tests --- wdl-ci.config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 24c8b08..009e8c1 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -232,7 +232,7 @@ }, 
"gfa2fa": { "key": "gfa2fa", - "digest": "drs64xxuazexpb6n6glhbkmartzdorbj", + "digest": "r2xbqxqkae5owmzwkmvfk6atpdiu75vf", "tests": [ { "inputs": { From cc5afcb585c94a6e78374bf82312c4290db4917f Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Wed, 15 Nov 2023 12:38:08 -0800 Subject: [PATCH 43/62] update wdl-ci json --- wdl-ci.config.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 009e8c1..a0cd618 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -289,6 +289,11 @@ } } ] + }, + "merge_haps": { + "key": "merge_haps", + "digest": "", + "tests": [] } } }, @@ -319,6 +324,11 @@ } } ] + }, + "paftools": { + "key": "paftools", + "digest": "", + "tests": [] } } }, From fe8b97f7ccc11695f36b3f596296d6efdc45693c Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Wed, 15 Nov 2023 13:35:16 -0800 Subject: [PATCH 44/62] update wdl-common --- workflows/wdl-common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/wdl-common b/workflows/wdl-common index 101b444..e37b327 160000 --- a/workflows/wdl-common +++ b/workflows/wdl-common @@ -1 +1 @@ -Subproject commit 101b444a3b2bd76ea014a53da80c0e800576ebbe +Subproject commit e37b3274f6e78a612adeae0e36a104a5752de9f7 From b5b121e7710218f73a44a33a45c2dcadc6d35454 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Wed, 15 Nov 2023 13:37:37 -0800 Subject: [PATCH 45/62] update tests for new wdl-common --- wdl-ci.config.json | 51 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index a0cd618..3240f3a 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -487,6 +487,57 @@ "tests": [] } } + }, + "workflows/wdl-common/wdl/tasks/concat_vcf.wdl": { + "key": "workflows/wdl-common/wdl/tasks/concat_vcf.wdl", + "name": "", + "description": "", + "tasks": { + "concat_vcf": { + "key": "concat_vcf", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/workflows/hiphase/hiphase.wdl": { + "key": "workflows/wdl-common/wdl/workflows/hiphase/hiphase.wdl", + "name": "", + "description": "", + "tasks": { + "run_hiphase": { + "key": "run_hiphase", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/workflows/pharmcat/pharmcat.wdl": { + "key": "workflows/wdl-common/wdl/workflows/pharmcat/pharmcat.wdl", + "name": "", + "description": "", + "tasks": { + "pangu_cyp2d6": { + "key": "pangu_cyp2d6", + "digest": "", + "tests": [] + }, + "pharmcat_preprocess": { + "key": "pharmcat_preprocess", + "digest": "", + "tests": [] + }, + "filter_preprocessed_vcf": { + "key": "filter_preprocessed_vcf", + "digest": "", + "tests": [] + }, + "run_pharmcat": { + "key": "run_pharmcat", + "digest": "", + "tests": [] + } + } } }, "engines": { From f2b175f251a2efcbab34ad1162fd24690a64ce48 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Wed, 15 Nov 2023 14:42:27 -0800 Subject: [PATCH 46/62] fix tests --- wdl-ci.config.json | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 3240f3a..46a9928 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -386,33 +386,6 @@ } } }, - "workflows/wdl-common/wdl/tasks/pharmcat.wdl": { - "key": "workflows/wdl-common/wdl/tasks/pharmcat.wdl", - "name": "", - "description": "", - "tasks": { - "pangu_cyp2d6": { - "key": "pangu_cyp2d6", - "digest": "", - "tests": [] - }, - "pharmcat_preprocess": { - "key": "pharmcat_preprocess", - "digest": "", - "tests": [] - }, - 
"filter_preprocessed_vcf": { - "key": "filter_preprocessed_vcf", - "digest": "", - "tests": [] - }, - "run_pharmcat": { - "key": "run_pharmcat", - "digest": "", - "tests": [] - } - } - }, "workflows/wdl-common/wdl/tasks/whatshap_haplotag.wdl": { "key": "workflows/wdl-common/wdl/tasks/whatshap_haplotag.wdl", "name": "", @@ -574,4 +547,4 @@ } } } -} \ No newline at end of file +} From 2471441fdd3dc760a2e5a53b5618162eeb95cc65 Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 22:55:24 +0000 Subject: [PATCH 47/62] update wdl-ci config file after successful tests --- wdl-ci.config.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 46a9928..dbc3b35 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -19,7 +19,7 @@ "tasks": { "bcftools_stats": { "key": "bcftools_stats", - "digest": "hnaklilhvfhzokyogil55ymtufpzugdc", + "digest": "cu73ojtpnhesxaa2jh7a7l23vlieds3i", "tests": [ { "inputs": { @@ -49,7 +49,7 @@ "tasks": { "zip_index_vcf": { "key": "zip_index_vcf", - "digest": "zglkxnubs7arukywr6dtr2rmlrs4l6si", + "digest": "cflenxzb6uj2ujfv4pkllo3vztdkev45", "tests": [ { "inputs": { @@ -79,7 +79,7 @@ "tasks": { "samtools_fasta": { "key": "samtools_fasta", - "digest": "fzvpxhpi2a5nyyys7ktoirf4ww2exbe3", + "digest": "x336uu76d5c6nzls2vgntvoqrnhex5q4", "tests": [ { "inputs": { @@ -547,4 +547,4 @@ } } } -} +} \ No newline at end of file From c434e7dfa0a5d26bd9d7c8e690c3c8484a94bb6b Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Thu, 16 Nov 2023 14:34:57 -0800 Subject: [PATCH 48/62] attempt to fix tests --- wdl-ci.config.json | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index dbc3b35..13a678e 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -269,17 +269,14 @@ { "inputs": { "sample_id": "HG005", - "query_sequences": [ - "${resources_file_path}/HG005.asm.bp.hap1.p_ctg.fasta.gz", - "${resources_file_path}/HG005.asm.bp.hap2.p_ctg.fasta.gz" - ], + "query_sequences": "${resources_file_path}/HG005.asm.bp.hap1.p_ctg.fasta.gz", "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", "reference_name": "GRCh38", "runtime_attributes": "${default_runtime_attributes}" }, "output_tests": { "asm_bam": { - "value": "${resources_file_path}/HG005.asm.GRCh38.bam", + "value": "${resources_file_path}/HG005.hap1.asm.GRCh38.bam", "test_tasks": [ "compare_file_basename", "samtools_quickcheck", @@ -547,4 +544,4 @@ } } } -} \ No newline at end of file +} From 979ab488e9be6ce6a786d1723525fcf831622c1d Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Thu, 16 Nov 2023 15:40:11 -0800 Subject: [PATCH 49/62] update test locaton and remove tests from non-humanassembly tasks --- wdl-ci.config.json | 34 +++------------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 13a678e..290fab8 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -20,21 +20,7 @@ "bcftools_stats": { "key": "bcftools_stats", "digest": "cu73ojtpnhesxaa2jh7a7l23vlieds3i", - "tests": [ - { - "inputs": { - "vcf": "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.gz", - "params": "--apply-filters PASS --samples ${sample_id}", - "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "runtime_attributes": "${default_runtime_attributes}" - }, - "output_tests": { - "stats": { - "value": 
"${resources_file_path}/HG005.GRCh38.deepvariant.vcf.stats.txt", - "test_tasks": [ - "compare_file_basename", - "check_empty_lines" - ] + "tests": [] } } } @@ -50,21 +36,7 @@ "zip_index_vcf": { "key": "zip_index_vcf", "digest": "cflenxzb6uj2ujfv4pkllo3vztdkev45", - "tests": [ - { - "inputs": { - "vcf": "${resources_file_path}/HG005.GRCh38.pbsv.vcf", - "runtime_attributes": "${default_runtime_attributes}" - }, - "output_tests": { - "zipped_vcf": { - "value": "${resources_file_path}/HG005.GRCh38.pbsv.vcf.gz", - "test_tasks": [ - "calculate_md5sum", - "compare_file_basename", - "vcftools_validator", - "check_gzip" - ] + "tests": [] } } } @@ -539,7 +511,7 @@ "engine_params": { "f1ed5b40-6a26-4eac-a2b8-9960516e4164": { "input_file_path": "/coac74908838b5dd7/inputs/small_dataset/chr6.p23", - "resources_file_path": "/coac74908838b5dd7/inputs/wdl-ci/humanwgs", + "resources_file_path": "/coac74908838b5dd7/inputs/wdl-ci/humanassembly", "datasets_file_path": "/datasetpbrarediseases/dataset" } } From 22c3fc26483e5a2bd6fedf4b6168d53911d53a0d Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Fri, 17 Nov 2023 15:08:30 -0800 Subject: [PATCH 50/62] update wdl config to reflect local HPC test running --- wdl-ci.config.json | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 290fab8..9a05650 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -21,10 +21,6 @@ "key": "bcftools_stats", "digest": "cu73ojtpnhesxaa2jh7a7l23vlieds3i", "tests": [] - } - } - } - ] } } }, @@ -37,10 +33,6 @@ "key": "zip_index_vcf", "digest": "cflenxzb6uj2ujfv4pkllo3vztdkev45", "tests": [] - } - } - } - ] } } }, @@ -485,8 +477,13 @@ "engines": { "f1ed5b40-6a26-4eac-a2b8-9960516e4164": { "key": "f1ed5b40-6a26-4eac-a2b8-9960516e4164", - "enabled": true, + "enabled": false, "name": "PacBio CoA installation" + }, + "abc123": { + "key": "abc123", + "enabled": true, + "name": "PacBio HPC" } }, "test_params": { @@ -510,9 +507,14 @@ }, "engine_params": { "f1ed5b40-6a26-4eac-a2b8-9960516e4164": { - "input_file_path": "/coac74908838b5dd7/inputs/small_dataset/chr6.p23", - "resources_file_path": "/coac74908838b5dd7/inputs/wdl-ci/humanassembly", - "datasets_file_path": "/datasetpbrarediseases/dataset" + "input_file_path": "/coac74908838b5dd7/inputs/small_dataset/chr6.p23", + "resources_file_path": "/coac74908838b5dd7/inputs/wdl-ci/humanassembly", + "datasets_file_path": "/datasetpbrarediseases/dataset" + }, + "abc123": { + "input_file_path": "/pbi/collections/appslabht/cromwell_output/testdata/inputs/small_dataset/chr6.p23", + "resources_file_path": "/pbi/collections/appslabht/cromwell_output/testdata/inputs/wdl-ci/humanassembly", + "datasets_file_path": "/pbi/collections/appslabht/cromwell_output/testdata//datasetpbrarediseases/dataset" } } } From 421fcc70343df55b091f06a145a7337de03c045b Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Mon, 20 Nov 2023 10:08:47 -0800 Subject: [PATCH 51/62] temporarily disable tests while troubleshooting --- wdl-ci.config.json | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 9a05650..74db142 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -242,9 +242,6 @@ "asm_bam": { "value": "${resources_file_path}/HG005.hap1.asm.GRCh38.bam", "test_tasks": [ - "compare_file_basename", - "samtools_quickcheck", - "check_coordinate_sorted_alignment" ] } } @@ -422,18 +419,6 @@ } } }, - "workflows/wdl-common/wdl/tasks/concat_vcf.wdl": { - "key": 
"workflows/wdl-common/wdl/tasks/concat_vcf.wdl", - "name": "", - "description": "", - "tasks": { - "concat_vcf": { - "key": "concat_vcf", - "digest": "", - "tests": [] - } - } - }, "workflows/wdl-common/wdl/workflows/hiphase/hiphase.wdl": { "key": "workflows/wdl-common/wdl/workflows/hiphase/hiphase.wdl", "name": "", From 013c77fb850096edb769bd547a8a4179aceda410 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Mon, 20 Nov 2023 14:46:36 -0800 Subject: [PATCH 52/62] fix tests for CoA --- wdl-ci.config.json | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 74db142..cbe8be7 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -242,6 +242,9 @@ "asm_bam": { "value": "${resources_file_path}/HG005.hap1.asm.GRCh38.bam", "test_tasks": [ + "compare_file_basename", + "samtools_quickcheck", + "check_coordinate_sorted_alignment" ] } } @@ -462,13 +465,8 @@ "engines": { "f1ed5b40-6a26-4eac-a2b8-9960516e4164": { "key": "f1ed5b40-6a26-4eac-a2b8-9960516e4164", - "enabled": false, - "name": "PacBio CoA installation" - }, - "abc123": { - "key": "abc123", "enabled": true, - "name": "PacBio HPC" + "name": "PacBio CoA installation" } }, "test_params": { @@ -495,11 +493,6 @@ "input_file_path": "/coac74908838b5dd7/inputs/small_dataset/chr6.p23", "resources_file_path": "/coac74908838b5dd7/inputs/wdl-ci/humanassembly", "datasets_file_path": "/datasetpbrarediseases/dataset" - }, - "abc123": { - "input_file_path": "/pbi/collections/appslabht/cromwell_output/testdata/inputs/small_dataset/chr6.p23", - "resources_file_path": "/pbi/collections/appslabht/cromwell_output/testdata/inputs/wdl-ci/humanassembly", - "datasets_file_path": "/pbi/collections/appslabht/cromwell_output/testdata//datasetpbrarediseases/dataset" } } } From 51a53aa2a0e72e763a5d131b13a988762ab2d5c1 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Mon, 20 Nov 2023 15:57:08 -0800 Subject: [PATCH 53/62] finally fixed? 
--- wdl-ci.config.json | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index cbe8be7..6a15968 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -465,8 +465,13 @@ "engines": { "f1ed5b40-6a26-4eac-a2b8-9960516e4164": { "key": "f1ed5b40-6a26-4eac-a2b8-9960516e4164", - "enabled": true, + "enabled": false, "name": "PacBio CoA installation" + }, + "abc123": { + "key": "abc123", + "enabled": true, + "name": "pacbio-hpc" } }, "test_params": { @@ -493,6 +498,11 @@ "input_file_path": "/coac74908838b5dd7/inputs/small_dataset/chr6.p23", "resources_file_path": "/coac74908838b5dd7/inputs/wdl-ci/humanassembly", "datasets_file_path": "/datasetpbrarediseases/dataset" + }, + "abc123": { + "input_file_path": "/pbi/collections/appslabht/cromwell_output/testdata/inputs/chr6.p23", + "resources_file_path": "/pbi/collections/appslabht/cromwell_output/testdata/wdl-ci/humanassembly", + "datasets_file_path": "/pbi/collections/appslabht/cromwell_output/testdata/datasetpbrarediseases/dataset" } } } From d1b2510352411ede05c160e021be840ac252fc10 Mon Sep 17 00:00:00 2001 From: Heather Ward Date: Tue, 21 Nov 2023 15:31:39 -0500 Subject: [PATCH 54/62] Use latest version of wdl-ci --- .github/workflows/lint-test-workflows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-test-workflows.yml b/.github/workflows/lint-test-workflows.yml index f92b45c..19c5675 100644 --- a/.github/workflows/lint-test-workflows.yml +++ b/.github/workflows/lint-test-workflows.yml @@ -11,7 +11,7 @@ jobs: repository: ${{ github.event.pull_request.head.repo.full_name }} ref: ${{ github.event.pull_request.head.ref }} - name: wdl-ci - uses: dnastack/wdl-ci@v0.1.6 + uses: dnastack/wdl-ci@v1.0.0 with: wallet-url: ${{ secrets.WALLET_URL }} wallet-client-id: ${{ secrets.WALLET_CLIENT_ID }} From a88f0c2f183e0744dce332ab3f372f81b3bab810 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Tue, 21 Nov 2023 14:50:12 -0800 Subject: [PATCH 55/62] do not save haplotype bams and output trio merged bam --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 4 +++- workflows/main.wdl | 6 +----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 36633cf..34ab8af 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -116,7 +116,9 @@ workflow de_novo_assembly_trio { Array[Array[File]] assembly_lowQ_beds = flatten(assemble_genome.assembly_lowQ_beds) Array[Array[File]] zipped_assembly_fastas = flatten(assemble_genome.zipped_assembly_fastas) Array[Array[File]] assembly_stats = flatten(assemble_genome.assembly_stats) - Array[Array[IndexData]] asm_bams = flatten(assemble_genome.asm_bams) + + Array[Array[IndexData]] merged_bams = flatten(assemble_genome.merged_bams) + } parameter_meta { diff --git a/workflows/main.wdl b/workflows/main.wdl index 31b2c58..2a68151 100644 --- a/workflows/main.wdl +++ b/workflows/main.wdl @@ -66,11 +66,7 @@ workflow de_novo_assembly { Array[Array[File]?] zipped_assembly_fastas = de_novo_assembly_sample.zipped_assembly_fastas Array[Array[File]?] assembly_stats = de_novo_assembly_sample.assembly_stats - #ORIGINAL - UNSURE OF THIS ONE - #Array[Array[IndexData]?] asm_bam = de_novo_assembly_sample.asm_bams - Array[Array[IndexData]?] asm_bam = de_novo_assembly_sample.asm_bams Array[Array[IndexData]?] 
merged_bams = de_novo_assembly_sample.merged_bams - Array[Array[IndexData]?] paftools_vcf = de_novo_assembly_sample.paftools_vcfs Array[Array[File]?] paftools_vcf_stats = de_novo_assembly_sample.paftools_vcf_stats @@ -80,7 +76,7 @@ workflow de_novo_assembly { Array[Array[File]]? trio_assembly_lowQ_beds = de_novo_assembly_trio.assembly_lowQ_beds Array[Array[File]]? trio_zipped_assembly_fastas = de_novo_assembly_trio.zipped_assembly_fastas Array[Array[File]]? trio_assembly_stats = de_novo_assembly_trio.assembly_stats - Array[Array[IndexData]]? trio_asm_bams = de_novo_assembly_trio.asm_bams + Array[Array[IndexData]]? trio_merged_asm_bams = de_novo_assembly_trio.merged_bams } parameter_meta { From 37b6ffbd5687ff4652cd22506cff202064152427 Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 21 Nov 2023 22:59:08 +0000 Subject: [PATCH 56/62] update wdl-ci config file after successful tests --- wdl-ci.config.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 6a15968..4315946 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -228,7 +228,7 @@ }, "align_hifiasm": { "key": "align_hifiasm", - "digest": "ctgtjbeekxz2xcq42jnuyxhsfnrb52xg", + "digest": "ib3l4i6tdenf5sa3s54yzxfraxhk5u73", "tests": [ { "inputs": { @@ -495,9 +495,9 @@ }, "engine_params": { "f1ed5b40-6a26-4eac-a2b8-9960516e4164": { - "input_file_path": "/coac74908838b5dd7/inputs/small_dataset/chr6.p23", - "resources_file_path": "/coac74908838b5dd7/inputs/wdl-ci/humanassembly", - "datasets_file_path": "/datasetpbrarediseases/dataset" + "input_file_path": "/coac74908838b5dd7/inputs/small_dataset/chr6.p23", + "resources_file_path": "/coac74908838b5dd7/inputs/wdl-ci/humanassembly", + "datasets_file_path": "/datasetpbrarediseases/dataset" }, "abc123": { "input_file_path": "/pbi/collections/appslabht/cromwell_output/testdata/inputs/chr6.p23", @@ -506,4 +506,4 @@ } } } -} +} \ No newline at end of file From 24cfc8d35a880f3705a6a5e6fa48257a6d45cfd1 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Mon, 27 Nov 2023 11:37:48 -0800 Subject: [PATCH 57/62] shorten chm13 name --- backends/aws/inputs.aws.json | 2 +- backends/azure/inputs.azure.json | 2 +- backends/gcp/inputs.gcp.json | 2 +- backends/hpc/inputs.hpc.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backends/aws/inputs.aws.json b/backends/aws/inputs.aws.json index c120e44..bcbe58e 100644 --- a/backends/aws/inputs.aws.json +++ b/backends/aws/inputs.aws.json @@ -22,7 +22,7 @@ } }, { - "name": "chm13v2.0", + "name": "chm13", "fasta": { "data": "s3://dnastack-resources/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta", "data_index": "s3://dnastack-resources/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta.fai" diff --git a/backends/azure/inputs.azure.json b/backends/azure/inputs.azure.json index 7dfd2f6..591fcf7 100644 --- a/backends/azure/inputs.azure.json +++ b/backends/azure/inputs.azure.json @@ -22,7 +22,7 @@ } }, { - "name": "chm13v2.0", + "name": "chm13", "fasta": { "data": "/datasetpbrarediseases/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta", "data_index": "/datasetpbrarediseases/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta.fai" diff --git a/backends/gcp/inputs.gcp.json b/backends/gcp/inputs.gcp.json index 7969ec7..145a9fa 100644 --- a/backends/gcp/inputs.gcp.json +++ b/backends/gcp/inputs.gcp.json @@ -22,7 +22,7 @@ } }, { - "name": "chm13v2.0", + "name": "chm13", "fasta": { "data": 
"gs://pacbio-wdl/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta", "data_index": "gs://pacbio-wdl/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta.fai" diff --git a/backends/hpc/inputs.hpc.json b/backends/hpc/inputs.hpc.json index be3118f..bc4ea22 100644 --- a/backends/hpc/inputs.hpc.json +++ b/backends/hpc/inputs.hpc.json @@ -22,7 +22,7 @@ } }, { - "name": "chm13v2.0", + "name": "chm13", "fasta": { "data": "/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta", "data_index": " Date: Mon, 27 Nov 2023 13:34:38 -0800 Subject: [PATCH 58/62] update backends documentation --- .dockstore.yml | 4 ++-- backends/README.md | 5 ----- backends/aws/README.md | 7 +------ backends/aws/inputs.aws.json | 5 +++-- backends/azure/README.md | 5 ----- backends/azure/inputs.azure.json | 5 +++-- backends/gcp/README.md | 5 ----- backends/gcp/inputs.gcp.json | 5 +++-- backends/hpc/README.md | 14 +++++--------- backends/hpc/inputs.hpc.json | 5 +++-- 10 files changed, 20 insertions(+), 40 deletions(-) delete mode 100644 backends/README.md diff --git a/.dockstore.yml b/.dockstore.yml index 81717f9..6629a6e 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -64,5 +64,5 @@ workflows: # https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/nio/file/FileSystem.html#getPathMatcher(java.lang.String) # or RegEx when the string is surrounded by / (Ex: `/develop/`, `/myworkflow\/.*/`). filters: - branches: [ /develop/ ] - tags: [ /v.*/ ] + branches: [ /(.*)?dockstore/ ] + tags: [ /v.*dockstore/ ] \ No newline at end of file diff --git a/backends/README.md b/backends/README.md deleted file mode 100644 index 17505f4..0000000 --- a/backends/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Backend and engine-specific configuration - -Example configuration specific to each backend is provided here. - -For detailed instructions on configuring engines in different backends, see the [Workbench documentation for configuring engines](https://docs.dnastack.com/docs/workbench-settings). diff --git a/backends/aws/README.md b/backends/aws/README.md index ca609a9..c07e5d5 100644 --- a/backends/aws/README.md +++ b/backends/aws/README.md @@ -1,6 +1,6 @@ # Configuring the Amazon Genomics CLI -The Amazon Genomics CLI (`agc`) allows users to orchestrate workflow execution using AWS Batch. See the [Workbench documentation](https://docs.dnastack.com/docs/cromwell-on-aws-amazon-genomics-cli) for information on installing and using the `agc` to configure and run workflows. The following section provides additional information on deploying a project using the `agc`. +The Amazon Genomics CLI (`agc`) allows users to orchestrate workflow execution using AWS Batch. See the [documentation](https://docs.dnastack.com/docs/cromwell-on-aws-amazon-genomics-cli) for information on installing and using the `agc` to configure and run workflows. The following section provides additional information on deploying a project using the `agc`. ## Deploying a context with `agc` @@ -105,11 +105,6 @@ From the directory where your `agc-project.yaml` is located, run: The running workflow can be monitored via [`agc workflow` commands](https://aws.github.io/amazon-genomics-cli/docs/reference/agc_workflow/), or via the AWS console. -### Running via Workbench - -1. [Register the engine in Workbench](https://docs.dnastack.com/docs/connecting-to-a-workflow-engine) -2. 
[Follow the instructions in the README to run the workflow via Workbench](../../README.md#run-using-workbench) - # Reference data hosted in AWS AWS reference data is hosted in the `us-west-2` region in the bucket `s3://dnastack-resources`. diff --git a/backends/aws/inputs.aws.json b/backends/aws/inputs.aws.json index bcbe58e..c3530ae 100644 --- a/backends/aws/inputs.aws.json +++ b/backends/aws/inputs.aws.json @@ -4,8 +4,9 @@ "samples": [ { "sample_id": "String", - "movie_bams": "Array[File]", - "sex": "String?", + "movie_bams": [ + "File" + ], "father_id": "String?", "mother_id": "String?", "run_de_novo_assembly": "Boolean" diff --git a/backends/azure/README.md b/backends/azure/README.md index 05c9ca6..0adce10 100644 --- a/backends/azure/README.md +++ b/backends/azure/README.md @@ -16,11 +16,6 @@ See [the inputs section of the main README](../../README.md#workflow-inputs) for ## Running the workflow -### Running via Workbench - -1. [Register the engine in Workbench](https://docs.dnastack.com/docs/connecting-to-a-workflow-engine) -2. [Follow the instructions in the README to run the workflow via Workbench](../../README.md#run-using-workbench) - # Reference data hosted in Azure To use Azure reference data, add the following line to your `containers-to-mount` file in your Cromwell on Azure installation ([more info here](https://github.com/microsoft/CromwellOnAzure/blob/develop/docs/troubleshooting-guide.md#use-input-data-files-from-an-existing-azure-storage-account-that-my-lab-or-team-is-currently-using)): diff --git a/backends/azure/inputs.azure.json b/backends/azure/inputs.azure.json index 591fcf7..dcfee99 100644 --- a/backends/azure/inputs.azure.json +++ b/backends/azure/inputs.azure.json @@ -4,8 +4,9 @@ "samples": [ { "sample_id": "String", - "movie_bams": "Array[File]", - "sex": "String?", + "movie_bams": [ + "File" + ], "father_id": "String?", "mother_id": "String?", "run_de_novo_assembly": "Boolean" diff --git a/backends/gcp/README.md b/backends/gcp/README.md index 7b8ffb7..1d2e840 100644 --- a/backends/gcp/README.md +++ b/backends/gcp/README.md @@ -22,11 +22,6 @@ For example, the zones in region us-central1 are `"us-central1-a us-central1-b u ## Running the workflow -### Running via Workbench - -1. [Register the engine in Workbench](https://docs.dnastack.com/docs/connecting-to-a-workflow-engine) -2. [Follow the instructions in the README to run the workflow via Workbench](../../README.md#run-using-workbench) - # Reference data hosted in GCP GCP reference data is hosted in the `us-west1` region in the bucket `gs://pacbio-wdl`. This bucket is requester-pays, meaning that users will need to [provide a billing project in their Cromwell configuration](https://cromwell.readthedocs.io/en/stable/filesystems/GoogleCloudStorage/) in order to use files located in this bucket. diff --git a/backends/gcp/inputs.gcp.json b/backends/gcp/inputs.gcp.json index 145a9fa..be7ff33 100644 --- a/backends/gcp/inputs.gcp.json +++ b/backends/gcp/inputs.gcp.json @@ -4,8 +4,9 @@ "samples": [ { "sample_id": "String", - "movie_bams": "Array[File]", - "sex": "String?", + "movie_bams": [ + "File" + ], "father_id": "String?", "mother_id": "String?", "run_de_novo_assembly": "Boolean" diff --git a/backends/hpc/README.md b/backends/hpc/README.md index d3ab650..850925a 100644 --- a/backends/hpc/README.md +++ b/backends/hpc/README.md @@ -33,21 +33,17 @@ See [the inputs section of the main README](../../README.md#workflow-inputs) for `cromwell run workflows/main.wdl -i ` -### Running via Workbench - -1. 
[Register the engine in Workbench](https://docs.dnastack.com/docs/connecting-to-a-workflow-engine)
-2. [Follow the instructions in the README to run the workflow via Workbench](../../README.md#run-using-workbench)
-
 # Reference data bundle
 
-![https://doi.org/10.5281/zenodo.7922357](https://zenodo.org/badge/DOI/10.5281/zenodo.7922357.svg)
+10.5281/zenodo.10059671
+![https://doi.org/10.5281/zenodo.10059671](https://zenodo.org/badge/DOI/10.5281/zenodo.10059671.svg)
 
-Reference data is hosted on Zenodo at [10.5281/zenodo.7922357](https://zenodo.org/record/7922357). Download the reference data bundle and extract it to a location on your HPC, then update the input template file with the path to the reference data.
+Reference data is hosted on Zenodo at [10.5281/zenodo.10059671](https://zenodo.org/record/10059671). Download the reference data bundle and extract it to a location on your HPC, then update the input template file with the path to the reference data.
 
 ```bash
 # download the reference data bundle
-wget https://zenodo.org/record/7922357/files/wdl-humanwgs.v1.0.0.resources.tgz
+wget https://zenodo.org/record/10059671/files/wdl-humanassembly.v0.9.0.resource.tgz
 
 # extract the reference data bundle and rename as dataset
-tar -xzf wdl-humanwgs.v1.0.0.resources.tgz && mv static_resources dataset
+tar -xzf wdl-humanassembly.v0.9.0.resource.tgz && mv static_resources dataset
 ```
diff --git a/backends/hpc/inputs.hpc.json b/backends/hpc/inputs.hpc.json
index bc4ea22..5509a0e 100644
--- a/backends/hpc/inputs.hpc.json
+++ b/backends/hpc/inputs.hpc.json
@@ -4,8 +4,9 @@
   "samples": [
     {
       "sample_id": "String",
-      "movie_bams": "Array[File]",
-      "sex": "String?",
+      "movie_bams": [
+        "File"
+      ],
       "father_id": "String?",
       "mother_id": "String?",
       "run_de_novo_assembly": "Boolean"

From 12c7f4299e3f62324e6ed28171814326b5af8970 Mon Sep 17 00:00:00 2001
From: gconcepcion 
Date: Mon, 27 Nov 2023 17:06:27 -0800
Subject: [PATCH 59/62] variant calling for trio asm

---
 workflows/assemble_genome/assemble_genome.wdl | 50 +++++++++++++++++
 .../de_novo_assembly_sample.wdl | 56 +------------------
 .../de_novo_assembly_trio.wdl | 42 +++++++++++++-
 workflows/main.wdl | 5 ++
 4 files changed, 98 insertions(+), 55 deletions(-)

diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl
index ce134b1..d70fc84 100644
--- a/workflows/assemble_genome/assemble_genome.wdl
+++ b/workflows/assemble_genome/assemble_genome.wdl
@@ -321,3 +321,53 @@ task merge_haps {
 		zones: runtime_attributes.zones
 	}
 }
+
+task paftools {
+	input {
+		File bam
+		File bam_index
+
+		File reference
+
+		String sample
+
+		RuntimeAttributes runtime_attributes
+	}
+
+	String bam_basename = basename(bam, ".bam")
+	Int threads = 2
+	Int disk_size = ceil((size(bam, "GB") + size(reference, "GB")) * 3 + 20)
+	Int mem_gb = threads * 8
+
+	command <<<
+		set -euo pipefail
+
+		samtools view -h ~{bam} | \
+		k8 /opt/minimap2-2.17/misc/paftools.js sam2paf - | \
+		sort -k6,6 -k8,8n | \
+		k8 /opt/minimap2-2.17/misc/paftools.js call \
+			-L5000 \
+			-f ~{reference} \
+			-s ~{sample} \
+			- \
+			> ~{bam_basename}.paftools.vcf
+
+	>>>
+
+	output {
+		File paftools_vcf = "~{bam_basename}.paftools.vcf"
+	}
+
+	runtime {
+		docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:0e8ad680b0e89376eb94fa8daa1a0269a4abe695ba39523a5c56a59d5c0e3953"
+		cpu: threads
+		memory: mem_gb + " GB"
+		disk: disk_size + " GB"
+		disks: "local-disk " + disk_size + " HDD"
+		preemptible: runtime_attributes.preemptible_tries
+		maxRetries: 
runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} \ No newline at end of file diff --git a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl index b5ecf84..1891ad5 100644 --- a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl +++ b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl @@ -41,7 +41,7 @@ workflow de_novo_assembly_sample { ReferenceData ref = aln.left IndexData bam = aln.right - call paftools { + call AssembleGenome.paftools { input: bam = bam.data, sample = sample.sample_id, @@ -79,9 +79,7 @@ workflow de_novo_assembly_sample { Array[File] zipped_assembly_fastas = assemble_genome.zipped_assembly_fastas Array[File] assembly_stats = assemble_genome.assembly_stats Array[IndexData] merged_bams = assemble_genome.merged_bams - Array[IndexData] asm_bams = assemble_genome.asm_bams - # Array[File] paftools_vcfs = paftools.paftools_vcf - + #Array[IndexData] asm_bams = assemble_genome.asm_bams Array[IndexData] paftools_vcfs = paftools_vcf Array[File] paftools_vcf_stats = bcftools_stats.stats @@ -93,54 +91,4 @@ workflow de_novo_assembly_sample { default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"} } -} - -task paftools { - input { - File bam - File bam_index - - File reference - - String sample - - RuntimeAttributes runtime_attributes - } - - String bam_basename = basename(bam, ".bam") - Int threads = 2 - Int disk_size = ceil((size(bam, "GB") + size(reference, "GB")) * 3 + 20) - Int mem_gb = threads * 8 - - command <<< - set -euo pipefail - - samtools view -h ~{bam} | \ - k8 /opt/minimap2-2.17/misc/paftools.js sam2paf - | \ - sort -k6,6 -k8,8n | \ - k8 /opt/minimap2-2.17/misc/paftools.js call \ - -L5000 \ - -f ~{reference} \ - -s ~{sample} \ - - \ - > ~{bam_basename}.paftools.vcf - - >>> - - output { - File paftools_vcf = "~{bam_basename}.paftools.vcf" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:0e8ad680b0e89376eb94fa8daa1a0269a4abe695ba39523a5c56a59d5c0e3953" - cpu: threads - memory: mem_gb + " GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } } \ No newline at end of file diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 34ab8af..b11e5a0 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -6,6 +6,8 @@ version 1.0 import "../assembly_structs.wdl" import "../wdl-common/wdl/tasks/samtools_fasta.wdl" as SamtoolsFasta import "../assemble_genome/assemble_genome.wdl" as AssembleGenome +import "../wdl-common/wdl/tasks/zip_index_vcf.wdl" as ZipIndexVcf +import "../wdl-common/wdl/tasks/bcftools_stats.wdl" as BcftoolsStats workflow de_novo_assembly_trio { input { @@ -107,6 +109,42 @@ workflow de_novo_assembly_trio { default_runtime_attributes = default_runtime_attributes, on_demand_runtime_attributes = on_demand_runtime_attributes } + + scatter (aln 
in assemble_genome.alignments) { + ReferenceData ref = aln.left + IndexData bam = aln.right + + call AssembleGenome.paftools { + input: + bam = bam.data, + sample = child.sample_id, + bam_index = bam.data_index, + reference = ref.fasta.data, + runtime_attributes = default_runtime_attributes + } + + + call ZipIndexVcf.zip_index_vcf { + input: + vcf = paftools.paftools_vcf, + runtime_attributes = default_runtime_attributes + } + + IndexData paftools_vcf = { + "data": zip_index_vcf.zipped_vcf, + "data_index": zip_index_vcf.zipped_vcf_index + } + + call BcftoolsStats.bcftools_stats { + input: + vcf = zip_index_vcf.zipped_vcf, + params = "--samples ~{child.sample_id}", + reference = ref.fasta.data, + runtime_attributes = default_runtime_attributes + } + + } + } } @@ -118,7 +156,9 @@ workflow de_novo_assembly_trio { Array[Array[File]] assembly_stats = flatten(assemble_genome.assembly_stats) Array[Array[IndexData]] merged_bams = flatten(assemble_genome.merged_bams) - + Array[Array[IndexData]] paftools_vcfs = flatten(paftools_vcf) + Array[Array[File]] paftools_vcf_stats = flatten(bcftools_stats.stats) + } parameter_meta { diff --git a/workflows/main.wdl b/workflows/main.wdl index 2a68151..54cb3f9 100644 --- a/workflows/main.wdl +++ b/workflows/main.wdl @@ -77,6 +77,11 @@ workflow de_novo_assembly { Array[Array[File]]? trio_zipped_assembly_fastas = de_novo_assembly_trio.zipped_assembly_fastas Array[Array[File]]? trio_assembly_stats = de_novo_assembly_trio.assembly_stats Array[Array[IndexData]]? trio_merged_asm_bams = de_novo_assembly_trio.merged_bams + + Array[Array[IndexData]]? trio_paftools_vcf = de_novo_assembly_trio.paftools_vcfs + Array[Array[File]]? trio_paftools_vcf_stats = de_novo_assembly_trio.paftools_vcf_stats + + } parameter_meta { From 389d8ff42b44621737e996288ea0e90e9806b921 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Tue, 28 Nov 2023 10:45:18 -0800 Subject: [PATCH 60/62] cleanup README --- README.md | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a453820..1adc42f 100644 --- a/README.md +++ b/README.md @@ -33,9 +33,8 @@ Reference datasets are hosted publicly for use in the pipeline. For data locatio 1. [Select a backend environment](#selecting-a-backend) 2. [Configure a workflow execution engine in the chosen environment](#configuring-a-workflow-engine) -3. [Optional] [Register the engine in Workbench](#registering-a-workflow-engine-in-workbench) -4. [Fill out the inputs JSON file for your cohort](#filling-out-the-inputs-json) -5. [Run the workflow](#running-the-workflow-1) +3. [Fill out the inputs JSON file for your cohort](#filling-out-the-inputs-json) +4. [Run the workflow](#running-the-workflow-1) ## Selecting a backend @@ -128,21 +127,20 @@ Sample information for each sample in the workflow run. | :- | :- | :- | :- | | String | sample_id | A unique name for the sample; used to name outputs | | | Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | movie_bams | The set of unaligned movie BAMs associated with this sample | | -| String? | sex | Sample sex | ["MALE", "FEMALE", "."]. "." or `null` will set sex to unknown. | | String? | father_id | Paternal `sample_id` | | | String? | mother_id | Maternal `sample_id` | | | Boolean | run_de_novo_assembly | If true, run single-sample _de novo_ assembly for this sample | \[true, false\] | ## [ReferenceData](workflows/humanwgs_structs.wdl) -Files associated with the reference genome. 
+Array of references and their associated names and indices. These files are hosted publicly in each of the cloud backends; see `backends/${backend}/inputs.${backend}.json`.
 
 | Type | Name | Description | Notes |
 | :- | :- | :- | :- |
 | String | name | Reference name; used to name outputs (e.g., "GRCh38") | |
-| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | fastas | Reference genomes and associatedindex | |
+| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl) | fasta | Reference genome and associated index | |
 
 ## Other inputs
 
@@ -168,8 +166,8 @@ These files will be output if `cohort.samples[sample]` is set to `true` for any
 | Array[Array[File]?] | assembly_lowQ_beds | Coordinates of low quality regions in BED format. | |
 | Array[Array[File]?] | assembly_stats | Assembly size and NG50 stats generated by [calN50](https://github.com/lh3/calN50). | |
 | Array[Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?]] | asm_bam | [minimap2](https://github.com/lh3/minimap2) alignment of assembly to reference. | |
-| Array[Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?]] | htsbox_vcf | Naive pileup variant calling of assembly against reference with [`htsbox`](https://github.com/lh3/htsbox) | |
-| Array[Array[File?]] | htsbox_vcf_stats | [`bcftools stats`](https://samtools.github.io/bcftools/bcftools.html#stats) summary statistics for `htsbox` variant calls | |
+| Array[Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?]] | paftools_vcf | Variants called by [`paftools call`](https://github.com/lh3/minimap2/blob/master/misc/README.md#calling-variants-from-haploid-assemblies) from the coordinate-sorted assembly-to-reference alignment; variants are derived from the cs tag, and confident/callable regions are those covered by exactly one contig | |
+| Array[Array[File?]] | paftools_vcf_stats | [`bcftools stats`](https://samtools.github.io/bcftools/bcftools.html#stats) summary statistics for `paftools` variant calls | |
 
 ## De novo assembly - trio
 
@@ -197,8 +195,9 @@ The Docker image used by a particular step of the workflow can be identified by
 | bcftools |
  • [bcftools 1.14](https://github.com/samtools/bcftools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/bcftools) | | gfatools |
  • [gfatools 0.4](https://github.com/lh3/gfatools/releases/tag/v0.4)
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
  • [k8 0.2.5](https://github.com/attractivechaos/k8/releases/tag/0.2.5)
  • [caln50 01091f2](https://github.com/lh3/calN50/tree/01091f25bc24e17fbf0da3407ea24aa448c489ae)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/gfatools) | | hifiasm |
  • [hifiasm 0.19.4](https://github.com/chhylp123/hifiasm/releases/tag/0.19.4)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/hifiasm) | -| htsbox |
  • [htsbox r346 (6964440)](https://github.com/lh3/htsbox/tree/6964440d791a60a22ca5ff25dc413a362bdc0abe)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/htsbox) | | htslib |
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/htslib) | -| parse-cohort |
  • python 3.8.10; custom scripts
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/parse-cohort) | +| paftools |
  • [minimap2 2.17](https://github.com/lh3/minimap2/releases/tag/v2.17)
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/align_hifiasm) | +| parse-cohort |
  • python 3.8.10; custom scripts
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/ +987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/parse-cohort) | | samtools |
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/samtools) | | yak |
  • [yak 0.1](https://github.com/lh3/yak/releases/tag/v0.1)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/yak) | From 13ef8d7c88f06421a25d9cbda2bd665aa6d66861 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Tue, 28 Nov 2023 14:43:43 -0800 Subject: [PATCH 61/62] paftools print version and minor doc fixes before merge --- README.md | 18 +++++++++--------- workflows/assemble_genome/assemble_genome.wdl | 2 ++ .../de_novo_assembly_sample.wdl | 1 - workflows/main.wdl | 4 ++-- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 1adc42f..4d13c30 100644 --- a/README.md +++ b/README.md @@ -191,13 +191,13 @@ The Docker image used by a particular step of the workflow can be identified by | Image | Major tool versions | Links | | :- | :- | :- | -| align_hifiasm |
  • [minimap2 2.17](https://github.com/lh3/minimap2/releases/tag/v2.17)
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/align_hifiasm) | -| bcftools |
  • [bcftools 1.14](https://github.com/samtools/bcftools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/bcftools) | -| gfatools |
  • [gfatools 0.4](https://github.com/lh3/gfatools/releases/tag/v0.4)
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
  • [k8 0.2.5](https://github.com/attractivechaos/k8/releases/tag/0.2.5)
  • [caln50 01091f2](https://github.com/lh3/calN50/tree/01091f25bc24e17fbf0da3407ea24aa448c489ae)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/gfatools) | -| hifiasm |
  • [hifiasm 0.19.4](https://github.com/chhylp123/hifiasm/releases/tag/0.19.4)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/hifiasm) | -| htslib |
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/htslib) | -| paftools |
  • [minimap2 2.17](https://github.com/lh3/minimap2/releases/tag/v2.17)
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/align_hifiasm) | +| align_hifiasm |
  • [minimap2 2.17](https://github.com/lh3/minimap2/releases/tag/v2.17)
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/align_hifiasm) | +| bcftools |
  • [bcftools 1.14](https://github.com/samtools/bcftools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/bcftools) | +| gfatools |
  • [gfatools 0.4](https://github.com/lh3/gfatools/releases/tag/v0.4)
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
  • [k8 0.2.5](https://github.com/attractivechaos/k8/releases/tag/0.2.5)
  • [caln50 01091f2](https://github.com/lh3/calN50/tree/01091f25bc24e17fbf0da3407ea24aa448c489ae)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/gfatools) | +| hifiasm |
  • [hifiasm 0.19.4](https://github.com/chhylp123/hifiasm/releases/tag/0.19.4)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/hifiasm) | +| htslib |
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/htslib) | +| paftools | [paftools 2.26-r1182-dirty](https://github.com/lh3/minimap2/blob/bc588c0eeb26426d0d90a93fb0877358a389c515/misc/paftools.js) | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/align_hifiasm) | | parse-cohort |
  • python 3.8.10; custom scripts
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/ -987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/parse-cohort) | -| samtools |
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/samtools) | -| yak |
  • [yak 0.1](https://github.com/lh3/yak/releases/tag/v0.1)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/yak) | +3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/parse-cohort) | +| samtools |
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/samtools) | +| yak |
  • [yak 0.1](https://github.com/lh3/yak/releases/tag/v0.1)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/yak) | diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index d70fc84..4e42ef6 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -342,6 +342,8 @@ task paftools { command <<< set -euo pipefail + k8 /opt/minimap2-2.17/misc/paftools.js version + samtools view -h ~{bam} | \ k8 /opt/minimap2-2.17/misc/paftools.js sam2paf - | \ sort -k6,6 -k8,8n | \ diff --git a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl index 1891ad5..bae1906 100644 --- a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl +++ b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl @@ -79,7 +79,6 @@ workflow de_novo_assembly_sample { Array[File] zipped_assembly_fastas = assemble_genome.zipped_assembly_fastas Array[File] assembly_stats = assemble_genome.assembly_stats Array[IndexData] merged_bams = assemble_genome.merged_bams - #Array[IndexData] asm_bams = assemble_genome.asm_bams Array[IndexData] paftools_vcfs = paftools_vcf Array[File] paftools_vcf_stats = bcftools_stats.stats diff --git a/workflows/main.wdl b/workflows/main.wdl index 54cb3f9..d690372 100644 --- a/workflows/main.wdl +++ b/workflows/main.wdl @@ -86,8 +86,8 @@ workflow de_novo_assembly { parameter_meta { cohort: {help: "Sample information for the cohort"} - reference: {help: "Reference genome data"} - backend: {help: "Backend where the workflow will be executed ['GCP', 'Azure', 'AWS']"} + reference: {help: "Array of Reference genome data"} + backend: {help: "Backend where the workflow will be executed ['GCP', 'Azure', 'AWS', 'HPC']"} zones: {help: "Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'"} aws_spot_queue_arn: {help: "Queue ARN for the spot batch queue; required if backend is set to 'AWS'"} aws_on_demand_queue_arn: {help: "Queue ARN for the on demand batch queue; required if backend is set to 'AWS'"} From b87d85be3e21185a81d711d582a37128cc920f81 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Tue, 28 Nov 2023 14:59:31 -0800 Subject: [PATCH 62/62] ... --- README.md | 5 ++--- workflows/main.wdl | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4d13c30..89a557e 100644 --- a/README.md +++ b/README.md @@ -196,8 +196,7 @@ The Docker image used by a particular step of the workflow can be identified by | gfatools |
  • [gfatools 0.4](https://github.com/lh3/gfatools/releases/tag/v0.4)
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
  • [k8 0.2.5](https://github.com/attractivechaos/k8/releases/tag/0.2.5)
  • [caln50 01091f2](https://github.com/lh3/calN50/tree/01091f25bc24e17fbf0da3407ea24aa448c489ae)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/gfatools) | | hifiasm |
  • [hifiasm 0.19.4](https://github.com/chhylp123/hifiasm/releases/tag/0.19.4)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/hifiasm) | | htslib |
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/htslib) | -| paftools | [paftools 2.26-r1182-dirty](https://github.com/lh3/minimap2/blob/bc588c0eeb26426d0d90a93fb0877358a389c515/misc/paftools.js) | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/align_hifiasm) | -| parse-cohort |
  • python 3.8.10; custom scripts
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/ -3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/parse-cohort) | +| paftools |
  • [paftools 2.26-r1182-dirty](https://github.com/lh3/minimap2/blob/bc588c0eeb26426d0d90a93fb0877358a389c515/misc/paftools.js)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/align_hifiasm) | +| parse-cohort |
  • python 3.8.10; custom scripts
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/parse-cohort) | | samtools |
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/samtools) | | yak |
  • [yak 0.1](https://github.com/lh3/yak/releases/tag/v0.1)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/yak) | diff --git a/workflows/main.wdl b/workflows/main.wdl index d690372..29f475e 100644 --- a/workflows/main.wdl +++ b/workflows/main.wdl @@ -86,7 +86,7 @@ workflow de_novo_assembly { parameter_meta { cohort: {help: "Sample information for the cohort"} - reference: {help: "Array of Reference genome data"} + references: {help: "Array of Reference genome data"} backend: {help: "Backend where the workflow will be executed ['GCP', 'Azure', 'AWS', 'HPC']"} zones: {help: "Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'"} aws_spot_queue_arn: {help: "Queue ARN for the spot batch queue; required if backend is set to 'AWS'"}
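For reference, the assembly-to-reference variant-calling pipeline that this series introduces in [PATCH 59/62] (and instruments with a version print in [PATCH 61/62]) can be exercised by hand, outside of WDL, roughly as follows. This is a minimal sketch, assuming `samtools`, the `k8` JavaScript shell, and minimap2's `paftools.js` are available on `PATH` (the workflow's container image installs it at `/opt/minimap2-2.17/misc/paftools.js`); the file names `asm.GRCh38.bam`/`GRCh38.fasta` and the sample name `HG002` are illustrative placeholders, not values taken from the workflow.

```bash
#!/usr/bin/env bash
set -euo pipefail

# Print the paftools version, mirroring the version print added in PATCH 61.
k8 paftools.js version

# Convert the assembly-to-reference BAM to PAF (the SAM header from -h supplies
# sequence lengths), sort by target sequence name (column 6) and target start
# (column 8) as paftools call requires, then call variants from the cs tag.
# -L5000 restricts variant calling to alignments of at least 5 kb, as in the
# WDL task added by this series.
samtools view -h asm.GRCh38.bam \
	| k8 paftools.js sam2paf - \
	| sort -k6,6 -k8,8n \
	| k8 paftools.js call \
		-L5000 \
		-f GRCh38.fasta \
		-s HG002 \
		- \
	> asm.GRCh38.paftools.vcf
```

The `paftools` WDL task wraps exactly this pipeline, with `~{bam}`, `~{reference}`, and `~{sample}` substituted by the engine at runtime; the sample and trio workflows then bgzip and index the resulting VCF and summarize it with `bcftools stats`.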