diff --git a/.dockstore.yml b/.dockstore.yml new file mode 100644 index 0000000..6629a6e --- /dev/null +++ b/.dockstore.yml @@ -0,0 +1,68 @@ +# The first line refers to the version 1.2 of the .dockstore.yml schema +version: 1.2 + +# An array of workflows. Each element corresponds to a workflow on Dockstore. +workflows: + + # The optional workflow name for a workflow, which may only consist of alphanumerics + # and internal underscores and hyphens, but no spaces or other characters. Names may not exceed 256 characters. + # If using a .dockstore.yml with multiple workflows, this field is required + # to uniquely identify workflows in the repository. + # + # It should be noted that having the name come first is an arbitrary decision. + # You could use subclass instead, for instance. Provided arrays are not broken + # up, the order of fields within a .dockstore.yml is not important. + - name: wdl-humanassembly + + # The descriptor language used for the workflow. CWL, WDL, NFL (Nextflow), or GALAXY. + # This cannot be changed once the workflow is registered. + subclass: WDL + + # Workflow-wide setting that will affect ALL branches/tags; only set this as needed in a main branch. + # Set to true to publish an unpublished workflow, or false to unpublish a published workflow. + # Omitting the publish setting leaves the publish-state unchanged (recommended for all non-primary branches). + # publish: + + # The absolute path to the primary descriptor file in the Git repository. + # - For CWL, the primary descriptor is a .cwl file. + # - For WDL, the primary descriptor is a .wdl file. + # - For Galaxy, the primary descriptor is a .ga file. + # - Nextflow differs from these as the primary descriptor is a nextflow.config file. + primaryDescriptorPath: /workflows/main.wdl + + # An optional array of absolute paths to test parameter files in the Git repository. + # For example... + # testParameterFiles: + # - /null-model/null-model.json + # - /null-model/null-model-binary.json + # testParameterFiles: + + # An optional path to a workflow-specific readme in the Git repository. If not provided, Dockstore will show + # the readme.md present at the root of the Git repository if it is present. + # If you have multiple workflows in a single Git repository, it is recommend to give each one a readme. + readMePath: /README.md + + # An optional array of authorship information. + # Note that if orcid is present, then all other fields will be ignored, as information will be taken from orcid. + # If orcid is not present, make sure to at a minimum include the name field for each author. + authors: + - orcid: 0000-0001-5921-2022 # Juniper Lake + - orcid: 0000-0001-7628-5645 # Gregory Concepcion + - orcid: 0000-0002-7422-1194 # William Rowell + - orcid: 0000-0002-5507-0896 # Heather Ward + - orcid: 0009-0001-0205-4614 # Karen Fang + + # A boolean that will change the default version to be displayed on Dockstore. Default: False. + # A value of true will automatically display the latest tag updated as default. + # A value of false will retain the default version that has been specified via the Dockstore UI. + latestTagAsDefault: False + + # The optional filters section allow specifying sets of Git branches and tags to include for the workflow. + # If no filters are given, all branches and tags are included. + # Branches and tags are arrays of pattern-strings. 
+ # Pattern-strings use Unix-style Glob syntax by default (Ex: `develop`, `myworkflow/**`) + # https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/nio/file/FileSystem.html#getPathMatcher(java.lang.String) + # or RegEx when the string is surrounded by / (Ex: `/develop/`, `/myworkflow\/.*/`). + filters: + branches: [ /(.*)?dockstore/ ] + tags: [ /v.*dockstore/ ] \ No newline at end of file diff --git a/.github/workflows/lint-test-workflows.yml b/.github/workflows/lint-test-workflows.yml index f92b45c..19c5675 100644 --- a/.github/workflows/lint-test-workflows.yml +++ b/.github/workflows/lint-test-workflows.yml @@ -11,7 +11,7 @@ jobs: repository: ${{ github.event.pull_request.head.repo.full_name }} ref: ${{ github.event.pull_request.head.ref }} - name: wdl-ci - uses: dnastack/wdl-ci@v0.1.6 + uses: dnastack/wdl-ci@v1.0.0 with: wallet-url: ${{ secrets.WALLET_URL }} wallet-client-id: ${{ secrets.WALLET_CLIENT_ID }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..13d7372 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +inputs.test_data*.json +.wdltest* +dependencies.zip diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..aaea0c1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,34 @@ +Copyright (c) 2023, Pacific Biosciences of California, Inc. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted (subject to the limitations in the +disclaimer below) provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Pacific Biosciences nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. diff --git a/README.md b/README.md index e28cd5a..89a557e 100644 --- a/README.md +++ b/README.md @@ -1,56 +1,114 @@ +# DISCLAIMER + +TO THE GREATEST EXTENT PERMITTED BY APPLICABLE LAW, THIS WEBSITE AND ITS CONTENT, INCLUDING ALL SOFTWARE, SOFTWARE CODE, SITE-RELATED SERVICES, AND DATA, ARE PROVIDED "AS IS," WITH ALL FAULTS, WITH NO REPRESENTATIONS OR WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, SATISFACTORY QUALITY, NON-INFRINGEMENT OR FITNESS FOR A PARTICULAR PURPOSE. ALL WARRANTIES ARE REJECTED AND DISCLAIMED. 
YOU ASSUME TOTAL RESPONSIBILITY AND RISK FOR YOUR USE OF THE FOREGOING. PACBIO IS NOT OBLIGATED TO PROVIDE ANY SUPPORT FOR ANY OF THE FOREGOING, AND ANY SUPPORT PACBIO DOES PROVIDE IS SIMILARLY PROVIDED WITHOUT REPRESENTATION OR WARRANTY OF ANY KIND. NO ORAL OR WRITTEN INFORMATION OR ADVICE SHALL CREATE A REPRESENTATION OR WARRANTY OF ANY KIND. ANY REFERENCES TO SPECIFIC PRODUCTS OR SERVICES ON THE WEBSITES DO NOT CONSTITUTE OR IMPLY A RECOMMENDATION OR ENDORSEMENT BY PACBIO. + # wdl-humanassembly Workflow for running de novo assembly using human PacBio whole genome sequencing (WGS) data. Written using [Workflow Description Language (WDL)](https://openwdl.org/). - Docker images used by these workflows are defined [here](https://github.com/PacificBiosciences/wdl-dockerfiles). - - Common tasks that may be reused within or between workflows are defined [here](https://github.com/PacificBiosciences/wdl-common). # Workflow -The assembly workflow performs _de novo_ assembly on samples and trios. - **Workflow entrypoint**: [workflows/main.wdl](workflows/main.wdl) -- [Blank input template file](workflows/inputs.json) -- [Azure-based inputs](workflows/inputs.azure.json) -- [AWS-based inputs](workflows/inputs.aws.json) -- [GCP-based inputs]((workflows/inputs.gcp.json)) +The assembly workflow performs _de novo_ assembly on samples and trios. ![De novo assembly workflow diagram](workflows/main.graphviz.svg "De novo assembly workflow diagram") -# Reference datasets and associated workflow files +## Setup -Reference datasets are hosted publicly for use in the pipeline. For data locations, see `workflows/inputs.${backend}.json`. +Some tasks and workflows are pulled in from other repositories. Ensure you have initialized submodules following cloning by running `git submodule update --init --recursive`. -## Reference data hosted in Azure +## Resource requirements -To use Azure reference data, add the following line to your `containers-to-mount` file in your Cromwell on Azure installation ([more info here](https://github.com/microsoft/CromwellOnAzure/blob/develop/docs/troubleshooting-guide.md#use-input-data-files-from-an-existing-azure-storage-account-that-my-lab-or-team-is-currently-using)): +The workflow requires at minimum 48 cores and 288 GB of RAM. Ensure that the backend environment you're using has enough quota to run the workflow. -`https://datasetpbrarediseases.blob.core.windows.net/dataset?si=public&spr=https&sv=2021-06-08&sr=c&sig=o6OkcqWWlGcGOOr8I8gCA%2BJwlpA%2FYsRz0DMB8CCtCJk%3D` +## Reference datasets and associated workflow files -The [Azure input file template](workflows/inputs.azure.json) has paths to the reference files in this blob storage prefilled. +Reference datasets are hosted publicly for use in the pipeline. For data locations, see the [backend-specific documentation](backends/) and template inputs files for each backend with paths to publicly hosted reference files filled out. -## Reference data hosted in AWS +# Running the workflow -AWS reference data is hosted in the `us-west-2` region in the bucket `s3://dnastack-resources`. +1. [Select a backend environment](#selecting-a-backend) +2. [Configure a workflow execution engine in the chosen environment](#configuring-a-workflow-engine) +3. [Fill out the inputs JSON file for your cohort](#filling-out-the-inputs-json) +4. 
[Run the workflow](#running-the-workflow-1) -To use AWS reference data, add the following line to the data section of your [`agc-project.yaml`](https://aws.github.io/amazon-genomics-cli/docs/concepts/projects/): +## Selecting a backend -```yaml -data: - - location: s3://dnastack-resources - readOnly: true -``` +The workflow can be run on Azure, AWS, GCP, or HPC. Your choice of backend will largely be determined by the location of your data. + +For backend-specific configuration, see the relevant documentation: + +- [Azure](backends/azure) +- [AWS](backends/aws) +- [GCP](backends/gcp) +- [HPC](backends/hpc) + +## Configuring a workflow engine + +An execution engine is required to run workflows. Two popular engines for running WDL-based workflows are [`miniwdl`](https://miniwdl.readthedocs.io/en/latest/getting_started.html) and [`Cromwell`](https://cromwell.readthedocs.io/en/stable/tutorials/FiveMinuteIntro/). + +See [backend-specific documentation](backends) for details on setting up an engine. + +| Engine | Azure | AWS | GCP | HPC | +| :- | :- | :- | :- | :- | +| [**miniwdl**](https://github.com/chanzuckerberg/miniwdl#scaling-up) | _Unsupported_ | Supported via the [Amazon Genomics CLI](https://aws.amazon.com/genomics-cli/) | _Unsupported_ | (SLURM only) Supported via the [`miniwdl-slurm`](https://github.com/miniwdl-ext/miniwdl-slurm) plugin | +| [**Cromwell**](https://cromwell.readthedocs.io/en/stable/backends/Backends/) | Supported via [Cromwell on Azure](https://github.com/microsoft/CromwellOnAzure) | Supported via the [Amazon Genomics CLI](https://aws.amazon.com/genomics-cli/) | Supported via Google's [Pipelines API](https://cromwell.readthedocs.io/en/stable/backends/Google/) | Supported - [Configuration varies depending on HPC infrastructure](https://cromwell.readthedocs.io/en/stable/tutorials/HPCIntro/) | + +## Filling out the inputs JSON + +The input to a workflow run is defined in JSON format. Template input files with reference dataset information filled out are available for each backend: + +- [Azure](backends/azure/inputs.azure.json) +- [AWS](backends/aws/inputs.aws.json) +- [GCP](backends/gcp/inputs.gcp.json) +- [HPC](backends/hpc/inputs.hpc.json) + +Using the appropriate inputs template file, fill in the cohort and sample information (see [Workflow Inputs](#workflow-inputs) for more information on the input structure). + +If using an HPC backend, you will need to download the reference bundle and replace the `` in the input template file with the local path to the reference datasets on your HPC. + +## Running the workflow + +Run the workflow using the engine and backend that you have configured ([miniwdl](#run-directly-using-miniwdl), [Cromwell](#run-directly-using-cromwell)). + +Note that the calls to `miniwdl` and `Cromwell` assume you are accessing the engine directly on the machine on which it has been deployed. Depending on the backend you have configured, you may be able to submit workflows using different methods (e.g. using trigger files in Azure, or using the Amazon Genomics CLI in AWS). + +### Run directly using miniwdl -The [AWS input file template](workflows/inputs.aws.json) has paths to the reference files in the blob storage prefilled. +`miniwdl run workflows/main.wdl -i ` -## Reference data hosted in GCP +### Run directly using Cromwell - +`java -jar run workflows/main.wdl -i ` + +If Cromwell is running in server mode, the workflow can be submitted using cURL. 
Fill in the values of CROMWELL_URL and INPUTS_JSON below, then from the root of the repository, run: + +```bash +# The base URL (and port, if applicable) of your Cromwell server +CROMWELL_URL= +# The path to your inputs JSON file +INPUTS_JSON= + +(cd workflows && zip -r dependencies.zip assembly_structs.wdl assemble_genome/ de_novo_assembly_sample/ de_novo_assembly_trio/ wdl-common/) +curl -X "POST" \ + "${CROMWELL_URL}/api/workflows/v1" \ + -H "accept: application/json" \ + -H "Content-Type: multipart/form-data" \ + -F "workflowSource=@workflows/main.wdl" \ + -F "workflowInputs=@${INPUTS_JSON};type=application/json" \ + -F "workflowDependencies=@workflows/dependencies.zip;type=application/zip" +``` + +To specify [workflow options](https://cromwell.readthedocs.io/en/latest/wf_options/Overview/), add the following to the request (assuming your options file is a file called `options.json` located in the `pwd`): `-F "workflowOptions=@options.json;type=application/json"`. # Workflow inputs +This section describes the inputs required for a run of the workflow. Typically, only the `de_novo_assembly.cohort` and potentially [run/backend-specific sections](#other-inputs) will be filled out by the user for each run of the workflow. Input templates with reference file locations filled out are provided [for each backend](backends). + ## [Cohort](workflows/humanwgs_structs.wdl) A cohort can include one or more samples. Samples need not be related. @@ -69,65 +127,31 @@ Sample information for each sample in the workflow run. | :- | :- | :- | :- | | String | sample_id | A unique name for the sample; used to name outputs | | | Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | movie_bams | The set of unaligned movie BAMs associated with this sample | | -| String? | sex | Sample sex | ["MALE", "FEMALE", "."]. "." or `null` will set sex to unknown. | | String? | father_id | Paternal `sample_id` | | | String? | mother_id | Maternal `sample_id` | | | Boolean | run_de_novo_assembly | If true, run single-sample _de novo_ assembly for this sample | \[true, false\] | ## [ReferenceData](workflows/humanwgs_structs.wdl) -Files associated with the reference genome. +Array of references and their associated names and indices. -These files are hosted publicly in each of the cloud backends; see `workflows/inputs.${backend}.json`. +These files are hosted publicly in each of the cloud backends; see `backends/${backend}/inputs.${backend}.json`. | Type | Name | Description | Notes | | :- | :- | :- | :- | | String | name | Reference name; used to name outputs (e.g., "GRCh38") | | -| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl) | fasta | Reference genome and index | | +| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl) | fasta | Reference genome and associated index | | ## Other inputs | Type | Name | Description | Notes | | :- | :- | :- | :- | -| String | backend | Backend where the workflow will be executed | \["Azure", "AWS", "GCP"\] | -| String? | zones | Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'. | [Determining available zones in AWS and GCP](#determining-available-zones-in-aws-and-gcp). | -| String? | aws_spot_queue_arn | Queue ARN for the spot batch queue; required if backend is set to 'AWS' and `preemptible` is set to `true` | [Determining the AWS queue ARN](#determining-the-aws-batch-queue-arn) | -| String? 
| aws_on_demand_queue_arn | Queue ARN for the on demand batch queue; required if backend is set to 'AWS' and `preemptible` is set to `false` | [Determining the AWS queue ARN](#determining-the-aws-batch-queue-arn) | -| Boolean | preemptible | If set to `true`, run tasks preemptibly where possible. On-demand VMs will be used only for tasks that run for >24 hours if the backend is set to GCP. If set to `false`, on-demand VMs will be used for every task. | \[true, false\] | - -### Determining available zones in AWS and GCP - -#### AWS - -To determine available zones in AWS, look for the ZoneName attributes output by the following command: - -```bash -aws ec2 describe-availability-zones --region -``` -For example, the zones in region us-east-2 are `"us-east-2a us-east-2b us-east-2c"`. - -#### GCP - -To determine available zones in GCP, run the following; available zones within a region can be found in the first column of the output: - -```bash -gcloud compute zones list | grep -``` - -For example, the zones in region us-central1 are `"us-central1-a us-central1-b us-central1c us-central1f"`. - -### Determining the AWS batch queue ARN - -**Note that if you are using a `miniwdl` engine, you can skip these steps; workflows run via miniwdl will run exclusively in the job queue to which they are submitted.** - -1. Visit [the AWS console](https://console.aws.amazon.com/). -2. Navigate to the Batch service. -3. In the lefthand sidebar, select "Compute environments". Note the name of the compute environment with the provisioning model SPOT (if you have deployed a context using spot instances) and the name of the compute environment with provisioning model "EC2" (if you have deployed a context that does not use spot instances). -4. In the lefthand sidebar, select "Job queues". -5. Clicking into an individual queue will show information about the compute environment ("Compute environment order"). Identify the job queue with the Compute environment name that matches the name you identified for the SPOT compute environment; copy the Amazon Resource Name (ARN) for this job queue. This is the value that should be used for the `aws_spot_queue_arn`. Repeat this process to find the ARN for the `aws_on_demand_queue_arn`. - -- If `preemptible = true`, only the `aws_spot_queue_arn` is required. -- If `preemptible = false`, only the `aws_on_demand_queue_arn` is required. +| String | backend | Backend where the workflow will be executed | \["Azure", "AWS", "GCP", "HPC"\] | +| String? | zones | Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'. |
  • [Determining available zones in AWS](backends/aws/README.md#determining-available-zones)
  • [Determining available zones in GCP](backends/gcp/README.md#determining-available-zones)
| +| String? | aws_spot_queue_arn | Queue ARN for the spot batch queue; required if backend is set to 'AWS' and `preemptible` is set to `true` | [Determining the AWS queue ARN](backends/aws/README.md#determining-the-aws-batch-queue-arn) | +| String? | aws_on_demand_queue_arn | Queue ARN for the on demand batch queue; required if backend is set to 'AWS' and `preemptible` is set to `false` | [Determining the AWS queue ARN](backends/aws/README.md#determining-the-aws-batch-queue-arn) | +| String? | container_registry | Container registry where workflow images are hosted. If left blank, [PacBio's public Quay.io registry](https://quay.io/organization/pacbio) will be used. | | +| Boolean | preemptible | If set to `true`, run tasks preemptibly where possible. On-demand VMs will be used only for tasks that run for >24 hours if the backend is set to GCP. If set to `false`, on-demand VMs will be used for every task. Ignored if backend is set to HPC. | \[true, false\] | # Workflow outputs @@ -141,9 +165,9 @@ These files will be output if `cohort.samples[sample]` is set to `true` for any | Array[Array[File]?] | assembly_noseq_gfas | Assembly graphs in [GFA format](https://github.com/chhylp123/hifiasm/blob/master/docs/source/interpreting-output.rst). | | | Array[Array[File]?] | assembly_lowQ_beds | Coordinates of low quality regions in BED format. | | | Array[Array[File]?] | assembly_stats | Assembly size and NG50 stats generated by [calN50](https://github.com/lh3/calN50). | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?] | asm_bam | [minimap2](https://github.com/lh3/minimap2) alignment of assembly to reference. | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?] | htsbox_vcf | Naive pileup variant calling of assembly against reference with [`htsbox`](https://github.com/lh3/htsbox) | | -| Array[File?] | htsbox_vcf_stats | [`bcftools stats`](https://samtools.github.io/bcftools/bcftools.html#stats) summary statistics for `htsbox` variant calls | | +| Array[Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?]] | asm_bam | [minimap2](https://github.com/lh3/minimap2) alignment of assembly to reference. | | +| Array[Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?]] | paftools_vcf | calls variants from coordinate-sorted assembly-to-reference alignment. It calls variants from the cs tag and identifies confident/callable regions as those covered by exactly one contig [`paftools`](https://github.com/lh3/minimap2/blob/master/misc/README.md#calling-variants-from-haploid-assemblies) | | +| Array[Array[File?]] | paftools_vcf_stats | [`bcftools stats`](https://samtools.github.io/bcftools/bcftools.html#stats) summary statistics for `paftools` variant calls | | ## De novo assembly - trio @@ -155,24 +179,24 @@ These files will be output if `cohort.de_novo_assembly_trio` is set to `true` an | Array[Array[File]]? | trio_assembly_noseq_gfas | Assembly graphs in [GFA format](https://github.com/chhylp123/hifiasm/blob/master/docs/source/interpreting-output.rst). | | | Array[Array[File]]? | trio_assembly_lowQ_beds | Coordinates of low quality regions in BED format. | | | Array[Array[File]]? | trio_assembly_stats | Assembly size and NG50 stats generated by [calN50](https://github.com/lh3/calN50). | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)]? 
| trio_asm_bams | [minimap2](https://github.com/lh3/minimap2) alignment of assembly to reference. | | +| Array[Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)]?] | trio_asm_bams | [minimap2](https://github.com/lh3/minimap2) alignment of assembly to reference. | | | Array[Map[String, String]]? | haplotype_key | Indication of which haplotype (`hap1`/`hap2`) corresponds to which parent. | | # Tool versions and Docker images -Docker images definitions used by the human WGS workflow can be found in [the wdl-dockerfiles repository](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a). Images are hosted in PacBio's [quay.io](https://quay.io/organization/pacbio). Docker images used in the workflow are pegged to specific versions by referring to their digests rather than tags. +Docker images definitions used by this workflow can be found in [the wdl-dockerfiles repository](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a). Images are hosted in PacBio's [quay.io](https://quay.io/organization/pacbio). Docker images used in the workflow are pegged to specific versions by referring to their digests rather than tags. The Docker image used by a particular step of the workflow can be identified by looking at the `docker` key in the `runtime` block for the given task. Images can be referenced in the following table by looking for the name after the final `/` character and before the `@sha256:...`. For example, the image referred to here is "align_hifiasm": > ~{runtime_attributes.container_registry}/**align_hifiasm**@sha256:3968cb<...>b01f80fe | Image | Major tool versions | Links | | :- | :- | :- | -| align_hifiasm |
  • [minimap2 2.17](https://github.com/lh3/minimap2/releases/tag/v2.17)
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/align_hifiasm) | -| bcftools |
  • [bcftools 1.14](https://github.com/samtools/bcftools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/bcftools) | -| gfatools |
  • [gfatools 0.4](https://github.com/lh3/gfatools/releases/tag/v0.4)
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
  • [k8 0.2.5](https://github.com/attractivechaos/k8/releases/tag/0.2.5)
  • [caln50 01091f2](https://github.com/lh3/calN50/tree/01091f25bc24e17fbf0da3407ea24aa448c489ae)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/gfatools) | -| hifiasm |
  • [hifiasm 0.19.4](https://github.com/chhylp123/hifiasm/releases/tag/0.19.4)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/hifiasm) | -| htsbox |
  • [htsbox r346 (6964440)](https://github.com/lh3/htsbox/tree/6964440d791a60a22ca5ff25dc413a362bdc0abe)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/htsbox) | -| htslib |
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/htslib) | -| parse-cohort |
  • python 3.8.10; custom scripts
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/parse-cohort) | -| samtools |
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/samtools) | -| yak |
  • [yak 0.1](https://github.com/lh3/yak/releases/tag/v0.1)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a/docker/yak) | +| align_hifiasm |
  • [minimap2 2.17](https://github.com/lh3/minimap2/releases/tag/v2.17)
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/align_hifiasm) | +| bcftools |
  • [bcftools 1.14](https://github.com/samtools/bcftools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/bcftools) | +| gfatools |
  • [gfatools 0.4](https://github.com/lh3/gfatools/releases/tag/v0.4)
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
  • [k8 0.2.5](https://github.com/attractivechaos/k8/releases/tag/0.2.5)
  • [caln50 01091f2](https://github.com/lh3/calN50/tree/01091f25bc24e17fbf0da3407ea24aa448c489ae)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/gfatools) | +| hifiasm |
  • [hifiasm 0.19.4](https://github.com/chhylp123/hifiasm/releases/tag/0.19.4)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/hifiasm) | +| htslib |
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/htslib) | +| paftools |
  • [paftools 2.26-r1182-dirty](https://github.com/lh3/minimap2/blob/bc588c0eeb26426d0d90a93fb0877358a389c515/misc/paftools.js)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/align_hifiasm) | +| parse-cohort |
  • python 3.8.10; custom scripts
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/parse-cohort) | +| samtools |
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/samtools) | +| yak |
  • [yak 0.1](https://github.com/lh3/yak/releases/tag/v0.1)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/yak) | diff --git a/backends/aws/.gitignore b/backends/aws/.gitignore new file mode 100644 index 0000000..10ef663 --- /dev/null +++ b/backends/aws/.gitignore @@ -0,0 +1 @@ +agc-project.yaml diff --git a/backends/aws/README.md b/backends/aws/README.md new file mode 100644 index 0000000..c07e5d5 --- /dev/null +++ b/backends/aws/README.md @@ -0,0 +1,124 @@ +# Configuring the Amazon Genomics CLI + +The Amazon Genomics CLI (`agc`) allows users to orchestrate workflow execution using AWS Batch. See the [documentation](https://docs.dnastack.com/docs/cromwell-on-aws-amazon-genomics-cli) for information on installing and using the `agc` to configure and run workflows. The following section provides additional information on deploying a project using the `agc`. + +## Deploying a context with `agc` + +Once you have installed and authenticated with the `agc`, you can deploy a context using an agc project YAML file. This file must be named `agc-project.yaml`. + +An [example agc-project.yaml file](agc-project.template.yaml) that has the workflow, reference data source, and both on-demand and spot contexts configured using Cromwell as the engine is provided here. This will create an agc project named `humanassemblyAgc`, with either (or both) a `spotContext` or an `onDemandContext`. The `spotContext` will allow you to run worklfows using [AWS spot instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-instances.html), which can result in substantial cost savings relative to using [on-demand instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-on-demand-instances.html). + +Note that deploying a context **will incur costs** even if you are not actively running workflows; ensure that [contexts that are not in use are destroyed](https://aws.github.io/amazon-genomics-cli/docs/reference/agc_context_destroy/) to avoid incurring ongoing costs. + +To deploy the agc project using the template file, first copy the template file to a file named `agc-project.yaml` (`cp agc-project.template.yaml agc-project.yaml`). + +In the `data` section of the `agc-project.yaml` file, add any additional s3 buckets that the workflow will require access to, for example the bucket containing sample input data. Make sure that you do not remove the section granting access to the s3://dnastack-resources bucket; this is where [reference datasets are hosted](#reference-data-hosted-in-aws). + +``` +data: + - location: s3://dnastack-resources + readOnly: true + - location: s3:// + readOnly: true +``` + +Then from the directory containing the `agc-project.yaml` file, run: + +```bash +agc context deploy --context ${context} +``` + +Where `${context}` is either `spotContext` or `onDemandContext`. + +If you want both spot and on-demand contexts, all contexts can be deployed at once by running: + +``` +agc context deploy --all +``` + +Note that the `miniwdl` engine run via AWS is currently not supported for this workflow. + +# Checking and requesting quota in AWS + +See [resources requirements](../../README.md#resource-requirements) for information on the minimum requirements for running the workflow. Typically in a new AWS environment, additional vCPU quota will be required. + +## Checking current quota + +1. Navigate to [the AWS console](https://console.aws.amazon.com/). +2. In the top right corner, select the region where your `agc` deployment is located. +3. Navigate to EC2. +4. 
In the menu on the left, select 'Limits'.
+5. Filter the limits by searching for "Standard". The current limit field indicates the number of vCPUs that you currently have access to.
+- Spot instance limit: `All Standard (A, C, D, H, I, M, R, T, Z) Spot Instance Requests`
+- On-demand instance limit: `Running On-Demand All Standard (A, C, D, H, I, M, R, T, Z) instances`
+
+If the number of vCPUs in the context you plan to run the workflow in is less than the limits specified in [the resource requirements](../../README.md#resource-requirements) section, you will need to request additional quota before you can run the workflow.
+
+## Requesting additional quota
+
+6. Continuing from the steps outlined in [checking the current quota](#checking-current-quota), select the service you want to request an increase for.
+7. In the top right corner, select 'Request limit increase'.
+8. Fill out the appropriate fields in the request form, ensuring that the region you select is the region where you have deployed your `agc` and where your data is located. 256 vCPUs are recommended for running trio data.
+
+Low quota increase requests are typically fulfilled within 1-2 hours.
+
+# Configuring and running the workflow
+
+## Filling out workflow inputs
+
+Fill out any information missing in [the inputs file](inputs.aws.json). Ensure that all data files used by the workflow are at locations that have been configured in the agc-project.yaml file; see the [granting access to other data files](#granting-access-to-other-data-files) section for more information.
+
+See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file.
+
+Note that you only need to fill out the queueArn corresponding to the context you are submitting the workflow to (spot or on-demand).
+
+### Determining available zones
+
+To determine available zones in AWS, look for the `ZoneName` attribute output by the following command:
+
+```bash
+aws ec2 describe-availability-zones --region
+```
+
+For example, the zones in region us-east-2 are `"us-east-2a us-east-2b us-east-2c"`.
+
+### Determining the AWS batch queue ARN
+
+**Note that if you are using a `miniwdl` engine, you can skip these steps; workflows run via miniwdl will run exclusively in the job queue to which they are submitted.**
+
+1. Visit [the AWS console](https://console.aws.amazon.com/).
+2. Navigate to the Batch service.
+3. In the lefthand sidebar, select "Compute environments". Note the name of the compute environment with the provisioning model SPOT (if you have deployed a context using spot instances) and the name of the compute environment with provisioning model "EC2" (if you have deployed a context that does not use spot instances).
+4. In the lefthand sidebar, select "Job queues".
+5. Clicking into an individual queue will show information about the compute environment ("Compute environment order"). Identify the job queue with the Compute environment name that matches the name you identified for the SPOT compute environment; copy the Amazon Resource Name (ARN) for this job queue. This is the value that should be used for the `aws_spot_queue_arn`. Repeat this process to find the ARN for the `aws_on_demand_queue_arn`.
+
+- If `preemptible = true`, only the `aws_spot_queue_arn` is required.
+- If `preemptible = false`, only the `aws_on_demand_queue_arn` is required.
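As an optional alternative to clicking through the console, the same queue-to-compute-environment mapping can be listed with the AWS CLI. This is only a sketch, and assumes the AWS CLI is installed and authenticated for the region in which your `agc` context is deployed; the region shown is just an example.

```bash
# List each AWS Batch job queue ARN together with the compute environments it
# draws from; match the SPOT / EC2 compute environment names identified above
# to choose the values for aws_spot_queue_arn and aws_on_demand_queue_arn.
aws batch describe-job-queues \
  --region us-east-2 \
  --query 'jobQueues[].{jobQueueArn: jobQueueArn, computeEnvironments: computeEnvironmentOrder[].computeEnvironment}' \
  --output json
```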
+ +## Running the workflow + +### Running via `agc` + +From the directory where your `agc-project.yaml` is located, run: + +`agc workflow run humanassembly --context --inputsFile ` + +The running workflow can be monitored via [`agc workflow` commands](https://aws.github.io/amazon-genomics-cli/docs/reference/agc_workflow/), or via the AWS console. + +# Reference data hosted in AWS + +AWS reference data is hosted in the `us-west-2` region in the bucket `s3://dnastack-resources`. + +To use AWS reference data, add the following line to the data section of your [`agc-project.yaml`](https://aws.github.io/amazon-genomics-cli/docs/concepts/projects/): + +```yaml +data: + - location: s3://dnastack-resources + readOnly: true +``` + +The [AWS input file template](inputs.aws.json) has paths to the reference files in s3 prefilled. The template [agc-project.template.yaml file](agc-project.template.yaml) has this section filled out already. + +### Granting access to other data files + +S3 buckets outside of the reference files can be accessed by adding additional data blocks to the agc-project.yaml file. See the [agc documentation](https://aws.github.io/amazon-genomics-cli/docs/concepts/data/) for more details on adding additional data sources. All inputs referenced in the inputs.json file will need to be at locations that have been configured in the agc-project.yaml. diff --git a/backends/aws/agc-project.template.yaml b/backends/aws/agc-project.template.yaml new file mode 100644 index 0000000..ef58c89 --- /dev/null +++ b/backends/aws/agc-project.template.yaml @@ -0,0 +1,167 @@ +name: humanassemblyAgc +schemaVersion: 1 +data: + - location: s3://dnastack-resources + readOnly: true +workflows: + humanassembly: + type: + language: wdl + version: 1.0 + sourceURL: ../../workflows +contexts: + onDemandContext: + instanceTypes: [ + "c5.large", + "c5.xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.metal", + "c5a.large", + "c5a.xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5n.large", + "c5n.xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.18xlarge", + "m5.large", + "m5.xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5a.large", + "m5a.xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5n.large", + "m5n.xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "r5.large", + "r5.xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5a.large", + "r5a.xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5n.large", + "r5n.xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + ] + engines: + - type: wdl + engine: cromwell + spotContext: + requestSpotInstances: true + instanceTypes: [ + "c5.large", + "c5.xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.metal", + "c5a.large", + "c5a.xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5n.large", + "c5n.xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.18xlarge", + "m5.large", + "m5.xlarge", + "m5.2xlarge", + 
"m5.4xlarge", + "m5.8xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5a.large", + "m5a.xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5n.large", + "m5n.xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "r5.large", + "r5.xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5a.large", + "r5a.xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5n.large", + "r5n.xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + ] + engines: + - type: wdl + engine: cromwell diff --git a/backends/aws/inputs.aws.json b/backends/aws/inputs.aws.json new file mode 100644 index 0000000..c3530ae --- /dev/null +++ b/backends/aws/inputs.aws.json @@ -0,0 +1,38 @@ +{ + "de_novo_assembly.cohort": { + "cohort_id": "String", + "samples": [ + { + "sample_id": "String", + "movie_bams": [ + "File" + ], + "father_id": "String?", + "mother_id": "String?", + "run_de_novo_assembly": "Boolean" + } + ], + "run_de_novo_assembly_trio": "Boolean" + }, + "de_novo_assembly.references": [ + { + "name": "GRCh38", + "fasta": { + "data": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "data_index": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + } + }, + { + "name": "chm13", + "fasta": { + "data": "s3://dnastack-resources/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta", + "data_index": "s3://dnastack-resources/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta.fai" + } + } + ], + "de_novo_assembly.backend": "AWS", + "de_novo_assembly.zones": "us-east-2a us-east-2b us-east-2c", + "de_novo_assembly.aws_spot_queue_arn": "", + "de_novo_assembly.aws_on_demand_queue_arn": "", + "de_novo_assembly.preemptible": "Boolean" +} diff --git a/backends/azure/README.md b/backends/azure/README.md new file mode 100644 index 0000000..0adce10 --- /dev/null +++ b/backends/azure/README.md @@ -0,0 +1,25 @@ +# Configuring Cromwell on Azure + +Workflows can be run in Azure by setting up [Cromwell on Azure (CoA)](https://github.com/microsoft/CromwellOnAzure). Documentation on deploying and configuring an instance of CoA can be found [here](https://github.com/microsoft/CromwellOnAzure/wiki/Deploy-your-instance-of-Cromwell-on-Azure). + +## Requirements + +- [Cromwell on Azure](https://github.com/microsoft/CromwellOnAzure) version 3.2+; version 4.0+ is recommended + +# Configuring and running the workflow + +## Filling out workflow inputs + +Fill out any information missing in [the inputs file](inputs.azure.json). + +See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file. 
+ +## Running the workflow + +# Reference data hosted in Azure + +To use Azure reference data, add the following line to your `containers-to-mount` file in your Cromwell on Azure installation ([more info here](https://github.com/microsoft/CromwellOnAzure/blob/develop/docs/troubleshooting-guide.md#use-input-data-files-from-an-existing-azure-storage-account-that-my-lab-or-team-is-currently-using)): + +`https://datasetpbrarediseases.blob.core.windows.net/dataset?si=public&spr=https&sv=2021-06-08&sr=c&sig=o6OkcqWWlGcGOOr8I8gCA%2BJwlpA%2FYsRz0DMB8CCtCJk%3D` + +The [Azure input file template](inputs.azure.json) has paths to the reference files in this blob storage prefilled. diff --git a/backends/azure/inputs.azure.json b/backends/azure/inputs.azure.json new file mode 100644 index 0000000..dcfee99 --- /dev/null +++ b/backends/azure/inputs.azure.json @@ -0,0 +1,35 @@ +{ + "de_novo_assembly.cohort": { + "cohort_id": "String", + "samples": [ + { + "sample_id": "String", + "movie_bams": [ + "File" + ], + "father_id": "String?", + "mother_id": "String?", + "run_de_novo_assembly": "Boolean" + } + ], + "run_de_novo_assembly_trio": "Boolean" + }, + "de_novo_assembly.references": [ + { + "name": "GRCh38", + "fasta": { + "data": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "data_index": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + } + }, + { + "name": "chm13", + "fasta": { + "data": "/datasetpbrarediseases/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta", + "data_index": "/datasetpbrarediseases/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta.fai" + } + } + ], + "de_novo_assembly.backend": "Azure", + "de_novo_assembly.preemptible": "Boolean" +} diff --git a/backends/gcp/README.md b/backends/gcp/README.md new file mode 100644 index 0000000..1d2e840 --- /dev/null +++ b/backends/gcp/README.md @@ -0,0 +1,29 @@ +# Configuring Cromwell on GCP + +[Cromwell's documentation](https://cromwell.readthedocs.io/en/stable/tutorials/PipelinesApi101/) on getting started with Google's genomics Pipelines API can be used to set up the resources needed to run the workflow. + +# Configuring and running the workflow + +## Filling out workflow inputs + +Fill out any information missing in [the inputs file](inputs.gcp.json). + +See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file. + +### Determining available zones + +To determine available zones in GCP, run the following; available zones within a region can be found in the first column of the output: + +```bash +gcloud compute zones list | grep +``` + +For example, the zones in region us-central1 are `"us-central1-a us-central1-b us-central1c us-central1f"`. + +## Running the workflow + +# Reference data hosted in GCP + +GCP reference data is hosted in the `us-west1` region in the bucket `gs://pacbio-wdl`. This bucket is requester-pays, meaning that users will need to [provide a billing project in their Cromwell configuration](https://cromwell.readthedocs.io/en/stable/filesystems/GoogleCloudStorage/) in order to use files located in this bucket. + +To avoid egress charges, Cromwell should be set up to spin up compute resources in the same region in which the data is located. If possible, add cohort data to the same region as the reference dataset, or consider mirroring this dataset in the region where your data is located. 
See [Google's information about data storage and egress charges for more information](https://cloud.google.com/storage/pricing). diff --git a/backends/gcp/inputs.gcp.json b/backends/gcp/inputs.gcp.json new file mode 100644 index 0000000..be7ff33 --- /dev/null +++ b/backends/gcp/inputs.gcp.json @@ -0,0 +1,36 @@ +{ + "de_novo_assembly.cohort": { + "cohort_id": "String", + "samples": [ + { + "sample_id": "String", + "movie_bams": [ + "File" + ], + "father_id": "String?", + "mother_id": "String?", + "run_de_novo_assembly": "Boolean" + } + ], + "run_de_novo_assembly_trio": "Boolean" + }, + "de_novo_assembly.references": [ + { + "name": "GRCh38", + "fasta": { + "data": "gs://pacbio-wdl/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "data_index": "gs://pacbio-wdl/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + } + }, + { + "name": "chm13", + "fasta": { + "data": "gs://pacbio-wdl/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta", + "data_index": "gs://pacbio-wdl/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta.fai" + } + } + ], + "de_novo_assembly.backend": "GCP", + "de_novo_assembly.zones": "String", + "de_novo_assembly.preemptible": "Boolean" +} diff --git a/backends/hpc/README.md b/backends/hpc/README.md new file mode 100644 index 0000000..850925a --- /dev/null +++ b/backends/hpc/README.md @@ -0,0 +1,49 @@ +Either `miniwdl` or `Cromwell` can be used to run workflows on the HPC. + +# Installing and configuring `miniwdl` + +## Requirements + +- [`miniwdl`](https://github.com/chanzuckerberg/miniwdl) >= 1.9.0 +- [`miniwdl-slurm`](https://github.com/miniwdl-ext/miniwdl-slurm) + +## Configuring + +An [example miniwdl.cfg file](miniwdl.cfg) is provided here. This should be placed at `~/.config/miniwdl.cfg` and edited to match your slurm configuration. This allows running workflows using a basic SLURM setup. + +# Installing and configuring `Cromwell` + +Cromwell supports a number of different HPC backends; see [Cromwell's documentation](https://cromwell.readthedocs.io/en/stable/backends/HPC/) for more information on configuring each of the backends. + +# Configuring and running the workflow + +## Filling out workflow inputs + +Fill out any information missing in [the inputs file](inputs.hpc.json). Once you have downloaded the reference data bundle, ensure that you have replaced the `` in the input template file with the local path to the reference datasets on your HPC. + +See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file. + +## Running the workflow + +### Running via miniwdl + +`miniwdl run workflows/main.wdl -i ` + +### Running via Cromwell + +`cromwell run workflows/main.wdl -i ` + +# Reference data bundle + +10.5281/zenodo.10059671 +![https://doi.org/10.5281/zenodo.10059671](https://zenodo.org/badge/DOI/10.5281/zenodo.10059671.svg) + +Reference data is hosted on Zenodo at [10.5281/zenodo.10059671](https://zenodo.org/record/10059671). Download the reference data bundle and extract it to a location on your HPC, then update the input template file with the path to the reference data. 
+
+```bash
+# download the reference data bundle
+wget https://zenodo.org/record/10059671/files/wdl-humanassembly.v0.9.0.resource.tgz
+
+# extract the reference data bundle and rename as dataset
+tar -xzf wdl-humanassembly.v0.9.0.resource.tgz && mv static_resources dataset
+```
diff --git a/backends/hpc/inputs.hpc.json b/backends/hpc/inputs.hpc.json new file mode 100644 index 0000000..5509a0e --- /dev/null +++ b/backends/hpc/inputs.hpc.json @@ -0,0 +1,35 @@
+{
+  "de_novo_assembly.cohort": {
+    "cohort_id": "String",
+    "samples": [
+      {
+        "sample_id": "String",
+        "movie_bams": [
+          "File"
+        ],
+        "father_id": "String?",
+        "mother_id": "String?",
+        "run_de_novo_assembly": "Boolean"
+      }
+    ],
+    "run_de_novo_assembly_trio": "Boolean"
+  },
+  "de_novo_assembly.references": [
+    {
+      "name": "GRCh38",
+      "fasta": {
+        "data": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
+        "data_index": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai"
+      }
+    },
+    {
+      "name": "chm13",
+      "fasta": {
+        "data": "/dataset/CHM13/human_chm13v2.0_maskedY_rCRS.fasta",
+        "data_index": "= 1.9.0
+command_shell = /bin/bash
+defaults = {
+  "maxRetries": 2,
+  "docker": "ubuntu:20.04"
+  }
+
+[singularity]
+# This plugin wraps the singularity backend. Make sure the settings are
+# appropriate for your cluster.
+exe = ["/usr/bin/singularity"]
+
+# the miniwdl default options contain options to run as a fake root, which
+# is not available on most clusters.
+run_options = [
+  "--containall"
+  ]
+
+# Location of the singularity images (optional). The miniwdl-slurm plugin
+# will set it to a directory inside $PWD. This location must be reachable
+# for the submit nodes.
+image_cache = "$PWD/miniwdl_singularity_cache"
+
+[slurm]
+# extra arguments passed to the srun command (optional).
+extra_args="--partition compute --comment 'run with miniwdl'" diff --git a/wdl-ci.config.json b/wdl-ci.config.json index e29a369..4315946 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -19,26 +19,8 @@ "tasks": { "bcftools_stats": { "key": "bcftools_stats", - "digest": "zgvmaa6wtldmojg3gixeiah5aak7khb5", - "tests": [ - { - "inputs": { - "vcf": "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.gz", - "params": "--apply-filters PASS --samples ${sample_id}", - "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "runtime_attributes": "${default_runtime_attributes}" - }, - "output_tests": { - "stats": { - "value": "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.stats.txt", - "test_tasks": [ - "compare_file_basename", - "check_empty_lines" - ] - } - } - } - ] + "digest": "cu73ojtpnhesxaa2jh7a7l23vlieds3i", + "tests": [] } } }, @@ -49,26 +31,8 @@ "tasks": { "zip_index_vcf": { "key": "zip_index_vcf", - "digest": "yxnm7toivkmsrrs4h3x72wdrtxd2lo72", - "tests": [ - { - "inputs": { - "vcf": "${resources_file_path}/HG005.GRCh38.pbsv.vcf", - "runtime_attributes": "${default_runtime_attributes}" - }, - "output_tests": { - "zipped_vcf": { - "value": "${resources_file_path}/HG005.GRCh38.pbsv.vcf.gz", - "test_tasks": [ - "calculate_md5sum", - "compare_file_basename", - "vcftools_validator", - "check_gzip" - ] - } - } - } - ] + "digest": "cflenxzb6uj2ujfv4pkllo3vztdkev45", + "tests": [] } } }, @@ -79,7 +43,7 @@ "tasks": { "samtools_fasta": { "key": "samtools_fasta", - "digest": "i2tzlr7ni5gglbe7regxody2bttg35na", + "digest": "x336uu76d5c6nzls2vgntvoqrnhex5q4", "tests": [ { "inputs": { @@ -115,7 +79,7 @@ "tasks": { "parse_families": { "key": "parse_families", - "digest": "fczc72mlu6iw3glecpbme5jj4ceqyqtw", + "digest": "rprxafsnidgno35awynatngwbnuw6suo", "tests": [ { "inputs": { @@ -137,7 +101,7 @@ }, "yak_count": { "key": "yak_count", - "digest": "32uc62rlhwfjfatcb4qfpr44ohg2wdlb", + "digest": "i4jt54vu25mhikalp47febetx7mn6xmo", "tests": [ { "inputs": { @@ -150,6 +114,8 @@ "${resources_file_path}/m64017_200108_232219.hifi_reads.fasta", "${resources_file_path}/m64017_200112_090459.hifi_reads.fasta" ], + "yak_params": "-b37", + "mem_gb": 70, "runtime_attributes": "${default_runtime_attributes}" }, "output_tests": { @@ -173,7 +139,7 @@ "tasks": { "hifiasm_assemble": { "key": "hifiasm_assemble", - "digest": "r4ikydzmdaed4hzsmc3t7efh6mz5e4mx", + "digest": "yt7mrvhlur5xzn5sxbhe52kvvu6r4ejr", "tests": [ { "inputs": { @@ -230,7 +196,7 @@ }, "gfa2fa": { "key": "gfa2fa", - "digest": "liyb2m4cbkovxctcgaxwunqkn5az77ev", + "digest": "r2xbqxqkae5owmzwkmvfk6atpdiu75vf", "tests": [ { "inputs": { @@ -262,22 +228,19 @@ }, "align_hifiasm": { "key": "align_hifiasm", - "digest": "77gs34t4c2i6epsg2epukfoaign2fmnt", + "digest": "ib3l4i6tdenf5sa3s54yzxfraxhk5u73", "tests": [ { "inputs": { "sample_id": "HG005", - "query_sequences": [ - "${resources_file_path}/HG005.asm.bp.hap1.p_ctg.fasta.gz", - "${resources_file_path}/HG005.asm.bp.hap2.p_ctg.fasta.gz" - ], + "query_sequences": "${resources_file_path}/HG005.asm.bp.hap1.p_ctg.fasta.gz", "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", "reference_name": "GRCh38", "runtime_attributes": "${default_runtime_attributes}" }, "output_tests": { "asm_bam": { - "value": "${resources_file_path}/HG005.asm.GRCh38.bam", + "value": "${resources_file_path}/HG005.hap1.asm.GRCh38.bam", "test_tasks": [ "compare_file_basename", "samtools_quickcheck", @@ -287,6 +250,11 @@ } } ] + }, + "merge_haps": { + "key": 
"merge_haps", + "digest": "", + "tests": [] } } }, @@ -297,7 +265,7 @@ "tasks": { "htsbox": { "key": "htsbox", - "digest": "hgv6puzttllbwzgmunnigqiopcq3gl7x", + "digest": "wzaxerbnwe327lejeyudsposw4ywor7t", "tests": [ { "inputs": { @@ -317,6 +285,11 @@ } } ] + }, + "paftools": { + "key": "paftools", + "digest": "", + "tests": [] } } }, @@ -325,13 +298,180 @@ "name": "", "description": "", "tasks": {} + }, + "workflows/wdl-common/wdl/tasks/glnexus.wdl": { + "key": "workflows/wdl-common/wdl/tasks/glnexus.wdl", + "name": "", + "description": "", + "tasks": { + "glnexus": { + "key": "glnexus", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/mosdepth.wdl": { + "key": "workflows/wdl-common/wdl/tasks/mosdepth.wdl", + "name": "", + "description": "", + "tasks": { + "mosdepth": { + "key": "mosdepth", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/pbsv_call.wdl": { + "key": "workflows/wdl-common/wdl/tasks/pbsv_call.wdl", + "name": "", + "description": "", + "tasks": { + "pbsv_call": { + "key": "pbsv_call", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/pbsv_discover.wdl": { + "key": "workflows/wdl-common/wdl/tasks/pbsv_discover.wdl", + "name": "", + "description": "", + "tasks": { + "pbsv_discover": { + "key": "pbsv_discover", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/whatshap_haplotag.wdl": { + "key": "workflows/wdl-common/wdl/tasks/whatshap_haplotag.wdl", + "name": "", + "description": "", + "tasks": { + "whatshap_haplotag": { + "key": "whatshap_haplotag", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/whatshap_phase.wdl": { + "key": "workflows/wdl-common/wdl/tasks/whatshap_phase.wdl", + "name": "", + "description": "", + "tasks": { + "whatshap_phase": { + "key": "whatshap_phase", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/whatshap_stats.wdl": { + "key": "workflows/wdl-common/wdl/tasks/whatshap_stats.wdl", + "name": "", + "description": "", + "tasks": { + "whatshap_stats": { + "key": "whatshap_stats", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/workflows/deepvariant/deepvariant.wdl": { + "key": "workflows/wdl-common/wdl/workflows/deepvariant/deepvariant.wdl", + "name": "", + "description": "", + "tasks": { + "deepvariant_make_examples": { + "key": "deepvariant_make_examples", + "digest": "", + "tests": [] + }, + "deepvariant_call_variants": { + "key": "deepvariant_call_variants", + "digest": "", + "tests": [] + }, + "deepvariant_postprocess_variants": { + "key": "deepvariant_postprocess_variants", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/workflows/phase_vcf/phase_vcf.wdl": { + "key": "workflows/wdl-common/wdl/workflows/phase_vcf/phase_vcf.wdl", + "name": "", + "description": "", + "tasks": { + "split_vcf": { + "key": "split_vcf", + "digest": "", + "tests": [] + }, + "bcftools_concat": { + "key": "bcftools_concat", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/workflows/hiphase/hiphase.wdl": { + "key": "workflows/wdl-common/wdl/workflows/hiphase/hiphase.wdl", + "name": "", + "description": "", + "tasks": { + "run_hiphase": { + "key": "run_hiphase", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/workflows/pharmcat/pharmcat.wdl": { + "key": "workflows/wdl-common/wdl/workflows/pharmcat/pharmcat.wdl", + "name": "", + "description": "", + "tasks": { + "pangu_cyp2d6": { + "key": "pangu_cyp2d6", + "digest": "", + 
"tests": [] + }, + "pharmcat_preprocess": { + "key": "pharmcat_preprocess", + "digest": "", + "tests": [] + }, + "filter_preprocessed_vcf": { + "key": "filter_preprocessed_vcf", + "digest": "", + "tests": [] + }, + "run_pharmcat": { + "key": "run_pharmcat", + "digest": "", + "tests": [] + } + } } }, "engines": { "f1ed5b40-6a26-4eac-a2b8-9960516e4164": { "key": "f1ed5b40-6a26-4eac-a2b8-9960516e4164", - "enabled": true, + "enabled": false, "name": "PacBio CoA installation" + }, + "abc123": { + "key": "abc123", + "enabled": true, + "name": "pacbio-hpc" } }, "test_params": { @@ -356,8 +496,13 @@ "engine_params": { "f1ed5b40-6a26-4eac-a2b8-9960516e4164": { "input_file_path": "/coac74908838b5dd7/inputs/small_dataset/chr6.p23", - "resources_file_path": "/coac74908838b5dd7/inputs/wdl-ci/humanwgs", + "resources_file_path": "/coac74908838b5dd7/inputs/wdl-ci/humanassembly", "datasets_file_path": "/datasetpbrarediseases/dataset" + }, + "abc123": { + "input_file_path": "/pbi/collections/appslabht/cromwell_output/testdata/inputs/chr6.p23", + "resources_file_path": "/pbi/collections/appslabht/cromwell_output/testdata/wdl-ci/humanassembly", + "datasets_file_path": "/pbi/collections/appslabht/cromwell_output/testdata/datasetpbrarediseases/dataset" } } } diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index 96a733d..4e42ef6 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -9,7 +9,7 @@ workflow assemble_genome { String sample_id Array[File] reads_fastas - ReferenceData reference + Array[ReferenceData] references String? hifiasm_extra_params File? father_yak @@ -38,32 +38,56 @@ workflow assemble_genome { call gfa2fa { input: gfa = gfa, - reference_index = reference.fasta.data_index, - runtime_attributes = default_runtime_attributes + runtime_attributes = default_runtime_attributes } } - call align_hifiasm { - input: - sample_id = sample_id, - query_sequences = gfa2fa.zipped_fasta, - reference = reference.fasta.data, - reference_name = reference.name, - runtime_attributes = default_runtime_attributes - } + scatter (ref in references) { + scatter (hap in gfa2fa.zipped_fasta) { + call align_hifiasm { + input: + sample_id = sample_id, + query_sequences = hap, + reference = ref.fasta.data, + reference_name = ref.name, + runtime_attributes = default_runtime_attributes + } + + IndexData sample_aligned_bam = { + "data": align_hifiasm.asm_bam, + "data_index": align_hifiasm.asm_bam_index + } + + Pair[ReferenceData,IndexData] align_data = (ref, sample_aligned_bam) + } + + Array[File] bamlist = align_hifiasm.asm_bam + + call merge_haps { + input: + sample_id = sample_id, + bams = bamlist, + refname = ref.name, + runtime_attributes = default_runtime_attributes + } + + + } output { Array[File] assembly_noseq_gfas = hifiasm_assemble.assembly_noseq_gfas Array[File] assembly_lowQ_beds = hifiasm_assemble.assembly_lowQ_beds Array[File] zipped_assembly_fastas = gfa2fa.zipped_fasta Array[File] assembly_stats = gfa2fa.assembly_stats - IndexData asm_bam = {"data": align_hifiasm.asm_bam, "data_index": align_hifiasm.asm_bam_index} + Array[IndexData] asm_bams = flatten(sample_aligned_bam) + Array[IndexData] merged_bams = merge_haps.merged_bam + Array[Pair[ReferenceData,IndexData]] alignments = flatten(align_data) } parameter_meta { sample_id: {help: "Sample ID; used for naming files"} reads_fastas: {help: "Reads in fasta format to be used for assembly; one for each movie bam to be used in assembly. 
Reads fastas from one or more sample may be combined to use in the assembly"} - reference: {help: "Reference genome data"} + references: {help: "Array of Reference genomes data"} hiiasm_extra_params: {help: "[OPTIONAL] Additional parameters to pass to hifiasm assembly"} father_yak: {help: "[OPTIONAL] kmer counts for the father; required if running trio-based assembly"} mother_yak: {help: "[OPTIONAL] kmer counts for the mother; required if running trio-based assembly"} @@ -88,11 +112,13 @@ task hifiasm_assemble { String prefix = "~{sample_id}.asm" Int threads = 48 Int mem_gb = threads * 6 - Int disk_size = ceil((size(reads_fastas[0], "GB") * length(reads_fastas)) * 4 + 20) + Int disk_size = ceil(size(reads_fastas, "GB") * 4 + 20) command <<< set -euo pipefail + echo "hifiasm version: $(hifiasm --version)" + hifiasm \ -o ~{prefix} \ -t ~{threads} \ @@ -132,8 +158,6 @@ task gfa2fa { input { File gfa - File reference_index - RuntimeAttributes runtime_attributes } @@ -144,24 +168,32 @@ task gfa2fa { command <<< set -euo pipefail + echo "gfatools version: $(gfatools version)" + gfatools gfa2fa \ ~{gfa} \ > ~{gfa_basename}.fasta + bgzip --version + bgzip \ --threads ~{threads} \ --stdout \ ~{gfa_basename}.fasta \ > ~{gfa_basename}.fasta.gz + echo "calN50.js version: $(k8 /opt/calN50/calN50.js -v)" + # Calculate assembly stats k8 \ /opt/calN50/calN50.js \ - -f ~{reference_index} \ + -L3.1g \ ~{gfa_basename}.fasta.gz \ > ~{gfa_basename}.fasta.stats.txt >>> + + output { File zipped_fasta = "~{gfa_basename}.fasta.gz" File assembly_stats = "~{gfa_basename}.fasta.stats.txt" @@ -184,7 +216,7 @@ task gfa2fa { task align_hifiasm { input { String sample_id - Array[File] query_sequences + File query_sequences File reference String reference_name @@ -193,40 +225,48 @@ task align_hifiasm { } Int threads = 16 - Int disk_size = ceil((size(query_sequences[0], "GB") * length(query_sequences) + size(reference, "GB")) * 2 + 20) + Int mem_gb = threads * 8 + Int disk_size = ceil((size(query_sequences, "GB") + size(reference, "GB")) * 2 + 20) command <<< - set -euo pipefail + + + echo "minimap2 version: $(minimap2 --version)" + haplotype=$(basename ~{query_sequences} | sed -n 's/.*\(hap.\).*/\1/p') + echo "$haplotype" > hap.txt + + samtools --version minimap2 \ -t ~{threads - 4} \ -L \ --secondary=no \ --eqx \ + --cs \ -a \ -x asm5 \ -R "@RG\\tID:~{sample_id}_hifiasm\\tSM:~{sample_id}" \ ~{reference} \ - ~{sep=' ' query_sequences} \ + ~{query_sequences} \ | samtools sort \ - -@ 4 \ + -@ 3 \ -T ./TMP \ -m 8G \ -O BAM \ - -o ~{sample_id}.asm.~{reference_name}.bam + -o "~{sample_id}.$haplotype.asm.~{reference_name}.bam" - samtools index ~{sample_id}.asm.~{reference_name}.bam + samtools index "~{sample_id}.$haplotype.asm.~{reference_name}.bam" >>> output { - File asm_bam = "~{sample_id}.asm.~{reference_name}.bam" - File asm_bam_index = "~{sample_id}.asm.~{reference_name}.bam.bai" + File asm_bam = glob("*.bam")[0] + File asm_bam_index = glob("*.bam.bai")[0] } runtime { - docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:3968cb152a65163005ffed46297127536701ec5af4c44e8f3e7051f7b01f80fe" + docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:0e8ad680b0e89376eb94fa8daa1a0269a4abe695ba39523a5c56a59d5c0e3953" cpu: threads - memory: "128 GB" + memory: mem_gb + " GB" disk: disk_size + " GB" disks: "local-disk " + disk_size + " HDD" preemptible: runtime_attributes.preemptible_tries @@ -236,3 +276,100 @@ task align_hifiasm { zones: runtime_attributes.zones } } + +task merge_haps { + input { + 
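+		# Per-haplotype assembly BAMs (hap1 and hap2) aligned to a single reference by align_hifiasm;
+		# this task merges them into one BAM per reference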
Array[File] bams + String sample_id + String refname + + RuntimeAttributes runtime_attributes + } + + Int threads = 3 + Int disk_size = 20 + Int mem_gb = threads * 8 + + command <<< + + samtools merge \ + -@3 \ + -b \ + -o ~{sample_id}.asm.~{refname}.bam \ + ~{sep=' ' bams} + + samtools index ~{sample_id}.asm.~{refname}.bam + + + >>> + + output { + IndexData merged_bam = {"data": "~{sample_id}.asm.~{refname}.bam", + "data_index": "~{sample_id}.asm.~{refname}.bam.bai"} + } + + runtime { + docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:0e8ad680b0e89376eb94fa8daa1a0269a4abe695ba39523a5c56a59d5c0e3953" + cpu: threads + memory: mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} + +task paftools { + input { + File bam + File bam_index + + File reference + + String sample + + RuntimeAttributes runtime_attributes + } + + String bam_basename = basename(bam, ".bam") + Int threads = 2 + Int disk_size = ceil((size(bam, "GB") + size(reference, "GB")) * 3 + 20) + Int mem_gb = threads * 8 + + command <<< + set -euo pipefail + + k8 /opt/minimap2-2.17/misc/paftools.js version + + samtools view -h ~{bam} | \ + k8 /opt/minimap2-2.17/misc/paftools.js sam2paf - | \ + sort -k6,6 -k8,8n | \ + k8 /opt/minimap2-2.17/misc/paftools.js call \ + -L5000 \ + -f ~{reference} \ + -s ~{sample} \ + - \ + > ~{bam_basename}.paftools.vcf + + >>> + + output { + File paftools_vcf = "~{bam_basename}.paftools.vcf" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:0e8ad680b0e89376eb94fa8daa1a0269a4abe695ba39523a5c56a59d5c0e3953" + cpu: threads + memory: mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} \ No newline at end of file diff --git a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl index 72f7957..bae1906 100644 --- a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl +++ b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl @@ -12,7 +12,7 @@ workflow de_novo_assembly_sample { input { Sample sample - ReferenceData reference + Array[ReferenceData] references String backend RuntimeAttributes default_runtime_attributes @@ -31,95 +31,63 @@ workflow de_novo_assembly_sample { input: sample_id = sample.sample_id, reads_fastas = samtools_fasta.reads_fasta, - reference = reference, - hifiasm_extra_params = "", + references = references, backend = backend, default_runtime_attributes = default_runtime_attributes, on_demand_runtime_attributes = on_demand_runtime_attributes } - call htsbox { - input: - bam = assemble_genome.asm_bam.data, - bam_index = assemble_genome.asm_bam.data_index, - reference = reference.fasta.data, - runtime_attributes = default_runtime_attributes - } + scatter (aln in assemble_genome.alignments) { + ReferenceData ref = aln.left + IndexData bam = aln.right - call ZipIndexVcf.zip_index_vcf { - input: - vcf = htsbox.htsbox_vcf, - runtime_attributes = default_runtime_attributes - } + call AssembleGenome.paftools { + input: + 
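+				# Call variants from the haplotype assembly-to-reference alignment with paftools.js
+				# (replaces the previous htsbox pileup step)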
bam = bam.data, + sample = sample.sample_id, + bam_index = bam.data_index, + reference = ref.fasta.data, + runtime_attributes = default_runtime_attributes + } + + + call ZipIndexVcf.zip_index_vcf { + input: + vcf = paftools.paftools_vcf, + runtime_attributes = default_runtime_attributes + } + + IndexData paftools_vcf = { + "data": zip_index_vcf.zipped_vcf, + "data_index": zip_index_vcf.zipped_vcf_index + } + + call BcftoolsStats.bcftools_stats { + input: + vcf = zip_index_vcf.zipped_vcf, + params = "--samples ~{sample.sample_id}", + reference = ref.fasta.data, + runtime_attributes = default_runtime_attributes + } - call BcftoolsStats.bcftools_stats { - input: - vcf = zip_index_vcf.zipped_vcf, - params = "--samples ~{basename(assemble_genome.asm_bam.data)}", - reference = reference.fasta.data, - runtime_attributes = default_runtime_attributes } + output { Array[File] assembly_noseq_gfas = assemble_genome.assembly_noseq_gfas Array[File] assembly_lowQ_beds = assemble_genome.assembly_lowQ_beds Array[File] zipped_assembly_fastas = assemble_genome.zipped_assembly_fastas Array[File] assembly_stats = assemble_genome.assembly_stats - IndexData asm_bam = assemble_genome.asm_bam - IndexData htsbox_vcf = {"data": zip_index_vcf.zipped_vcf, "data_index": zip_index_vcf.zipped_vcf_index} - File htsbox_vcf_stats = bcftools_stats.stats + Array[IndexData] merged_bams = assemble_genome.merged_bams + + Array[IndexData] paftools_vcfs = paftools_vcf + Array[File] paftools_vcf_stats = bcftools_stats.stats } parameter_meta { sample: {help: "Sample information and associated data files"} - reference: {help: "Reference genome data"} + references: {help: "Array of Reference genomes data"} default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"} } -} - -task htsbox { - input { - File bam - File bam_index - - File reference - - RuntimeAttributes runtime_attributes - } - - String bam_basename = basename(bam, ".bam") - Int threads = 2 - Int disk_size = ceil((size(bam, "GB") + size(reference, "GB")) * 3 + 200) - - command <<< - set -euo pipefail - - # Ensure the sample is named based on the bam basename (not the full path) - cp ~{bam} . 
- - htsbox pileup \ - -q20 \ - -c \ - -f ~{reference} \ - ~{basename(bam)} \ - > ~{bam_basename}.htsbox.vcf - >>> - - output { - File htsbox_vcf = "~{bam_basename}.htsbox.vcf" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/htsbox@sha256:740b7962584a582757ee9601719fa98403517db669037bc3946e9ecc5f970654" - cpu: threads - memory: "14 GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} +} \ No newline at end of file diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 38431c8..b11e5a0 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -6,12 +6,14 @@ version 1.0 import "../assembly_structs.wdl" import "../wdl-common/wdl/tasks/samtools_fasta.wdl" as SamtoolsFasta import "../assemble_genome/assemble_genome.wdl" as AssembleGenome +import "../wdl-common/wdl/tasks/zip_index_vcf.wdl" as ZipIndexVcf +import "../wdl-common/wdl/tasks/bcftools_stats.wdl" as BcftoolsStats workflow de_novo_assembly_trio { input { Cohort cohort - ReferenceData reference + Array[ReferenceData] references String backend RuntimeAttributes default_runtime_attributes @@ -41,13 +43,6 @@ workflow de_novo_assembly_trio { } } - call yak_count as yak_count_father { - input: - sample_id = father.sample_id, - reads_fastas = samtools_fasta_father.reads_fasta, - runtime_attributes = default_runtime_attributes - } - scatter (movie_bam in mother.movie_bams) { call SamtoolsFasta.samtools_fasta as samtools_fasta_mother { input: @@ -56,10 +51,32 @@ workflow de_novo_assembly_trio { } } + # if parental coverage is low (<15x), keep singleton kmers from parents and use them to bin child reads + # if parental coverage is high (>=15x), use bloom filter and require that a kmer occur >= 5 times in + # one parent and <2 times in the other parent to be used for binning + # 60GB uncompressed FASTA ~= 10x coverage (this is not robust to big changes in mean read length) + # memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage) + Boolean low_depth = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false + + String yak_params = if (low_depth) then "-b0" else "-b37" + Int yak_mem_gb = if (low_depth) then 70 else 50 + String hifiasm_extra_params = if (low_depth) then "-c1 -d1" else "-c2 -d5" + + call yak_count as yak_count_father { + input: + sample_id = father.sample_id, + reads_fastas = samtools_fasta_father.reads_fasta, + yak_params = yak_params, + mem_gb = yak_mem_gb, + runtime_attributes = default_runtime_attributes + } + call yak_count as yak_count_mother { input: sample_id = mother.sample_id, reads_fastas = samtools_fasta_mother.reads_fasta, + yak_params = yak_params, + mem_gb = yak_mem_gb, runtime_attributes = default_runtime_attributes } @@ -84,14 +101,50 @@ workflow de_novo_assembly_trio { input: sample_id = "~{cohort.cohort_id}.~{child.sample_id}", reads_fastas = samtools_fasta_child.reads_fasta, - reference = reference, - hifiasm_extra_params = "-c1 -d1", + references = references, + hifiasm_extra_params = hifiasm_extra_params, father_yak = yak_count_father.yak, 
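+			# father_yak/mother_yak are parental yak k-mer databases; hifiasm uses them to bin the child's reads by haplotype (trio binning)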
mother_yak = yak_count_mother.yak, backend = backend, default_runtime_attributes = default_runtime_attributes, on_demand_runtime_attributes = on_demand_runtime_attributes } + + scatter (aln in assemble_genome.alignments) { + ReferenceData ref = aln.left + IndexData bam = aln.right + + call AssembleGenome.paftools { + input: + bam = bam.data, + sample = child.sample_id, + bam_index = bam.data_index, + reference = ref.fasta.data, + runtime_attributes = default_runtime_attributes + } + + + call ZipIndexVcf.zip_index_vcf { + input: + vcf = paftools.paftools_vcf, + runtime_attributes = default_runtime_attributes + } + + IndexData paftools_vcf = { + "data": zip_index_vcf.zipped_vcf, + "data_index": zip_index_vcf.zipped_vcf_index + } + + call BcftoolsStats.bcftools_stats { + input: + vcf = zip_index_vcf.zipped_vcf, + params = "--samples ~{child.sample_id}", + reference = ref.fasta.data, + runtime_attributes = default_runtime_attributes + } + + } + } } @@ -101,12 +154,16 @@ workflow de_novo_assembly_trio { Array[Array[File]] assembly_lowQ_beds = flatten(assemble_genome.assembly_lowQ_beds) Array[Array[File]] zipped_assembly_fastas = flatten(assemble_genome.zipped_assembly_fastas) Array[Array[File]] assembly_stats = flatten(assemble_genome.assembly_stats) - Array[IndexData] asm_bams = flatten(assemble_genome.asm_bam) + + Array[Array[IndexData]] merged_bams = flatten(assemble_genome.merged_bams) + Array[Array[IndexData]] paftools_vcfs = flatten(paftools_vcf) + Array[Array[File]] paftools_vcf_stats = flatten(bcftools_stats.stats) + } parameter_meta { cohort: {help: "Sample information for the cohort"} - reference: {help: "Reference genome data"} + references: {help: "Array of Reference genomes data"} default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"} } @@ -122,6 +179,8 @@ task parse_families { command <<< set -euo pipefail + parse_cohort.py --version + parse_cohort.py \ --cohort_json ~{cohort_json} \ --parse_families @@ -132,7 +191,7 @@ task parse_families { } runtime { - docker: "~{runtime_attributes.container_registry}/parse-cohort@sha256:94444e7e3fd151936c9bbcb8a64b6a5e7d8c59de53b256a83f15c4ea203977b4" + docker: "~{runtime_attributes.container_registry}/parse-cohort@sha256:e6a8ac24ada706644e62878178790a0006db9a6abec7a312232052bb0666fe8f" cpu: 2 memory: "4 GB" disk: "20 GB" @@ -150,21 +209,24 @@ task yak_count { String sample_id Array[File] reads_fastas + String yak_params + Int mem_gb + RuntimeAttributes runtime_attributes } - Int threads = 10 - - # Usage up to 140 GB @ 10 threads for Revio samples - Int mem_gb = 16 * threads - Int disk_size = ceil(size(reads_fastas[0], "GB") * length(reads_fastas) * 2 + 20) + Int threads = 24 + Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) command <<< set -euo pipefail + echo "yak version: $(yak version)" + yak count \ -t ~{threads} \ -o ~{sample_id}.yak \ + ~{yak_params} \ ~{sep=' ' reads_fastas} >>> diff --git a/workflows/input_template.json b/workflows/input_template.json new file mode 100644 index 0000000..64e5d62 --- /dev/null +++ b/workflows/input_template.json @@ -0,0 +1,31 @@ +{ + "de_novo_assembly.cohort": { + "cohort_id": "String", + "samples": [ + { + "sample_id": "String", + "movie_bams": "Array[File]", + "sex": "String?", + "father_id": "String?", + "mother_id": "String?", + "run_de_novo_assembly": "Boolean" + } + ], + "run_de_novo_assembly_trio": "Boolean" + }, + 
"de_novo_assembly.references": [ + { + "name": "String", + "fasta": { + "data": "File", + "data_index": "File" + } + } + ], + "de_novo_assembly.zones": "String? (optional); required if backend is set to 'AWS'", + "de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'", + "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional)", + "de_novo_assembly.preemptible": "Boolean", + "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", + "de_novo_assembly.container_registry": "String? (optional)" +} \ No newline at end of file diff --git a/workflows/inputs.aws.json b/workflows/inputs.aws.json deleted file mode 100644 index 457ce7a..0000000 --- a/workflows/inputs.aws.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "de_novo_assembly.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": "Array[File]", - "sex": "String?", - "father_id": "String?", - "mother_id": "String?", - "run_de_novo_assembly": "Boolean" - } - ], - "run_de_novo_assembly_trio": "Boolean" - }, - "de_novo_assembly.reference": { - "name": "GRCh38", - "fasta": { - "data": "s3://dnastack-resources/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "s3://dnastack-resources/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" - } - }, - "de_novo_assembly.backend": "AWS", - "de_novo_assembly.zones": "us-east-2a us-east-2b us-east-2c", - "de_novo_assembly.aws_spot_queue_arn": "", - "de_novo_assembly.aws_on_demand_queue_arn": "", - "de_novo_assembly.preemptible": "Boolean" -} diff --git a/workflows/inputs.azure.json b/workflows/inputs.azure.json deleted file mode 100644 index 64a1911..0000000 --- a/workflows/inputs.azure.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "de_novo_assembly.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": "Array[File]", - "sex": "String?", - "father_id": "String?", - "mother_id": "String?", - "run_de_novo_assembly": "Boolean" - } - ], - "run_de_novo_assembly_trio": "Boolean" - }, - "de_novo_assembly.reference": { - "name": "GRCh38", - "fasta": { - "data": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" - } - }, - "de_novo_assembly.backend": "Azure", - "de_novo_assembly.preemptible": "Boolean" -} diff --git a/workflows/inputs.gcp.json b/workflows/inputs.gcp.json deleted file mode 100644 index f4cc3c5..0000000 --- a/workflows/inputs.gcp.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "de_novo_assembly.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": "Array[File]", - "sex": "String?", - "father_id": "String?", - "mother_id": "String?", - "run_de_novo_assembly": "Boolean" - } - ], - "run_de_novo_assembly_trio": "Boolean" - }, - "de_novo_assembly.reference": { - "name": "GRCh38", - "fasta": { - "data": "gs:///dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "gs:///dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" - } - }, - "de_novo_assembly.backend": "GCP", - "de_novo_assembly.zones": "String", - "de_novo_assembly.preemptible": "Boolean" -} diff --git a/workflows/inputs.hpc.json b/workflows/inputs.hpc.json deleted file mode 100644 index 338e58d..0000000 --- a/workflows/inputs.hpc.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "de_novo_assembly.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": "Array[File]", - "sex": "String?", - 
"father_id": "String?", - "mother_id": "String?", - "run_de_novo_assembly": "Boolean" - } - ], - "run_de_novo_assembly_trio": "Boolean" - }, - "de_novo_assembly.reference": { - "name": "GRCh38", - "fasta": { - "data": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" - } - }, - "de_novo_assembly.backend": "HPC", - "de_novo_assembly.preemptible": false -} diff --git a/workflows/inputs.json b/workflows/inputs.json deleted file mode 100644 index 148b817..0000000 --- a/workflows/inputs.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "de_novo_assembly.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": "Array[File]", - "sex": "String?", - "father_id": "String?", - "mother_id": "String?", - "run_de_novo_assembly": "Boolean" - } - ], - "run_de_novo_assembly_trio": "Boolean" - }, - "de_novo_assembly.reference": { - "name": "String", - "fasta": { - "data": "File", - "data_index": "File" - } - }, - "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", - "de_novo_assembly.zones": "String? (optional); required if backend is set to 'GCP' or 'AWS'", - "de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'", - "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional); required if backend is set to 'AWS'", - "de_novo_assembly.preemptible": "Boolean" -} \ No newline at end of file diff --git a/workflows/main.wdl b/workflows/main.wdl index 03ff494..29f475e 100644 --- a/workflows/main.wdl +++ b/workflows/main.wdl @@ -10,13 +10,14 @@ workflow de_novo_assembly { input { Cohort cohort - ReferenceData reference + Array[ReferenceData] references # Backend configuration String backend String? zones String? aws_spot_queue_arn String? aws_on_demand_queue_arn + String? container_registry Boolean preemptible } @@ -26,7 +27,8 @@ workflow de_novo_assembly { backend = backend, zones = zones, aws_spot_queue_arn = aws_spot_queue_arn, - aws_on_demand_queue_arn = aws_on_demand_queue_arn + aws_on_demand_queue_arn = aws_on_demand_queue_arn, + container_registry = container_registry } RuntimeAttributes default_runtime_attributes = if preemptible then backend_configuration.spot_runtime_attributes else backend_configuration.on_demand_runtime_attributes @@ -36,7 +38,7 @@ workflow de_novo_assembly { call DeNovoAssemblySample.de_novo_assembly_sample { input: sample = sample, - reference = reference, + references = references, backend = backend, default_runtime_attributes = default_runtime_attributes, on_demand_runtime_attributes = backend_configuration.on_demand_runtime_attributes @@ -49,7 +51,7 @@ workflow de_novo_assembly { call DeNovoAssemblyTrio.de_novo_assembly_trio { input: cohort = cohort, - reference = reference, + references = references, backend = backend, default_runtime_attributes = default_runtime_attributes, on_demand_runtime_attributes = backend_configuration.on_demand_runtime_attributes @@ -63,9 +65,10 @@ workflow de_novo_assembly { Array[Array[File]?] assembly_lowQ_beds = de_novo_assembly_sample.assembly_lowQ_beds Array[Array[File]?] zipped_assembly_fastas = de_novo_assembly_sample.zipped_assembly_fastas Array[Array[File]?] assembly_stats = de_novo_assembly_sample.assembly_stats - Array[IndexData?] asm_bam = de_novo_assembly_sample.asm_bam - Array[IndexData?] htsbox_vcf = de_novo_assembly_sample.htsbox_vcf - Array[File?] htsbox_vcf_stats = de_novo_assembly_sample.htsbox_vcf_stats + + Array[Array[IndexData]?] 
merged_bams = de_novo_assembly_sample.merged_bams + Array[Array[IndexData]?] paftools_vcf = de_novo_assembly_sample.paftools_vcfs + Array[Array[File]?] paftools_vcf_stats = de_novo_assembly_sample.paftools_vcf_stats # de_novo_assembly_trio output Array[Map[String, String]]? haplotype_key = de_novo_assembly_trio.haplotype_key @@ -73,16 +76,22 @@ workflow de_novo_assembly { Array[Array[File]]? trio_assembly_lowQ_beds = de_novo_assembly_trio.assembly_lowQ_beds Array[Array[File]]? trio_zipped_assembly_fastas = de_novo_assembly_trio.zipped_assembly_fastas Array[Array[File]]? trio_assembly_stats = de_novo_assembly_trio.assembly_stats - Array[IndexData]? trio_asm_bams = de_novo_assembly_trio.asm_bams + Array[Array[IndexData]]? trio_merged_asm_bams = de_novo_assembly_trio.merged_bams + + Array[Array[IndexData]]? trio_paftools_vcf = de_novo_assembly_trio.paftools_vcfs + Array[Array[File]]? trio_paftools_vcf_stats = de_novo_assembly_trio.paftools_vcf_stats + + } parameter_meta { cohort: {help: "Sample information for the cohort"} - reference: {help: "Reference genome data"} - backend: {help: "Backend where the workflow will be executed ['GCP', 'Azure', 'AWS']"} + references: {help: "Array of Reference genome data"} + backend: {help: "Backend where the workflow will be executed ['GCP', 'Azure', 'AWS', 'HPC']"} zones: {help: "Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'"} aws_spot_queue_arn: {help: "Queue ARN for the spot batch queue; required if backend is set to 'AWS'"} aws_on_demand_queue_arn: {help: "Queue ARN for the on demand batch queue; required if backend is set to 'AWS'"} + container_registry: {help: "Container registry where workflow images are hosted. If left blank, PacBio's public Quay.io registry will be used."} preemptible: {help: "Where possible, run tasks preemptibly"} } } diff --git a/workflows/wdl-common b/workflows/wdl-common index 0b034ff..e37b327 160000 --- a/workflows/wdl-common +++ b/workflows/wdl-common @@ -1 +1 @@ -Subproject commit 0b034ff68b995b8667ff711fdbb40ce803e9892a +Subproject commit e37b3274f6e78a612adeae0e36a104a5752de9f7
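Note on the new output shapes: because the per-sample assembly is wrapped in a conditional and the alignment step now scatters over the `references` array, several top-level outputs become nested optional arrays (outer index = sample, inner index = reference). The following is a minimal illustrative sketch, not part of the workflow; the workflow name, inputs, and `run_assembly` flag are hypothetical stand-ins, shown only to make the `Array[Array[...]?]` shape concrete in WDL 1.0:

version 1.0

workflow nested_output_shape_sketch {
	input {
		# Hypothetical inputs, for illustration only
		Array[String] sample_ids
		Array[String] reference_names
	}

	scatter (sample_id in sample_ids) {
		# Stands in for the per-sample run_de_novo_assembly condition
		Boolean run_assembly = true
		if (run_assembly) {
			scatter (reference_name in reference_names) {
				# Stands in for one merged, reference-aligned BAM per (sample, reference) pair
				String merged_bam_name = "~{sample_id}.asm.~{reference_name}.bam"
			}
		}
	}

	output {
		# Same nesting as de_novo_assembly.merged_bams:
		#   outer array = samples (optional, because the assembly is conditional)
		#   inner array = references
		Array[Array[String]?] merged_bam_names = merged_bam_name
	}
}

A downstream consumer would typically drop the per-sample `None`s with `select_all` before flattening, e.g. `flatten(select_all(merged_bam_names))`.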