Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🚧 exome/panel genotyping #53

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/dockers_gatk_genotyping.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ bcftools_concat.cwl|pgc-images.sbgenomics.com/d3b-bixu/vcfutils:latest
bcftools_filter_vcf.cwl|pgc-images.sbgenomics.com/d3b-bixu/bcftools:1.20
bcftools_strip_ann.cwl|pgc-images.sbgenomics.com/d3b-bixu/vcfutils:latest
echtvar_anno.cwl|pgc-images.sbgenomics.com/d3b-bixu/echtvar:0.2.0
filtering_defaults.cwl|None
gatk_applyrecalibration.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_gatherfinalvcf.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_gathertranches.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_gathervcfs.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_import_genotype_filtergvcf_merge.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_genomicsdbimport_genotypegvcfs.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_indelsvariantrecalibrator.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_makesitesonlyvcf.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_selectvariants.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.2.0.0R
gatk_snpsvariantrecalibratorcreatemodel.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_snpsvariantrecalibratorscattered.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
Expand Down
4 changes: 3 additions & 1 deletion docs/dockers_germline_variant.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ echtvar_anno.cwl|pgc-images.sbgenomics.com/d3b-bixu/echtvar:0.2.0
expression_create_index_array.cwl|None
expression_transpose_two_dimension_array.cwl|None
file_to_file_array.cwl|None
filtering_defaults.cwl|None
freebayes.cwl|staphb/freebayes:1.3.6
gatk_applyrecalibration.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_bedtointervallist.cwl|broadinstitute/gatk:4.4.0.0
Expand All @@ -32,12 +33,13 @@ gatk_determinegermlinecontigploidy_case.cwl|broadinstitute/gatk:4.2.0.0
gatk_gatherfinalvcf.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_gathertranches.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_gathervcfs.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_genomicsdbimport_genotypegvcfs.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_germlinecnvcaller_case.cwl|broadinstitute/gatk:4.2.0.0
gatk_haplotypecaller.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.beta.1-3.5
gatk_import_genotype_filtergvcf_merge.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_indelsvariantrecalibrator.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_intervallisttobed.cwl|broadinstitute/gatk:4.4.0.0
gatk_intervallisttools.cwl|broadinstitute/gatk:4.4.0.0
gatk_makesitesonlyvcf.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
gatk_mergevcfs.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.1.1.0
gatk_postprocessgermlinecnvcalls.cwl|broadinstitute/gatk:4.2.0.0
gatk_preprocessintervals.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.2.0.0R
Expand Down
35 changes: 28 additions & 7 deletions subworkflows/kfdrc-gatk-hardfiltering.cwl
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
cwlVersion: v1.0
cwlVersion: v1.2
class: Workflow
id: kfdrc-gatk-hardfiltering
requirements:
- class: StepInputExpressionRequirement
- class: InlineJavascriptRequirement
doc: |-
This workflow performs manual site-level variant filtration on an input VCF using the generic hard-filtering thresholds and example commands in the
[documentation from Broad](https://gatk.broadinstitute.org/hc/en-us/articles/360035531112--How-to-Filter-variants-either-with-VQSR-or-by-hard-filtering#2).
Expand All @@ -9,11 +12,17 @@ doc: |-
Finally the VCFs are merged back together using bcftools concat and returned.

inputs:
input_vcf: {type: 'File', secondaryFiles: [.tbi], doc: "Input VCF containing INDEL and SNP variants"}
input_vcf: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], doc: "Input VCF containing INDEL and SNP variants"}
output_basename: {type: 'string', doc: "String value to use as the base for the filename of the output"}
snp_hardfilters: {type: 'string', doc: "String value of hardfilters to set for SNPs in input_vcf" }
indel_hardfilters: {type: 'string', doc: "String value of hardfilters to set for INDELs in input_vcf" }
snp_filtration_extra_args: {type: 'string?', doc: "Any extra arguments for SNP VariantFiltration" }
indel_filtration_extra_args: {type: 'string?', doc: "Any extra arguments for INDEL VariantFiltration" }
filtration_cpu: { type: 'int?', doc: "CPUs to allocate to GATK VariantFiltration" }
filtration_ram: { type: 'int?', doc: "GB of RAM to allocate to GATK VariantFiltration" }

outputs:
hardfiltered_vcf: {type: 'File', outputSource: bcftools_concat_snps_indels/output}
hardfiltered_vcf: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], outputSource: bcftools_concat_snps_indels/output}

steps:
gatk_selectvariants_snps:
Expand All @@ -34,15 +43,27 @@ steps:
run: ../tools/gatk_variantfiltration.cwl
in:
input_vcf: gatk_selectvariants_snps/output
output_basename: output_basename
selection: {valueFrom: "SNP"}
output_basename:
source: output_basename
valueFrom: |
$(self).snp.filtered
variant_filters: snp_hardfilters
extra_args: snp_filtration_extra_args
max_memory: filtration_ram
cpu: filtration_cpu
out: [output]
gatk_variantfiltration_indels:
run: ../tools/gatk_variantfiltration.cwl
in:
input_vcf: gatk_selectvariants_indels/output
output_basename: output_basename
selection: {valueFrom: "INDEL"}
output_basename:
source: output_basename
valueFrom: |
$(self).indel.filtered
variant_filters: indel_hardfilters
extra_args: indel_filtration_extra_args
max_memory: filtration_ram
cpu: filtration_cpu
out: [output]
bcftools_concat_snps_indels:
run: ../tools/bcftools_concat.cwl
Expand Down
177 changes: 177 additions & 0 deletions subworkflows/kfdrc-gatk-vqsr.cwl
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
# Workflow header for the GATK VQSR subworkflow.
cwlVersion: v1.2
class: Workflow
id: kfdrc-gatk-vqsr
doc: |-
  GATK workflow for Variant Quality Score Recalibration (VQSR)
requirements:
  # Several steps in this workflow declare `scatter`.
  - class: ScatterFeatureRequirement
  # NOTE(review): no JavaScript expression is visible in this workflow's own
  # step inputs; kept as declared -- confirm whether it is still needed.
  - class: InlineJavascriptRequirement
  # Step inputs use `valueFrom` (e.g. the fixed ExcessHet filter string).
  - class: StepInputExpressionRequirement

inputs:
  # Array of jointly genotyped VCFs; every file must carry a .tbi index.
  genotyped_vcfs: {type: 'File[]', secondaryFiles: [{pattern: '.tbi', required: true}], doc: "Input VCF that has been jointly genotyped"}
  output_basename: {type: 'string', doc: "String value to use as the base for the filename of the output"}

  # Resource VCFs consumed by the recalibration steps. Each declares a required
  # index (.tbi, or .idx for the uncompressed dbSNP VCF) and a platform
  # suggested file via sbg:suggestedValue.
  axiomPoly_resource_vcf: {type: File, secondaryFiles: [{pattern: '.tbi', required: true}], doc: 'Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz',
    "sbg:suggestedValue": {class: File, path: 60639016357c3a53540ca7c7, name: Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz,
      secondaryFiles: [{class: File, path: 6063901d357c3a53540ca81b, name: Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi}]}}
  dbsnp_vcf: {type: File, secondaryFiles: [{pattern: '.idx', required: true}], doc: 'Homo_sapiens_assembly38.dbsnp138.vcf', "sbg:suggestedValue": {
      class: File, path: 6063901f357c3a53540ca84b, name: Homo_sapiens_assembly38.dbsnp138.vcf, secondaryFiles: [{class: File, path: 6063901e357c3a53540ca834,
          name: Homo_sapiens_assembly38.dbsnp138.vcf.idx}]}}
  hapmap_resource_vcf: {type: File, secondaryFiles: [{pattern: '.tbi', required: true}], doc: 'Hapmap genotype SNP input vcf', "sbg:suggestedValue": {
      class: File, path: 60639016357c3a53540ca7be, name: hapmap_3.3.hg38.vcf.gz, secondaryFiles: [{class: File, path: 60639016357c3a53540ca7c5,
          name: hapmap_3.3.hg38.vcf.gz.tbi}]}}
  mills_resource_vcf: {type: File, secondaryFiles: [{pattern: '.tbi', required: true}], doc: 'Mills_and_1000G_gold_standard.indels.hg38.vcf.gz',
    "sbg:suggestedValue": {class: File, path: 6063901a357c3a53540ca7f3, name: Mills_and_1000G_gold_standard.indels.hg38.vcf.gz, secondaryFiles: [
        {class: File, path: 6063901c357c3a53540ca806, name: Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi}]}}
  omni_resource_vcf: {type: File, secondaryFiles: [{pattern: '.tbi', required: true}], doc: '1000G_omni2.5.hg38.vcf.gz', "sbg:suggestedValue": {
      class: File, path: 6063901e357c3a53540ca835, name: 1000G_omni2.5.hg38.vcf.gz, secondaryFiles: [{class: File, path: 60639016357c3a53540ca7b1,
          name: 1000G_omni2.5.hg38.vcf.gz.tbi}]}}
  one_thousand_genomes_resource_vcf: {type: File, secondaryFiles: [{pattern: '.tbi', required: true}], doc: '1000G_phase1.snps.high_confidence.hg38.vcf.gz,
      high confidence snps', "sbg:suggestedValue": {class: File, path: 6063901c357c3a53540ca80f, name: 1000G_phase1.snps.high_confidence.hg38.vcf.gz,
      secondaryFiles: [{class: File, path: 6063901e357c3a53540ca845, name: 1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi}]}}

  # VariantRecalibrator tuning. The *_max_gaussians inputs are optional; the
  # tranche, annotation and filter-level inputs are required.
  snp_max_gaussians: {type: 'int?', doc: "Integer value for max gaussians in SNP VariantRecalibration. If a dataset gives fewer variants
      than the expected scale, the number of Gaussians for training should be turned down. Lowering the max-Gaussians forces the program
      to group variants into a smaller number of clusters, which results in more variants per cluster."}
  indel_max_gaussians: {type: 'int?', doc: "Integer value for max gaussians in INDEL VariantRecalibration. If a dataset gives fewer
      variants than the expected scale, the number of Gaussians for training should be turned down. Lowering the max-Gaussians forces
      the program to group variants into a smaller number of clusters, which results in more variants per cluster."}
  snp_tranches: { type: 'string[]', doc: "The levels of truth sensitivity at which to slice the SNP recalibration data, in percent." }
  snp_annotations: { type: 'string[]', doc: "The names of the annotations which should be used for SNP recalibration calculations." }
  indel_tranches: { type: 'string[]', doc: "The levels of truth sensitivity at which to slice the INDEL recalibration data, in percent." }
  indel_annotations: { type: 'string[]', doc: "The names of the annotations which should be used for INDEL recalibration calculations." }
  snp_ts_filter_level: { type: 'float', doc: "The truth sensitivity level at which to start filtering SNP data" }
  indel_ts_filter_level: { type: 'float', doc: "The truth sensitivity level at which to start filtering INDEL data" }

  # Resource Control
  # All optional; each pair is forwarded to the cpu/ram inputs of the step
  # named in its doc string.
  snp_model_cpu: { type: 'int?', doc: "CPUs to allocate to VariantRecalibrator for SNP model creation." }
  snp_model_ram: { type: 'int?', doc: "GB of RAM to allocate to VariantRecalibrator for SNP model creation." }
  indel_recal_cpu: { type: 'int?', doc: "CPUs to allocate to VariantRecalibrator for INDEL recalibration." }
  indel_recal_ram: { type: 'int?', doc: "GB of RAM to allocate to VariantRecalibrator for INDEL recalibration." }
  snp_recal_cpu: { type: 'int?', doc: "CPUs to allocate to VariantRecalibrator for scattered SNP recalibration." }
  snp_recal_ram: { type: 'int?', doc: "GB of RAM to allocate to VariantRecalibrator for scattered SNP recalibration." }
  gathertranche_cpu: { type: 'int?', doc: "CPUs to allocate to GatherTranches." }
  gathertranche_ram: { type: 'int?', doc: "GB of RAM to allocate to GatherTranches." }
  apply_cpu: { type: 'int?', doc: "CPUs to allocate to ApplyVQSR for INDELs and SNPs." }
  apply_ram: { type: 'int?', doc: "GB of RAM to allocate to ApplyVQSR for INDELs and SNPs." }
  gathervcf_cpu: { type: 'int?', doc: "CPUs to allocate to GatherVcfsCloud." }
  gathervcf_ram: { type: 'int?', doc: "GB of RAM to allocate to GatherVcfsCloud." }

outputs:
  # Final VCF taken from the last gather step, with a .tbi index pattern.
  # NOTE(review): the string shorthand is kept as written. In CWL v1.2 an
  # output secondaryFiles entry is presumably optional unless `required: true`
  # is given; the sibling hard-filtering workflow uses
  # {pattern: '.tbi', required: true} -- confirm which behaviour is wanted.
  recalibrated_vcf: { type: 'File', secondaryFiles: ['.tbi'], outputSource: gatk_gatherfinalvcf/output }

steps:
  # Runs the VariantFiltration tool once per input VCF, applying a single
  # fixed filter named ExcessHet for records with ExcessHet > 54.69.
  gatk_filter_excesshet:
    run: ../tools/gatk_variantfiltration.cwl
    scatter: [input_vcf]
    hints:
      - class: 'sbg:AWSInstanceType'
        value: m5.4xlarge
    in:
      input_vcf: genotyped_vcfs
      output_basename:
        valueFrom: 'excesshet_filtered'
      variant_filters:
        valueFrom: '--filter-expression "ExcessHet > 54.69" --filter-name ExcessHet'
    out: [output]

  # Runs the make-sites-only tool once per ExcessHet-filtered VCF.
  gatk_makesitesonlyvcf:
    run: ../tools/gatk_makesitesonlyvcf.cwl
    scatter: [input_vcf]
    hints:
      - class: 'sbg:AWSInstanceType'
        value: m5.4xlarge
    in:
      input_vcf: gatk_filter_excesshet/output
      output_filename:
        valueFrom: 'sites_only.variant_filtered.vcf.gz'
    out: [sites_vcf]

  # Not scattered: receives the whole array of sites-only VCFs at once.
  gatk_gathervcfs:
    run: ../tools/gatk_gathervcfs.cwl
    in:
      input_vcfs: gatk_makesitesonlyvcf/sites_vcf
    out: [output]

  # Builds the SNP model report from the gathered sites-only VCF.
  gatk_snpsvariantrecalibratorcreatemodel:
    run: ../tools/gatk_snpsvariantrecalibratorcreatemodel.cwl
    in:
      dbsnp_resource_vcf: dbsnp_vcf
      hapmap_resource_vcf: hapmap_resource_vcf
      omni_resource_vcf: omni_resource_vcf
      one_thousand_genomes_resource_vcf: one_thousand_genomes_resource_vcf
      sites_only_variant_filtered_vcf: gatk_gathervcfs/output
      max_gaussians: snp_max_gaussians
      tranche: snp_tranches
      annotations: snp_annotations
      cpu: snp_model_cpu
      ram: snp_model_ram
    out: [model_report]

  # INDEL recalibration, run once on the gathered sites-only VCF.
  gatk_indelsvariantrecalibrator:
    run: ../tools/gatk_indelsvariantrecalibrator.cwl
    in:
      axiomPoly_resource_vcf: axiomPoly_resource_vcf
      dbsnp_resource_vcf: dbsnp_vcf
      mills_resource_vcf: mills_resource_vcf
      sites_only_variant_filtered_vcf: gatk_gathervcfs/output
      max_gaussians: indel_max_gaussians
      tranche: indel_tranches
      annotations: indel_annotations
      cpu: indel_recal_cpu
      ram: indel_recal_ram
    out: [recalibration, tranches]

  # SNP recalibration, run once per VCF using the shared model report.
  # NOTE(review): the input is named sites_only_variant_filtered_vcf but is fed
  # gatk_filter_excesshet/output (the full filtered VCFs), not
  # gatk_makesitesonlyvcf/sites_vcf -- confirm this is intentional.
  gatk_snpsvariantrecalibratorscattered:
    run: ../tools/gatk_snpsvariantrecalibratorscattered.cwl
    scatter: [sites_only_variant_filtered_vcf]
    hints:
      - class: 'sbg:AWSInstanceType'
        value: r5.2xlarge
    in:
      sites_only_variant_filtered_vcf: gatk_filter_excesshet/output
      model_report: gatk_snpsvariantrecalibratorcreatemodel/model_report
      hapmap_resource_vcf: hapmap_resource_vcf
      omni_resource_vcf: omni_resource_vcf
      one_thousand_genomes_resource_vcf: one_thousand_genomes_resource_vcf
      dbsnp_resource_vcf: dbsnp_vcf
      max_gaussians: snp_max_gaussians
      tranche: snp_tranches
      annotations: snp_annotations
      cpu: snp_recal_cpu
      ram: snp_recal_ram
    out: [recalibration, tranches]

  # Receives the array of per-VCF SNP tranches files from the scattered step.
  gatk_gathertranches:
    run: ../tools/gatk_gathertranches.cwl
    hints:
      - class: 'sbg:AWSInstanceType'
        value: r5.2xlarge
    in:
      tranches: gatk_snpsvariantrecalibratorscattered/tranches
      cpu: gathertranche_cpu
      ram: gathertranche_ram
    out: [output]

  # dotproduct pairs the i-th filtered VCF with the i-th SNP recalibration.
  # Both arrays come from scatters over gatk_filter_excesshet/output, so their
  # lengths match. The INDEL recalibration/tranches and the gathered SNP
  # tranches are shared by every scatter job.
  gatk_applyrecalibration:
    run: ../tools/gatk_applyrecalibration.cwl
    scatter: [input_vcf, snps_recalibration]
    scatterMethod: dotproduct
    hints:
      - class: 'sbg:AWSInstanceType'
        value: r5.2xlarge
    in:
      indels_recalibration: gatk_indelsvariantrecalibrator/recalibration
      indels_tranches: gatk_indelsvariantrecalibrator/tranches
      input_vcf: gatk_filter_excesshet/output
      snps_recalibration: gatk_snpsvariantrecalibratorscattered/recalibration
      snps_tranches: gatk_gathertranches/output
      snp_ts_filter_level: snp_ts_filter_level
      indel_ts_filter_level: indel_ts_filter_level
      cpu: apply_cpu
      ram: apply_ram
    out: [recalibrated_vcf]

  # Receives all recalibrated VCFs plus output_basename; its output is the
  # workflow's recalibrated_vcf.
  gatk_gatherfinalvcf:
    run: ../tools/gatk_gatherfinalvcf.cwl
    in:
      input_vcfs: gatk_applyrecalibration/recalibrated_vcf
      output_basename: output_basename
      cpu: gathervcf_cpu
      ram: gathervcf_ram
    out: [output]

# Declares the `sbg` prefix used by the sbg:AWSInstanceType hints and the
# sbg:suggestedValue input annotations in this workflow.
$namespaces:
  sbg: https://sevenbridges.com
8 changes: 4 additions & 4 deletions tools/bcftools_concat.cwl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cwlVersion: v1.0
cwlVersion: v1.2
class: CommandLineTool
id: bcftools_concat
requirements:
Expand Down Expand Up @@ -27,8 +27,8 @@ arguments:
tabix $(inputs.output_basename).merged.vcf.gz

inputs:
indel_vcf: { type: 'File', secondaryFiles: ['.tbi'], doc: "VCF file containing INDELs" }
snp_vcf: { type: 'File', secondaryFiles: ['.tbi'], doc: "VCF file containing SNPs" }
indel_vcf: { type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], doc: "VCF file containing INDELs" }
snp_vcf: { type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], doc: "VCF file containing SNPs" }
output_basename: { type: 'string', doc: "String value to use as the base of the output filename" }
ram: { type: 'int?', default: 8, doc: "GB of memory to allocate to this task. default: 8; softcap" }
cpu: { type: 'int?', default: 4, doc: "Number of CPUs to allocate to this task. default: 4" }
Expand All @@ -38,4 +38,4 @@ outputs:
type: 'File'
outputBinding:
glob: '$(inputs.output_basename).merged.vcf.gz'
secondaryFiles: ['.tbi']
secondaryFiles: [{pattern: '.tbi', required: true}]
Loading