From 1173b0de6a7354177daf75af8d03dbd2efb4bfd7 Mon Sep 17 00:00:00 2001 From: skchronicles Date: Thu, 20 Jun 2024 15:34:35 -0400 Subject: [PATCH] Adding genotype refinement rule. --- config/cluster/slurm.json | 5 +++ config/cluster/uge.json | 5 +++ config/genome.json | 2 + workflow/rules/gatk_germline.smk | 74 ++++++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+) diff --git a/config/cluster/slurm.json b/config/cluster/slurm.json index 4a0942f..f86ec89 100644 --- a/config/cluster/slurm.json +++ b/config/cluster/slurm.json @@ -254,6 +254,11 @@ "mem": "48G", "time": "2-00:00:00" }, + "gatk_germline_scatter_genotype_refinement": { + "threads": "2", + "mem": "48G", + "time": "2-00:00:00" + }, "hla": { "threads": "8", "partition": "norm", diff --git a/config/cluster/uge.json b/config/cluster/uge.json index 416b3c2..6dc5339 100644 --- a/config/cluster/uge.json +++ b/config/cluster/uge.json @@ -246,6 +246,11 @@ "partition": "", "mem": "6G" }, + "gatk_germline_scatter_genotype_refinement": { + "mem": "24G", + "partition": "", + "threads": "2" + }, "hla": { "mem": "12G", "partition": "", diff --git a/config/genome.json b/config/genome.json index 7df7660..685e6b2 100644 --- a/config/genome.json +++ b/config/genome.json @@ -55,6 +55,7 @@ "chrX:120000001-150000001", "chrX:150000001-156040895", "chrY:1-30000001", "chrY:30000001-57227415", "chrM:1-16569" ], + "EXAC": "/data/OpenOmics/references/genome-seek/GATK_resource_bundle/hg38bundle/exacv1_grch38_release1_ExAC.r1.sites.liftover.GRCh38.vcf.gz", "GENOME": "/data/OpenOmics/references/genome-seek/Homo_sapiens_assembly38.fasta", "GNOMAD": "/data/OpenOmics/references/genome-seek/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz", "SLIVARGNOMAD": "/data/OpenOmics/references/genome-seek/slivar/gnomad.hg38.v2.zip", @@ -116,6 +117,7 @@ "SPECIES": "homo_sapiens" }, "1000GSNP": "/data/OpenOmics/references/genome-seek/GATK_resource_bundle/hg38bundle/1000G_phase1.snps.high_confidence.hg38.vcf.gz", + "1000G": "/data/OpenOmics/references/genome-seek/1000G/ALL.GRCh38_sites.nuclear.20170504.vcf.gz", "VEP_BUILD": "GRCh38", "VEP_DATA": "/data/OpenOmics/references/genome-seek/VEP/106/cache", "VEP_SPECIES": "homo_sapiens", diff --git a/workflow/rules/gatk_germline.smk b/workflow/rules/gatk_germline.smk index bbe5791..150c57d 100644 --- a/workflow/rules/gatk_germline.smk +++ b/workflow/rules/gatk_germline.smk @@ -628,3 +628,77 @@ rule gatk_germline_apply_vqsr_indels: --truth-sensitivity-filter-level 99.7 \\ -O {output.vcf} """ + + +rule gatk_germline_scatter_genotype_refinement: + """ + Data-processing step to calculate genotype posterior probabilities + given known population genotypes. The tool calculates the posterior + genotype probability for each sample genotype in a given VCF format + callset. This tool uses use extra information like allele frequencies + in relevant populations to further refine the genotype assignments. + @Input: + A recalibrated VCF file in which each variant of the requested + type is annotated with its VQSLOD and marked as filtered if + the score is below the desired quality level containing both + SNPs and INDELs for a given region. + (gather-per-cohort-scatter-across-regions) + @Output: + Genotype refined VCF with the following information: Genotype + posteriors added to the FORMAT fields ("PP"), genotypes and GQ + assigned according to these posteriors, per-site genotype priors + added to the INFO field ("PG"). + """ + input: + vcf = join(workpath, "haplotypecaller", "VCFs", "snp_indel_recal_chunks", "snps_and_indels_recal_variants_{region}.vcf.gz"), + output: + tmp = temp( + join(workpath, "haplotypecaller", "VCFs", "gtype_temp_chunks", "snps_and_indels_recal_refinement_variants_{region}.vcf.gz") + ), + vcf = temp( + join(workpath, "haplotypecaller", "VCFs", "gtype_fixed_chunks", "snps_and_indels_recal_refinement_variants_{region}.GTfix.vcf.gz") + ), + params: + rname = "gatk_gl_scatter_gtype_refine", + # Reference files for GType Refinement + genome = config['references']['GENOME'], + onekg = config['references']['1000G'], + exac = config['references']['EXAC'], + chunk = "{region}", + # For UGE/SGE clusters memory is allocated + # per cpu, so we must calculate total mem + # as the product of threads and memory + memory = lambda _: int( + int(allocated("mem", "gatk_germline_scatter_genotype_refinement", cluster).lower().rstrip('g')) * \ + int(allocated("threads", "gatk_germline_scatter_genotype_refinement", cluster)) + )-2 if run_mode == "uge" \ + else int(allocated("mem", "gatk_germline_scatter_genotype_refinement", cluster).lower().rstrip('g')) - 2, + threads: int(allocated("threads", "gatk_germline_scatter_genotype_refinement", cluster)) + container: config['images']['genome-seek'] + envmodules: + config['tools']['gatk4'], + config['tools']['bcftools'] + shell: """ + # Calculate genotype posterior probabilities + # in relevant populations to further refine + # the genotype assignments. + gatk --java-options '-Xmx{params.memory}g' CalculateGenotypePosteriors \\ + --reference {params.genome} \\ + --use-jdk-inflater --use-jdk-deflater \\ + -V {input.vcf} \\ + -supporting {params.onekg} \\ + -supporting {params.exac} \\ + -O {output.tmp} \\ + -L {params.chunk} + + # Unphase all genotypes and sort by allele + # frequency, example: (1|0 becomes 0/1) + bcftools +setGT \\ + {output.tmp} \\ + -O z \\ + -o {output.vcf} \\ + -- -t a -n u + + # Create an tabix index for the VCF file + tabix -p vcf {output.vcf} + """