From 1173b0de6a7354177daf75af8d03dbd2efb4bfd7 Mon Sep 17 00:00:00 2001
From: skchronicles <kuhnsa3@gmail.com>
Date: Thu, 20 Jun 2024 15:34:35 -0400
Subject: [PATCH] Adding genotype refinement rule.

---
 config/cluster/slurm.json        |  5 +++
 config/cluster/uge.json          |  5 +++
 config/genome.json               |  2 +
 workflow/rules/gatk_germline.smk | 74 ++++++++++++++++++++++++++++++++
 4 files changed, 86 insertions(+)

diff --git a/config/cluster/slurm.json b/config/cluster/slurm.json
index 4a0942f..f86ec89 100644
--- a/config/cluster/slurm.json
+++ b/config/cluster/slurm.json
@@ -254,6 +254,11 @@
         "mem": "48G",
         "time": "2-00:00:00"
     },
+    "gatk_germline_scatter_genotype_refinement": {
+        "threads": "2",
+        "mem": "48G",
+        "time": "2-00:00:00"
+    },
     "hla": {
         "threads": "8",
         "partition": "norm",
diff --git a/config/cluster/uge.json b/config/cluster/uge.json
index 416b3c2..6dc5339 100644
--- a/config/cluster/uge.json
+++ b/config/cluster/uge.json
@@ -246,6 +246,11 @@
       "partition": "",
       "mem": "6G"
   },
+  "gatk_germline_scatter_genotype_refinement": {
+    "mem": "24G",
+    "partition": "",
+    "threads": "2"
+  },
   "hla": {
     "mem": "12G",
     "partition": "",
diff --git a/config/genome.json b/config/genome.json
index 7df7660..685e6b2 100644
--- a/config/genome.json
+++ b/config/genome.json
@@ -55,6 +55,7 @@
             "chrX:120000001-150000001", "chrX:150000001-156040895", "chrY:1-30000001",
             "chrY:30000001-57227415", "chrM:1-16569"
         ],
+        "EXAC": "/data/OpenOmics/references/genome-seek/GATK_resource_bundle/hg38bundle/exacv1_grch38_release1_ExAC.r1.sites.liftover.GRCh38.vcf.gz",
         "GENOME": "/data/OpenOmics/references/genome-seek/Homo_sapiens_assembly38.fasta",
         "GNOMAD": "/data/OpenOmics/references/genome-seek/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz",
         "SLIVARGNOMAD": "/data/OpenOmics/references/genome-seek/slivar/gnomad.hg38.v2.zip",
@@ -116,6 +117,7 @@
             "SPECIES": "homo_sapiens"
         },
         "1000GSNP": "/data/OpenOmics/references/genome-seek/GATK_resource_bundle/hg38bundle/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
+        "1000G": "/data/OpenOmics/references/genome-seek/1000G/ALL.GRCh38_sites.nuclear.20170504.vcf.gz",
         "VEP_BUILD": "GRCh38",
         "VEP_DATA": "/data/OpenOmics/references/genome-seek/VEP/106/cache",
         "VEP_SPECIES": "homo_sapiens",
diff --git a/workflow/rules/gatk_germline.smk b/workflow/rules/gatk_germline.smk
index bbe5791..150c57d 100644
--- a/workflow/rules/gatk_germline.smk
+++ b/workflow/rules/gatk_germline.smk
@@ -628,3 +628,77 @@ rule gatk_germline_apply_vqsr_indels:
         --truth-sensitivity-filter-level 99.7 \\
         -O {output.vcf}
     """
+
+
+rule gatk_germline_scatter_genotype_refinement:
+    """
+    Data-processing step to calculate genotype posterior probabilities 
+    given known population genotypes. The tool calculates the posterior 
+    genotype probability for each sample genotype in a given VCF format 
+    callset. This tool uses use extra information like allele frequencies
+    in relevant populations to further refine the genotype assignments.
+    @Input:
+        A recalibrated VCF file in which each variant of the requested
+        type is annotated with its VQSLOD and marked as filtered if
+        the score is below the desired quality level containing both
+        SNPs and INDELs for a given region.
+        (gather-per-cohort-scatter-across-regions)
+    @Output:
+        Genotype refined VCF with the following information: Genotype 
+        posteriors added to the FORMAT fields ("PP"), genotypes and GQ 
+        assigned according to these posteriors, per-site genotype priors 
+        added to the INFO field ("PG").
+    """
+    input:
+        vcf = join(workpath, "haplotypecaller", "VCFs", "snp_indel_recal_chunks", "snps_and_indels_recal_variants_{region}.vcf.gz"),
+    output:
+        tmp = temp(
+            join(workpath, "haplotypecaller", "VCFs", "gtype_temp_chunks", "snps_and_indels_recal_refinement_variants_{region}.vcf.gz")
+        ),
+        vcf = temp(
+            join(workpath, "haplotypecaller", "VCFs", "gtype_fixed_chunks", "snps_and_indels_recal_refinement_variants_{region}.GTfix.vcf.gz")
+        ),
+    params:
+        rname  = "gatk_gl_scatter_gtype_refine",
+        # Reference files for GType Refinement
+        genome = config['references']['GENOME'],
+        onekg  = config['references']['1000G'],
+        exac   = config['references']['EXAC'],
+        chunk  = "{region}",
+        # For UGE/SGE clusters memory is allocated
+        # per cpu, so we must calculate total mem
+        # as the product of threads and memory
+        memory = lambda _: int(
+            int(allocated("mem", "gatk_germline_scatter_genotype_refinement", cluster).lower().rstrip('g')) * \
+            int(allocated("threads", "gatk_germline_scatter_genotype_refinement", cluster)) 
+        )-2 if run_mode == "uge" \
+        else int(allocated("mem", "gatk_germline_scatter_genotype_refinement", cluster).lower().rstrip('g')) - 2,
+    threads: int(allocated("threads", "gatk_germline_scatter_genotype_refinement", cluster))
+    container: config['images']['genome-seek']
+    envmodules:
+        config['tools']['gatk4'],
+        config['tools']['bcftools']
+    shell: """
+    # Calculate genotype posterior probabilities
+    # in relevant populations to further refine
+    # the genotype assignments.
+    gatk --java-options '-Xmx{params.memory}g' CalculateGenotypePosteriors \\
+        --reference {params.genome} \\
+        --use-jdk-inflater --use-jdk-deflater \\
+        -V {input.vcf} \\
+        -supporting {params.onekg} \\
+        -supporting {params.exac} \\
+        -O {output.tmp} \\
+        -L {params.chunk}
+
+    # Unphase all genotypes and sort by allele
+    # frequency, example: (1|0 becomes 0/1)
+    bcftools +setGT \\
+        {output.tmp} \\
+        -O z \\
+        -o {output.vcf} \\
+        -- -t a -n u
+
+    # Create an tabix index for the VCF file
+    tabix -p vcf {output.vcf}
+    """