Skip to content

Commit

Permalink
Merge pull request #1488 from BRCAChallenge/gnomad_update
Browse files Browse the repository at this point in the history
New functionality to extract the gnomAD flags from the gnomAD VCF by downloading and parsing them
  • Loading branch information
melissacline authored Jan 22, 2024
2 parents 1cd08f7 + 562fa48 commit cb2d088
Show file tree
Hide file tree
Showing 4 changed files with 431 additions and 28 deletions.
55 changes: 55 additions & 0 deletions pipeline/gnomad/variant_scoring/download.parse.gnomad.vcf.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env bash

export BRCA1_CHROM=17
export BRCA2_CHROM=13

export BRCA1_START_HG19=41196312
export BRCA1_END_HG19=41277500
export BRCA2_START_HG19=32889617
export BRCA2_START_HG19=32973809

export BRCA1_START_HG38=43044295
export BRCA1_END_HG38=43125483
export BRCA2_START_HG38=32315474
export BRCA2_END_HG38=32400266


wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.17.vcf.bgz
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.17.vcf.bgz.tbi
bcftools view -r ${BRCA1_CHROM}:${BRCA1_START_HG19}-${BRCA1_END_HG19} gnomad.exomes.r2.1.1.sites.17.vcf.bgz \
-o brca1.gnomAD.exomes.2.1.1.hg19.vcf -Ov

wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.17.vcf.bgz
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.17.vcf.bgz.tbi
bcftools view -r ${BRCA1_CHROM}:${BRCA1_START_HG19}-${BRCA1_END_HG19} gnomad.genomes.r2.1.1.sites.17.vcf.bgz \
-o brca1.gnomAD.genomes.2.1.1.hg19.vcf -Ov

wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.13.vcf.bgz
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.13.vcf.bgz.tbi
bcftools view -r ${BRCA2_CHROM}:${BRCA2_START_HG19}-${BRCA2_END_HG19} gnomad.exomes.r2.1.1.sites.13.vcf.bgz \
-o brca2.gnomAD.exomes.2.1.1.hg19.vcf -Ov

wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.13.vcf.bgz
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.13.vcf.bgz.tbi
bcftools view -r ${BRCA2_CHROM}:${BRCA2_START_HG19}-${BRCA2_END_HG19} gnomad.genomes.r2.1.1.sites.13.vcf.bgz \
-o brca2.gnomAD.genomes.2.1.1.hg19.vcf -Ov

vcf-concat brca1.gnomAD.exomes.2.1.1.hg19.vcf brca2.gnomAD.exomes.2.1.1.hg19.vcf brca1.gnomAD.genomes.2.1.1.hg19.vcf \
brca2.gnomAD.genomes.2.1.1.hg19.vcf > brca.gnomAD.2.1.1.hg19.vcf
extract_from_vcf.py -i brca.gnomAD.2.1.1.hg19.vcf > brca.gnomAD.2.1.1.hg19.flags.tsv
rm *bgz*


wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr17.vcf.bgz
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr17.vcf.bgz.tbi
bcftools view -r chr${BRCA1_CHROM}:${BRCA1_START_HG38}-${BRCA1_END_HG38} gnomad.genomes.v3.1.1.sites.chr17.vcf.bgz \
-Ov -o brca1.gnomAD.genomes.3.1.1.hg38.vcf
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr13.vcf.bgz
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr13.vcf.bgz.tbi
bcftools view -r chr${BRCA2_CHROM}:${BRCA2_START_HG38}-${BRCA2_END_HG38} gnomad.genomes.v3.1.1.sites.chr13.vcf.bgz \
-Ov -o brca2.gnomAD.genomes.3.1.1.hg38.vcf

vcf-concat brca1.gnomAD.genomes.3.1.1.hg38.vcf brca2.gnomAD.genomes.3.1.1.hg38.vcf > brca.gnomAD.3.1.1.hg38.vcf
extract_from_vcf.py -i brca.gnomAD.3.1.1.hg38.vcf > brca.gnomAD.3.1.1.hg38.flags.tsv
rm *bgz*

33 changes: 33 additions & 0 deletions pipeline/gnomad/variant_scoring/extract_from_vcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env python

"""
Description:
Takes in a gnomad table and converts it to vcf format.
"""

import argparse
from pysam import VariantFile

def parse_args():
parser = argparse.ArgumentParser(description="Parse a gnomAD VCF for the BRCA Exchange pipeline")
parser.add_argument('-i', '--input', help="Input VCF file, from gnomAD")
parser.add_argument('-v', '--verbose', action='count', default=False, help='determines logging')
options = parser.parse_args()
return options

def main():
args = parse_args()
vcf_in = VariantFile(args.input)
print("ID\tFilters\tLCR")
for record in vcf_in.fetch():
all_filters = ""
delim = ""
for this_filter in record.filter.keys():
all_filters = all_filters + delim + this_filter
delim = ","
for this_alt in record.alts:
print("%s-%s-%s-%s\t%s\t%d" % (record.chrom, record.pos, record.ref, this_alt,
all_filters, record.info["lcr"]))

if __name__ == "__main__":
main()
97 changes: 97 additions & 0 deletions pipeline/gnomad/variant_scoring/test_variant_scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python
# coding: utf-8

import itertools
import argparse
import csv
from collections import OrderedDict
import sys

def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", default="built_with_popfreq.tsv",
help="Input file with new estimated popfreq")
parser.add_argument("-t", "--test", default="test.tsv",
help="Input file with old estimted popfreq")
args = parser.parse_args()
return(args)

def field_defined(field):
"""
Return a binary value indicating whether or not this field
has a defined value
"""
return(field != "-")

def key_fields(id, variant, evidence_code, faf95, description):
data_this_variant = {
"id" : id,
"hgvs" : variant["pyhgvs_cDNA"],
"evidence_code" : evidence_code,
"faf95": faf95,
"description": description,
"final_code" : variant["Provisional_evidence_code_popfreq"]
}
return(data_this_variant)


def parse_gnomad_annotations(input_file):
v2_new = dict()
v3_new = dict()
for variant in input_file:
if field_defined(variant["Variant_id_GnomAD"]):
v2_id = variant["Variant_id_GnomAD"]
v2_new[v2_id] = key_fields(v2_id, variant,
variant["Provisional_code_GnomAD"],
variant["faf95_popmax_exome_GnomAD"],
variant["Provisional_code_description_GnomAD"])
if field_defined(variant["Variant_id_GnomADv3"]):
v3_id = variant["Variant_id_GnomADv3"]
v3_new[v3_id] = key_fields(v3_id, variant,
variant["Provisional_code_GnomADv3"],
variant["faf95_popmax_genome_GnomADv3"],
variant["Provisional_code_description_GnomADv3"])
return(v2_new, v3_new)

def print_header():
print("\t".join(["gnomAD_id", "gnomAD_version", "hgvs",
"evidence_code_old", "allele_freq_old",
"evidence_code_new", "allele_freq_new", "descr_new",
"final_code_old", "final_code_new"]))

def compare(old_data, version, var_name, new_data):
if not var_name in new_data:
sys.stderr.write("variant %s version %s not found in the new data\n"
% (var_name, version))
else:
print("\t".join([var_name, version, new_data[var_name]["hgvs"],
old_data["evidence_code"],
old_data["AF_popmax"],
new_data[var_name]["evidence_code"],
new_data[var_name]["faf95"],
new_data[var_name]["description"],
old_data["final_code"],
new_data[var_name]["final_code"]]))


def print_comparison(entry, v2, v3):
version = entry["src"]
var_name = entry["var_name"]
if version == "v2":
compare(entry, version, entry["var_name_hg19"], v2)
elif version == "v3":
compare(entry, version, var_name, v3)


def main():
args = parse_args()
input_file = csv.DictReader(open(args.input), delimiter = "\t")
(v2_new, v3_new) = parse_gnomad_annotations(input_file)
test_file = csv.DictReader(open(args.test), delimiter = "\t")
print_header()
for entry in test_file:
print_comparison(entry, v2_new, v3_new)


if __name__ == "__main__":
main()
Loading

0 comments on commit cb2d088

Please sign in to comment.