-
Notifications
You must be signed in to change notification settings - Fork 32
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1488 from BRCAChallenge/gnomad_update
New functionality to extract the gnomAD flags from the gnomAD VCF by downloading and parsing them
- Loading branch information
Showing
4 changed files
with
431 additions
and
28 deletions.
There are no files selected for viewing
55 changes: 55 additions & 0 deletions
55
pipeline/gnomad/variant_scoring/download.parse.gnomad.vcf.bash
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#!/usr/bin/env bash | ||
|
||
export BRCA1_CHROM=17 | ||
export BRCA2_CHROM=13 | ||
|
||
export BRCA1_START_HG19=41196312 | ||
export BRCA1_END_HG19=41277500 | ||
export BRCA2_START_HG19=32889617 | ||
export BRCA2_START_HG19=32973809 | ||
|
||
export BRCA1_START_HG38=43044295 | ||
export BRCA1_END_HG38=43125483 | ||
export BRCA2_START_HG38=32315474 | ||
export BRCA2_END_HG38=32400266 | ||
|
||
|
||
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.17.vcf.bgz | ||
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.17.vcf.bgz.tbi | ||
bcftools view -r ${BRCA1_CHROM}:${BRCA1_START_HG19}-${BRCA1_END_HG19} gnomad.exomes.r2.1.1.sites.17.vcf.bgz \ | ||
-o brca1.gnomAD.exomes.2.1.1.hg19.vcf -Ov | ||
|
||
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.17.vcf.bgz | ||
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.17.vcf.bgz.tbi | ||
bcftools view -r ${BRCA1_CHROM}:${BRCA1_START_HG19}-${BRCA1_END_HG19} gnomad.genomes.r2.1.1.sites.17.vcf.bgz \ | ||
-o brca1.gnomAD.genomes.2.1.1.hg19.vcf -Ov | ||
|
||
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.13.vcf.bgz | ||
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.13.vcf.bgz.tbi | ||
bcftools view -r ${BRCA2_CHROM}:${BRCA2_START_HG19}-${BRCA2_END_HG19} gnomad.exomes.r2.1.1.sites.13.vcf.bgz \ | ||
-o brca2.gnomAD.exomes.2.1.1.hg19.vcf -Ov | ||
|
||
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.13.vcf.bgz | ||
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/vcf/genomes/gnomad.genomes.r2.1.1.sites.13.vcf.bgz.tbi | ||
bcftools view -r ${BRCA2_CHROM}:${BRCA2_START_HG19}-${BRCA2_END_HG19} gnomad.genomes.r2.1.1.sites.13.vcf.bgz \ | ||
-o brca2.gnomAD.genomes.2.1.1.hg19.vcf -Ov | ||
|
||
vcf-concat brca1.gnomAD.exomes.2.1.1.hg19.vcf brca2.gnomAD.exomes.2.1.1.hg19.vcf brca1.gnomAD.genomes.2.1.1.hg19.vcf \ | ||
brca2.gnomAD.genomes.2.1.1.hg19.vcf > brca.gnomAD.2.1.1.hg19.vcf | ||
extract_from_vcf.py -i brca.gnomAD.2.1.1.hg19.vcf > brca.gnomAD.2.1.1.hg19.flags.tsv | ||
rm *bgz* | ||
|
||
|
||
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr17.vcf.bgz | ||
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr17.vcf.bgz.tbi | ||
bcftools view -r chr${BRCA1_CHROM}:${BRCA1_START_HG38}-${BRCA1_END_HG38} gnomad.genomes.v3.1.1.sites.chr17.vcf.bgz \ | ||
-Ov -o brca1.gnomAD.genomes.3.1.1.hg38.vcf | ||
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr13.vcf.bgz | ||
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/3.1.1/vcf/genomes/gnomad.genomes.v3.1.1.sites.chr13.vcf.bgz.tbi | ||
bcftools view -r chr${BRCA2_CHROM}:${BRCA2_START_HG38}-${BRCA2_END_HG38} gnomad.genomes.v3.1.1.sites.chr13.vcf.bgz \ | ||
-Ov -o brca2.gnomAD.genomes.3.1.1.hg38.vcf | ||
|
||
vcf-concat brca1.gnomAD.genomes.3.1.1.hg38.vcf brca2.gnomAD.genomes.3.1.1.hg38.vcf > brca.gnomAD.3.1.1.hg38.vcf | ||
extract_from_vcf.py -i brca.gnomAD.3.1.1.hg38.vcf > brca.gnomAD.3.1.1.hg38.flags.tsv | ||
rm *bgz* | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#!/usr/bin/env python | ||
|
||
""" | ||
Description: | ||
Takes in a gnomad table and converts it to vcf format. | ||
""" | ||
|
||
import argparse | ||
from pysam import VariantFile | ||
|
||
def parse_args(): | ||
parser = argparse.ArgumentParser(description="Parse a gnomAD VCF for the BRCA Exchange pipeline") | ||
parser.add_argument('-i', '--input', help="Input VCF file, from gnomAD") | ||
parser.add_argument('-v', '--verbose', action='count', default=False, help='determines logging') | ||
options = parser.parse_args() | ||
return options | ||
|
||
def main(): | ||
args = parse_args() | ||
vcf_in = VariantFile(args.input) | ||
print("ID\tFilters\tLCR") | ||
for record in vcf_in.fetch(): | ||
all_filters = "" | ||
delim = "" | ||
for this_filter in record.filter.keys(): | ||
all_filters = all_filters + delim + this_filter | ||
delim = "," | ||
for this_alt in record.alts: | ||
print("%s-%s-%s-%s\t%s\t%d" % (record.chrom, record.pos, record.ref, this_alt, | ||
all_filters, record.info["lcr"])) | ||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#!/usr/bin/env python | ||
# coding: utf-8 | ||
|
||
import itertools | ||
import argparse | ||
import csv | ||
from collections import OrderedDict | ||
import sys | ||
|
||
def parse_args(): | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument("-i", "--input", default="built_with_popfreq.tsv", | ||
help="Input file with new estimated popfreq") | ||
parser.add_argument("-t", "--test", default="test.tsv", | ||
help="Input file with old estimted popfreq") | ||
args = parser.parse_args() | ||
return(args) | ||
|
||
def field_defined(field): | ||
""" | ||
Return a binary value indicating whether or not this field | ||
has a defined value | ||
""" | ||
return(field != "-") | ||
|
||
def key_fields(id, variant, evidence_code, faf95, description): | ||
data_this_variant = { | ||
"id" : id, | ||
"hgvs" : variant["pyhgvs_cDNA"], | ||
"evidence_code" : evidence_code, | ||
"faf95": faf95, | ||
"description": description, | ||
"final_code" : variant["Provisional_evidence_code_popfreq"] | ||
} | ||
return(data_this_variant) | ||
|
||
|
||
def parse_gnomad_annotations(input_file): | ||
v2_new = dict() | ||
v3_new = dict() | ||
for variant in input_file: | ||
if field_defined(variant["Variant_id_GnomAD"]): | ||
v2_id = variant["Variant_id_GnomAD"] | ||
v2_new[v2_id] = key_fields(v2_id, variant, | ||
variant["Provisional_code_GnomAD"], | ||
variant["faf95_popmax_exome_GnomAD"], | ||
variant["Provisional_code_description_GnomAD"]) | ||
if field_defined(variant["Variant_id_GnomADv3"]): | ||
v3_id = variant["Variant_id_GnomADv3"] | ||
v3_new[v3_id] = key_fields(v3_id, variant, | ||
variant["Provisional_code_GnomADv3"], | ||
variant["faf95_popmax_genome_GnomADv3"], | ||
variant["Provisional_code_description_GnomADv3"]) | ||
return(v2_new, v3_new) | ||
|
||
def print_header(): | ||
print("\t".join(["gnomAD_id", "gnomAD_version", "hgvs", | ||
"evidence_code_old", "allele_freq_old", | ||
"evidence_code_new", "allele_freq_new", "descr_new", | ||
"final_code_old", "final_code_new"])) | ||
|
||
def compare(old_data, version, var_name, new_data): | ||
if not var_name in new_data: | ||
sys.stderr.write("variant %s version %s not found in the new data\n" | ||
% (var_name, version)) | ||
else: | ||
print("\t".join([var_name, version, new_data[var_name]["hgvs"], | ||
old_data["evidence_code"], | ||
old_data["AF_popmax"], | ||
new_data[var_name]["evidence_code"], | ||
new_data[var_name]["faf95"], | ||
new_data[var_name]["description"], | ||
old_data["final_code"], | ||
new_data[var_name]["final_code"]])) | ||
|
||
|
||
def print_comparison(entry, v2, v3): | ||
version = entry["src"] | ||
var_name = entry["var_name"] | ||
if version == "v2": | ||
compare(entry, version, entry["var_name_hg19"], v2) | ||
elif version == "v3": | ||
compare(entry, version, var_name, v3) | ||
|
||
|
||
def main(): | ||
args = parse_args() | ||
input_file = csv.DictReader(open(args.input), delimiter = "\t") | ||
(v2_new, v3_new) = parse_gnomad_annotations(input_file) | ||
test_file = csv.DictReader(open(args.test), delimiter = "\t") | ||
print_header() | ||
for entry in test_file: | ||
print_comparison(entry, v2_new, v3_new) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
Oops, something went wrong.