diff --git a/.circleci/config.yml b/.circleci/config.yml index 7bba53737..663262ec8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,6 +1,7 @@ version: 2 jobs: buildweb: + circleci_ip_ranges: true docker: - image: circleci/node:16 - image: circleci/postgres:9.6.2 @@ -38,6 +39,7 @@ jobs: - store_test_results: path: ~/test_reports deploy-dev: + circleci_ip_ranges: true docker: - image: circleci/node:16 steps: @@ -60,6 +62,7 @@ jobs: name: deploying to dev machine command: ~/project/deployment/deploy-dev deploy-beta: + circleci_ip_ranges: true docker: - image: circleci/node:16 steps: @@ -94,8 +97,6 @@ jobs: command: | apk add --no-cache \ py-pip=9.0.0-r1 - pip install \ - docker-compose==1.12.0 - restore_cache: keys: - v1-{{ .Branch }} diff --git a/deployment/deploy-dev b/deployment/deploy-dev index 913e08b14..4eb82b7b4 100755 --- a/deployment/deploy-dev +++ b/deployment/deploy-dev @@ -13,8 +13,9 @@ cd ${WEBSITE} npm run build:prod # deploy (not preserving owner/group) -rsync -rlptD --delete --rsync-path='rsync' build/ ${USER}@${HOST}:/var/www/html/beta -rsync -rlptD --delete --exclude="/uploads" --exclude="/downloads/*" --rsync-path='rsync' django/ ${USER}@${HOST}:/var/www/backend/beta/django +# Force rsync's ssh to use ipv4 to prevent "Cannot assign requested address" error +rsync -rlptD -e 'ssh -4' --delete --rsync-path='rsync' build/ ${USER}@${HOST}:/var/www/html/beta +rsync -rlptD -e 'ssh -4' --delete --exclude="/uploads" --exclude="/downloads/*" --rsync-path='rsync' django/ ${USER}@${HOST}:/var/www/backend/beta/django requirements=$(cat requirements.txt) requirements=$(echo ${requirements}) # drop carriage returns diff --git a/deployment/site_settings/config.beta.js b/deployment/site_settings/config.beta.js index e122ff4f0..e0bbadc00 100644 --- a/deployment/site_settings/config.beta.js +++ b/deployment/site_settings/config.beta.js @@ -5,7 +5,7 @@ baseurl: '/', captcha_key: '', /* reCAPTCHA API key */ maps_key: '', /* Google maps javascript API key */ - backend_url: 'http://brcaexchange-prod.gi.ucsc.edu/backend', + backend_url: 'https://brcaexchange-prod.gi.ucsc.edu/backend', analytics: null, environment: 'beta' }; diff --git a/pipeline/Makefile b/pipeline/Makefile index ba5a3ec64..78b20de62 100755 --- a/pipeline/Makefile +++ b/pipeline/Makefile @@ -75,7 +75,6 @@ endif COMMON_DOCKER_ARGS = --rm -u `id -u ${USER}`:$(DOCKER_GRP) \ -e "DATA_DATE=$(DATA_DATE)" \ - -e "UTA_DB_URL=postgresql://anonymous@0.0.0.0:$(UTA_PORT)/uta/uta_$(UTA_RELEASE_DATE)" \ -e "HGVS_SEQREPO_DIR=$(SEQ_REPO_DIR_DOCKER)/latest" \ -e "PYTHONPATH=/opt/brca-exchange/pipeline" \ --network host \ @@ -142,7 +141,7 @@ test-coverage: ## Running pipeline unit tests with coverage information docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) bash -c 'cd /opt/brca-exchange/pipeline/data && bash getdata && cd /opt/brca-exchange/pipeline && pytest --cov --ignore=splicing/ && coverage html --include="/opt/brca-exchange/pipeline/*" --omit="*/test_*"' -build-release: start-local-uta checkout build-docker setup-files setup-lovd download-resources download-seqrepo start-seqrepo-rest-service run-pipeline variants-by-source ## create new data release +build-release: checkout build-docker setup-files setup-lovd download-resources download-seqrepo start-seqrepo-rest-service run-pipeline variants-by-source ## create new data release variants-by-source: ## postprocessing: compute statistics for changes with respect to the last release docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) python /opt/brca-exchange/pipeline/utilities/variantsBySource.py -i /files/data/output/release/built_with_change_types.tsv -c true @@ -179,7 +178,7 @@ post-release-cmds: cleanup-failed include-release-notes push-docker tag-release setup-data-from-latest-release-tar: setup-files ## sets up brca output dir with data contained in release archive from last release (only data from variant merging onwards) tar -C $(OUT_DIR) -zxf $(PREVIOUS_RELEASE_PATH) -setup-dev-env: start-local-uta build-docker setup-files download-resources ## setup development environment +setup-dev-env: build-docker setup-files download-resources ## setup development environment clean-pyc: ## remove Python file artifacts find . -name '*.pyc' -exec rm -f {} + diff --git a/pipeline/analysis/add_bioinfo_pred.py b/pipeline/analysis/add_bioinfo_pred.py new file mode 100755 index 000000000..a6922dcb1 --- /dev/null +++ b/pipeline/analysis/add_bioinfo_pred.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python + +import argparse +import csv +import re +import sys + +BIOINFO_CODE_ID = "Provisional_Evidence_Code_Bioinfo" +BIOINFO_CODE_DESCR = "Provisional_Evidence_Description_Bioinfo" + + +NO_CODE = "NO_CODE" +PP3 = "PP3" +BP4_BP7 = "BP4,BP7" +BP4 = "BP4" +BP1_STRONG = "BP1_STRONG" +PVS1_CODE = "PVS1" + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input", default="build_final.tsv", + help="built_final") + parser.add_argument("-o", "--output", default="built_with_bioinfo.tsv", + help="version of input file with new columns added") + parser.add_argument("-d", "--debug", action="store_true", default=False, + help="Print debugging info") + args = parser.parse_args() + return(args) + + +def initialize_output_file(input_file, output_filename): + """ + Create an empty output file with the new columns + """ + new_columns = [BIOINFO_CODE_ID, BIOINFO_CODE_DESCR] + input_header_row = input_file.fieldnames + if "change_type" in input_header_row: + idx = input_header_row.index("change_type") + output_header_row = input_header_row[:idx] + new_columns \ + + input_header_row[idx:] + else: + output_header_row = input_header_row + new_columns + output_file = csv.DictWriter(open(output_filename,"w"), + fieldnames=output_header_row, + delimiter = '\t') + output_file.writeheader() + return(output_file) + + +def extract_protein_coordinate(variant): + coordinate = None + hit = re.search("[0-9]+", variant["Protein_Change"]) + if hit: + token = variant["Protein_Change"][hit.start():hit.end()] + pos = int(token) + print("from", variant["Protein_Change"], "derived", pos) + return(pos) + +def inside_functional_domain(variant): + inside_domain = False + pos = extract_protein_coordinate(variant) + if pos: + if variant["Gene_Symbol"] == "BRCA1": + if pos >= 2 and pos <= 99: + inside_domain = True + elif pos >= 503 and pos <= 508: + inside_domain = True + elif pos >= 607 and pos <= 614: + inside_domain = True + elif pos >= 651 and pos <= 656: + inside_domain = True + elif pos >= 1391 and pos <= 1424: + inside_domain = True + elif pos >= 1650 and pos <= 1863: + inside_domain = True + elif variant["Gene_Symbol"] == "BRCA2": + if pos >= 10 and pos <= 40: + inside_domain = True + elif pos >= 1002 and pos <= 1036: + inside_domain = True + elif pos >= 1212 and pos <= 1246: + inside_domain = True + elif pos >= 1422 and pos <= 1453: + inside_domain = True + elif pos >= 1518 and pos <= 1549: + inside_domain = True + elif pos >= 1665 and pos <= 1696: + inside_domain = True + elif pos >= 1837 and pos <= 1871: + inside_domain = True + elif pos >= 1971 and pos <= 2005: + inside_domain = True + elif pos >= 2051 and pos <= 2085: + inside_domain = True + elif pos >= 2481 and pos <= 3186: + inside_domain = True + elif pos >= 3263 and pos <= 3269: + inside_domain = True + elif pos >= 3265 and pos <= 3330: + inside_domain = True + elif pos >= 3381 and pos <= 3385: + inside_domain = True + return(inside_domain) + + + +def estimate_bioinfo_code(variant): + effect = "unknown" + bioinfo_code = NO_CODE + if re.search("=\)$", variant["pyhgvs_Protein"]): + effect = "synonymous_variant" + elif re.search("[A-Z]+[0-9]+[A-Z]+", variant["Protein_Change"]): + effect = "missense_variant" + elif re.search("c\.[0-9]+[+]", variant["pyhgvs_cDNA"]): + effect = "intron_variant" + elif re.search("c\.[0-9]+[-]", variant["pyhgvs_cDNA"]): + effect = "intron_variant" + print("variant", variant["pyhgvs_cDNA"], "protein change", variant["Protein_Change"], variant["pyhgvs_Protein"], "effect", effect) + if variant["result_spliceai"] == "-": + splicing_effect = False + no_splicing_effect = True + else: + splicing_effect = (float(variant["result_spliceai"]) > 0.2) + no_splicing_effect = (float(variant["result_spliceai"]) < 0.1) + if variant["Gene_Symbol"] == "BRCA1": + if variant["BayesDel_nsfp33a_noAF"] == "-": + protein_effect = False + no_protein_effect = True + elif float(variant["BayesDel_nsfp33a_noAF"]) > 0.28: + protein_effect = True + no_prptein_effect = False + elif float(variant["BayesDel_nsfp33a_noAF"]) < 0.15: + protein_effect = False + no_protein_effect = True + else: + protein_effect = False + no_protein_effect = False + if variant["Gene_Symbol"] == "BRCA2": + if variant["BayesDel_nsfp33a_noAF"] == "-": + protein_effect = False + no_protein_effect = True + elif float(variant["BayesDel_nsfp33a_noAF"]) > 0.30: + protein_effect = True + no_prptein_effect = False + elif float(variant["BayesDel_nsfp33a_noAF"]) < 0.18: + protein_effect = False + no_protein_effect = True + else: + protein_effect = False + no_protein_effect = False + inside_domain = inside_functional_domain(variant) + print("effect", effect, "splicing effect", splicing_effect, "inside domain", inside_domain) + if effect == "synonymous_variant": + if splicing_effect: + bioinfo_code = PP3 + elif inside_domain: + bioinfo_code = BP4_BP7 + else: + bioinfo_code = BP1_STRONG + elif effect == "intron_variant": + if splicing_effect: + bioinfo_code = PP3 + else: + bioinfo_code = BP4 + elif effect == "missense_variant": + if splicing_effect: + bioinfo_code = PP3 + elif no_splicing_effect: + if not inside_domain: + bioinfo_code = BP1_STRONG + elif protein_effect: + bioinfo_code = PP3 + elif no_protein_effect: + bioinfo_code = BP4 + else: + if inside_domain and protein_effect: + bioinfo_code = PP3 + return(bioinfo_code) + + +def apply_pvs1_code(variant): + pvs1_code = NO_CODE + protein_hgvs = variant["pyhgvs_Protein"] + stop_added = re.search("Ter", protein_hgvs) + if stop_added: + pvs1_code = PVS1_CODE + return(pvs1_code) + + +def main(): + csv.field_size_limit(sys.maxsize) + args = parse_args() + with open(args.input, 'r') as input_fp: + input_reader = csv.DictReader(input_fp, delimiter = "\t") + writer = initialize_output_file(input_reader, args.output) + for variant in input_reader: + #variant[BIOINFO_CODE_ID] = estimate_bioinfo_code(variant, debug=args.debug) + #pvs1_code = apply_pvs1_code(variant) + variant[BIOINFO_CODE_ID] = "" + variant[BIOINFO_CODE_DESCR] = "" + writer.writerow(variant) + +if __name__ == "__main__": + main() diff --git a/pipeline/analysis/popfreq_assessment.py b/pipeline/analysis/popfreq_assessment.py new file mode 100755 index 000000000..d020c4e29 --- /dev/null +++ b/pipeline/analysis/popfreq_assessment.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python +# coding: utf-8 + +import itertools +import argparse +import csv +from collections import OrderedDict +import math +import numpy as np +import pandas as pd + + +POPFREQ_CODE_ID = "Provisional_Evidence_Code_Popfreq" +POPFREQ_CODE_DESCR = "Provisional_Evidence_Description_Popfreq" + + +BA1 = "BA1 (met)" +BS1 = "BS1 (met)" +BS1_SUPPORTING = "BS1_Supporting (met)" +NO_CODE = "No code met (below threshold)" +NO_CODE_NON_SNV = "No code met for population data (indel)" +PM2_SUPPORTING = "PM2_Supporting (met)" +FAIL_NOT_ASSAYED = "Fail_Not_Assayed" +FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG = "No code met (read depth, flags)" +FAIL_NEEDS_REVIEW = "No code met (needs review)" +FAIL_NEEDS_SOFTWARE_REVIEW = "No code met (needs software review)" + +READ_DEPTH_THRESHOLD_FREQUENT_VARIANT = 20 +READ_DEPTH_THRESHOLD_RARE_VARIANT = 25 + +BA1_MSG = "The highest non-cancer, non-founder population filter allele frequency in gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥20) or gnomAD v3.1 (non-cancer subset, read depth ≥20) is %s in the %s population, which is above the ENIGMA BRCA1/2 VCEP threshold (>0.001) for BA1 (BA1 met)." +BS1_MSG = "The highest non-cancer, non-founder population filter allele frequency in gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥20) or gnomAD v3.1 (non-cancer subset, read depth ≥20) is %s in the %s population, which is above the ENIGMA BRCA1/2 VCEP threshold (>0.0001) for BS1, and below the BA1 threshold (>0.001) (BS1 met)." +BS1_SUPPORTING_MSG = "The highest non-cancer, non-founder population filter allele frequency in gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥20) or gnomAD v3.1 f(non-cancer subset, read depth ≥20) is %s in the %s population which is within the ENIGMA BRCA1/2 VCEP threshold (>0.00002 to ≤ 0.0001) for BS1_Supporting (BS1_Supporting met)." +PM2_SUPPORTING_MSG = "This variant is absent from gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥25) and gnomAD v3.1 (non-cancer subset, read depth ≥25) (PM2_Supporting met)." +NO_CODE_MSG = "This variant is present in gnomAD v2.1 (exomes only, non-cancer subset) or gnomAD v3.1 (non-cancer subset) but is below the ENIGMA BRCA1/2 VCEP threshold >0.00002 for BS1_Supporting (PM2_Supporting, BS1, and BA1 are not met)." +NO_CODE_NON_SNV_MSG = "This [insertion/deletion/large genomic rearrangement] variant was not observed in gnomAD v2.1 (exomes only, non-cancer subset) or gnomAD v3.1 (non-cancer subset), but PM2_Supporting was not applied since recall is suboptimal for this type of variant (PM2_Supporting not met)." +FAIL_NOT_ASSAYED_MSG = "Variant not tested in this dataset" +FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG_MSG = "This variant is present in gnomAD v2.1 (exomes only, non-cancer subset) or gnomAD v3.1 (non-cancer subset) but is not meeting the specified read depths threshold ≥20 OR was flagged as suspect by gnomAD (PM2_Supporting, BS1, and BA1 are not met)." +FAIL_NEEDS_REVIEW_MSG = "No code is met (variant needs review)" +FAIL_NEEDS_SOFTWARE_REVIEW_MSG = "No code is met (variant needs software review)" + +MESSAGES_PER_CODE = { + BA1: BA1_MSG, + BS1: BS1_MSG, + BS1_SUPPORTING: BS1_SUPPORTING_MSG, + NO_CODE: NO_CODE_MSG, + NO_CODE_NON_SNV: NO_CODE_NON_SNV_MSG, + PM2_SUPPORTING: PM2_SUPPORTING_MSG, + FAIL_NOT_ASSAYED: FAIL_NOT_ASSAYED_MSG, + FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG: FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG_MSG, + FAIL_NEEDS_REVIEW: FAIL_NEEDS_REVIEW_MSG, + FAIL_NEEDS_SOFTWARE_REVIEW: FAIL_NEEDS_SOFTWARE_REVIEW_MSG + } + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input", default="built_final.tsv", + help="Input file with variant data") + parser.add_argument("-o", "--output", default="built_with_popfreq.tsv", + help="Output file with new columns") + parser.add_argument("-d", "--data_dir", default="./processed_brca", + help="Directory with the processed files") + parser.add_argument("--debug", action="store_true", default=False, + help="Print debugging info") + args = parser.parse_args() + return(args) + + +def read_flags(flag_data): + flags = {} + for row in flag_data: + flags[row["ID"]] = row + return(flags) + + +def estimate_coverage(start, end, chrom, df_cov): + positions = list(range(start, end+1)) + coverage_this_chrom = df_cov.loc[df_cov["chrom"] == int(chrom)] + positions_this_variant = coverage_this_chrom[coverage_this_chrom["pos"].isin(positions)] + meanval = positions_this_variant["mean"].mean() + medianval = positions_this_variant["median"].median() + return(meanval, medianval) + + + + +def initialize_output_file(input_file, output_filename): + """ + Create an empty output file with the new columns + """ + new_columns = [POPFREQ_CODE_ID, POPFREQ_CODE_DESCR] + input_header_row = input_file.fieldnames + if "change_type" in input_header_row: + idx = input_header_row.index("change_type") + output_header_row = input_header_row[:idx] + new_columns \ + + input_header_row[idx:] + else: + output_header_row = input_header_row + new_columns + output_file = csv.DictWriter(open(output_filename,"w"), + fieldnames=output_header_row, + delimiter = '\t') + output_file.writeheader() + return(output_file) + + +def field_defined(field): + """ + Return a binary value indicating whether or not this variant has the popmax FAF defined + """ + return(field != "-") + + +def is_variant_observable(start, end, chrom, coverage): + # + # Read the coverage at the variant position. If the coverage metrics returned + # are not a number, that means that the variant could not be tested in the + # indicated dataset. + (mean_read_depth, median_read_depth) = estimate_coverage(int(start),int(end), + int(chrom),coverage) + if pd.isna(mean_read_depth) and pd.isna(median_read_depth): + return(False) + else: + return(True) + + +def analyze_one_dataset(faf95_popmax_str, faf95_population, allele_count, is_snv, + read_depth, vcf_filter_flag, debug=True): + # + # Get the coverage data. Rule out error conditions: low coverage, VCF filter flag. + rare_variant = False + if field_defined(faf95_popmax_str): + faf = float(faf95_popmax_str) + if pd.isna(faf): + rare_variant = True + elif faf <= 0.00002: + rare_variant = True + else: + faf = None + rare_variant = True + if rare_variant and read_depth < READ_DEPTH_THRESHOLD_RARE_VARIANT: + return(FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG) + if (not rare_variant) and read_depth < READ_DEPTH_THRESHOLD_FREQUENT_VARIANT: + return(FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG) + if vcf_filter_flag: + return(FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG) + # + # Address the cases where FAF is defined, and the variant is a candidate for a + # evidence code for high population frequency (BA1, BS1, BS1_SUPPORTING) + if not rare_variant: + if faf > 0.001: + return(BA1) + elif faf > 0.0001: + return(BS1) + elif faf > 0.00002: + return(BS1_SUPPORTING) + if rare_variant: + if not field_defined(allele_count): + return(NO_CODE) + elif int(allele_count) > 0: + return(NO_CODE) + else: + if is_snv: + return(PM2_SUPPORTING) + else: + return(NO_CODE_NON_SNV) + return(NEEDS_REVIEW) + + +def analyze_across_datasets(code_v2, faf_v2, faf_popmax_v2, in_v2, + code_v3, faf_v3, faf_popmax_v3, in_v3, + debug=False): + """ + Given the per-dataset evidence codes, generate an overall evidence code + """ + benign_codes = [BA1, BS1, BS1_SUPPORTING] + pathogenic_codes = [PM2_SUPPORTING] + intermediate_codes = [ NO_CODE, NO_CODE_NON_SNV] + ordered_success_codes = benign_codes + intermediate_codes + pathogenic_codes + success_codes = set(ordered_success_codes) + failure_codes = set([FAIL_NOT_ASSAYED, FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG]) + ordered_codes = ordered_success_codes + list(failure_codes) + + # + # First, rule out the case of outright contradictions + if code_v2 in benign_codes and code_v3 in pathogenic_codes \ + or code_v2 in pathogenic_codes and code_v3 in benign_codes: + return(FAIL_NEEDS_REVIEW, FAIL_NEEDS_REVIEW_MSG) + # + # Next, rule out the case where neither dataset has reliable data. + # Arbitrarily use the message for v3, as the newer and presumably more robust. + if code_v2 in failure_codes and code_v3 in failure_codes: + return(code_v3, MESSAGES_PER_CODE[code_v3]) + + # + # At this point, we can assume that at least one dataset has + # reliable data. + # + # Next, if both datasets point to a pathogenic effect, or + # if one points to a pathogenic effect and the other an error, + # then return the pathogenic effect. + if code_v2 in pathogenic_codes and code_v3 in pathogenic_codes: + return(code_v2, MESSAGES_PER_CODE[code_v2]) + elif code_v2 in pathogenic_codes and code_v3 in failure_codes: + return(code_v2, MESSAGES_PER_CODE[code_v2]) + elif code_v3 in pathogenic_codes and code_v2 in failure_codes: + return(code_v3, MESSAGES_PER_CODE[code_v3]) + + # + # Next, if either dataset has an intermediate effect and the other + # is not stronger (i.e. is also intermediate, or pathogenic, or failure), + # return the intermediate effect code. + if code_v2 in intermediate_codes and ordered_codes.index(code_v2) <= ordered_codes.index(code_v3): + return(code_v2, MESSAGES_PER_CODE[code_v2]) + elif code_v3 in intermediate_codes and ordered_codes.index(code_v3) <= ordered_codes.index(code_v2): + return(code_v3, MESSAGES_PER_CODE[code_v3]) + + # + # Now, at least one dataset must have a success code. We can also assume that + # neither is pathogenic (i.e. boht are benign, intermediate or failure). + # In this case, identify and return the stronger code. + if debug: + print("prior to assertions, codes are", code_v2, code_v3) + assert(code_v2 in benign_codes or code_v2 in intermediate_codes or code_v3 in benign_codes or code_v3 in intermediate_codes) + if code_v3 == BA1: + return(BA1, BA1_MSG % (faf_v3, faf_popmax_v3)) + elif code_v2 == BA1: + return(BA1, BA1_MSG % (faf_v2, faf_popmax_v2)) + elif code_v3 == BS1: + return(BS1, BS1_MSG % (faf_v3, faf_popmax_v3)) + elif code_v2 == BS1: + return(BS1, BS1_MSG % (faf_v2, faf_popmax_v2)) + elif code_v3 == BS1_SUPPORTING: + return(BS1_SUPPORTING, BS1_SUPPORTING_MSG % (faf_v3, faf_popmax_v3)) + elif code_v2 == BS1_SUPPORTING: + return(BS1_SUPPORTING, BS1_SUPPORTING_MSG % (faf_v2, faf_popmax_v2)) + elif code_v2 == NO_CODE: + return(code_v2, NO_CODE_MSG) + elif code_v3 == NO_CODE: + return(code_v3, NO_CODE_MSG) + # + # If we get here, there is a hole in the logic + return(FAIL_NEEDS_REVIEW, FAIL_NEEDS_SOFTWARE_REVIEW_MSG) + + +def variant_is_flagged(variant_id, flags): + assert(variant_id in flags) + variant_flagged = False + if flags[variant_id]["Filters"] != "PASS": + variant_flagged = True + return(variant_flagged) + + +def analyze_variant(variant, coverage_v2, coverage_v3, flags_v2, flags_v3, + debug=False): + """ + Analyze a single variant, adding the output columns + """ + # Initialize the output columns. First, check the read depth. If the + # read depth is defined at the variant position, this means that the + # variant could have been observed in principle; set the default to + # PM2_SUPPORTING (indicating that by default, the variant could have been + # observed, but wasn't). If no read depth is defined, this means that + # the variant could not have been observed in the dataset in question + # (for example, a deep intronic variant in an exome dataset). + # In such a case, set the default value to NOT_ASSAYED. + variant_v2_code_id = FAIL_NOT_ASSAYED + variant_v3_code_id = FAIL_NOT_ASSAYED + variant[POPFREQ_CODE_ID] = FAIL_NOT_ASSAYED + variant[POPFREQ_CODE_DESCR] = FAIL_NOT_ASSAYED_MSG + observable_in_v2 = False + observable_in_v3 = False + if is_variant_observable(int(variant["pyhgvs_Hg37_Start"]),int(variant["pyhgvs_Hg37_End"]), + int(variant["Chr"]),coverage_v2): + variant_v2_code_id = PM2_SUPPORTING + variant[POPFREQ_CODE_ID] = PM2_SUPPORTING + variant[POPFREQ_CODE_DESCR] = PM2_SUPPORTING_MSG + observable_in_v2 = True + if is_variant_observable(int(variant["Hg38_Start"]), + int(variant["Hg38_End"]), + int(variant["Chr"]), coverage_v3): + variant_v3_code_id = PM2_SUPPORTING + variant[POPFREQ_CODE_ID] = PM2_SUPPORTING + variant[POPFREQ_CODE_DESCR] = PM2_SUPPORTING_MSG + observable_in_v3 = True + is_snv = (variant["Hg38_Start"] == variant["Hg38_End"]) + if debug: + print("variant is snv:", is_snv) + # + # the gnomAD v2 variant ID is set when the variant is in the genome + # OR exome portion of gnomAD. Focus on variants that are in the exome + # data by making sure that the allele count is defined. The allele + # count is the total number of observations of the variant in the gnomAD + # dataset + variant_in_v2 = False + if (field_defined(variant["Variant_id_GnomAD"]) + and field_defined(variant["Allele_count_exome_GnomAD"]) + and observable_in_v2): + variant_in_v2 = True + (mean_read_depth, + median_read_depth) = estimate_coverage(int(variant["pyhgvs_Hg37_Start"]), + int(variant["pyhgvs_Hg37_End"]), + int(variant["Chr"]),coverage_v2) + read_depth_v2 = min(mean_read_depth, median_read_depth) + if pd.isna(read_depth_v2): + read_depth_v2 = 0 + variant_v2_code_id = analyze_one_dataset(variant["faf95_popmax_exome_GnomAD"], + variant["faf95_popmax_population_exome_GnomAD"], + variant["Allele_count_exome_GnomAD"], + is_snv, read_depth_v2, + variant_is_flagged(variant["Variant_id_GnomAD"], + flags_v2)) + if debug: + print("gnomAD V2 variant", variant["Variant_id_GnomAD"], + "popmax", variant["faf95_popmax_exome_GnomAD"], + "allele count", variant["Allele_count_exome_GnomAD"], + "mean read depth", mean_read_depth, + "median read depth", median_read_depth, "snv", is_snv, + "V2 code", variant_v2_code_id) + variant_in_v3 = False + if (field_defined(variant["Variant_id_GnomADv3"]) and observable_in_v3): + variant_in_v3 = True + (mean_read_depth, + median_read_depth) = estimate_coverage(int(variant["Hg38_Start"]), + int(variant["Hg38_End"]), + int(variant["Chr"]), + coverage_v3) + read_depth_v3 = min(mean_read_depth, median_read_depth) + if pd.isna(read_depth_v3): + read_depth_v3 = 0 + variant_v3_code_id = analyze_one_dataset(variant["faf95_popmax_genome_GnomADv3"], + variant["faf95_popmax_population_genome_GnomADv3"], + variant["Allele_count_genome_GnomADv3"], + is_snv, read_depth_v3, + variant_is_flagged(variant["Variant_id_GnomADv3"], + flags_v3)) + if debug: + print("gnomAD V3 variant", variant["Variant_id_GnomADv3"], + "popmax", variant["faf95_popmax_genome_GnomADv3"], + "allele count", variant["Allele_count_genome_GnomADv3"], + "mean read depth", mean_read_depth, + "median read depth", median_read_depth, "snv", is_snv, + "V3 code", variant_v3_code_id) + (variant[POPFREQ_CODE_ID], + variant[POPFREQ_CODE_DESCR]) = analyze_across_datasets(variant_v2_code_id,variant["faf95_popmax_exome_GnomAD"], + variant["faf95_popmax_population_exome_GnomAD"], + variant_in_v2, variant_v3_code_id, + variant["faf95_popmax_genome_GnomADv3"], + variant["faf95_popmax_population_genome_GnomADv3"], + variant_in_v3) + if debug: + print("consensus code:", variant[POPFREQ_CODE_ID], "msg", + variant[POPFREQ_CODE_DESCR]) + return() + + + +def main(): + args = parse_args() + df_cov2 = pd.read_parquet(args.data_dir + '/df_cov_v2.parquet') + df_cov3 = pd.read_parquet(args.data_dir + '/df_cov_v3.parquet') + flags_v2 = read_flags(csv.DictReader(open(args.data_dir + "/brca.gnomAD.2.1.1.hg19.flags.tsv"), + delimiter = "\t")) + flags_v3 = read_flags(csv.DictReader(open(args.data_dir + "/brca.gnomAD.3.1.1.hg38.flags.tsv"), + delimiter = "\t")) + input_file = csv.DictReader(open(args.input), delimiter = "\t") + output_file = initialize_output_file(input_file, args.output) + for variant in input_file: + analyze_variant(variant, df_cov2, df_cov3, flags_v2, flags_v3, + debug=args.debug) + output_file.writerow(variant) + + +if __name__ == "__main__": + main() + diff --git a/pipeline/field_metadata.tsv b/pipeline/field_metadata.tsv index a64b8604e..f4715623b 100644 --- a/pipeline/field_metadata.tsv +++ b/pipeline/field_metadata.tsv @@ -465,3 +465,7 @@ DP_DL_spliceAI Evidence #COPY "The location of donor loss relative to variant po result_spliceai Evidence #COPY "The maximum of four delta scores (DS_AG, DS_AL, DS_DG, DS_DL) is used as basis for spliceogenicity predictions" 0.0 BayesDel_nsfp33a_noAF Evidence bayesdel_noaf "BayesDel score, computed without allele frequency" 0.55 change_type Internal #SKIP "type of change with respect to last release" "changed_information" +Provisional_Evidence_Code_Popfreq Evidence #COPY "The provisional ACMG population frequency evidence code, by the VCEP rules" "PM2_Supporting (met)" +Provisional_Evidence_Description_Popfreq #Copy "Details on how the provisional ACMG popfreq evidence code was determined" "This variant is absent from gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥25) and gnomAD v3.1 (non-cancer subset, read depth ≥25) (PM2_Supporting met)" +Provisional_Evidence_Code_Bioinfo Evidence #SKIP "The provisional ACMG bioinformatic evidence code, by the VCEP rules" N/A +Provisional_Evidence_Description_Bioinfo Evidence #SKIP "Details on how the provisional ACMG bioinformatic evidence code was determined" NA \ No newline at end of file diff --git a/pipeline/gnomad/variant_scoring/run_pipeline.sh b/pipeline/gnomad/variant_scoring/run_pipeline.sh index a1247f4ed..eac58d788 100755 --- a/pipeline/gnomad/variant_scoring/run_pipeline.sh +++ b/pipeline/gnomad/variant_scoring/run_pipeline.sh @@ -20,7 +20,7 @@ UTA_PORT=50828 UTA_RELEASE_DATE=20171026 SEQ_REPO_DIR_DOCKER=/mnt/seq_repo -COMMON_DOCKER_OPT=" --rm -v ${DATA_DIR}:/mnt --network host -v /data/variant_scoring/code/pipeline:/opt/variant_scoring -v /data/seqrepo:${SEQ_REPO_DIR_DOCKER} -e UTA_DB_URL=postgresql://anonymous@0.0.0.0:${UTA_PORT}/uta/uta_${UTA_RELEASE_DATE} -e HGVS_SEQREPO_DIR=${SEQ_REPO_DIR_DOCKER}/latest" +COMMON_DOCKER_OPT=" --rm -v ${DATA_DIR}:/mnt --network host -v /data/variant_scoring/code/pipeline:/opt/variant_scoring -v /data/seqrepo:${SEQ_REPO_DIR_DOCKER} -e UTA_DB_URL=postgresql://anonymous@0.0.0.0:${UTA_PORT}/uta/uta_${UTA_RELEASE_DATE} -e HGVS_SEQREPO_DIR=${SEQ_REPO_DIR_DOCKER}/2021-01-29" # run process_gnomad_data.py docker run ${COMMON_DOCKER_OPT} variant_scoring bash -c "python /opt/variant_scoring/gnomad/variant_scoring/process_gnomad_data.py --cores 16 --mem-per-core-mb 2400 --gene-config-path /opt/variant_scoring/workflow/gene_config_brca_only.txt /mnt/original_gnomad_data /mnt/processed_brca" diff --git a/pipeline/gnomad/variant_scoring/variant_scoring.py b/pipeline/gnomad/variant_scoring/variant_scoring.py index 16d77cdfb..9e8cdd09f 100755 --- a/pipeline/gnomad/variant_scoring/variant_scoring.py +++ b/pipeline/gnomad/variant_scoring/variant_scoring.py @@ -2,10 +2,12 @@ # coding: utf-8 import itertools + import argparse import csv from collections import OrderedDict import math + import numpy as np import pandas as pd @@ -174,6 +176,73 @@ def analyze_one_dataset(faf95_popmax_str, faf95_population, allele_count, is_snv else: return(NO_CODE_NON_SNV) return(NEEDS_REVIEW) +======= +def extract_variant_scoring_data(df_cov2, df_cov3, df_var2, df_var3, read_depth_thresh, resource_dir): + agg_v2 = process_v2(df_var2, df_cov2, read_depth_thresh, resource_dir) + agg_v3 = process_v3(df_var3, df_cov3, read_depth_thresh) + + df_overall = pd.concat([agg_v2, agg_v3]).reset_index(drop=True) + + # running the actual scoring algorithm and the combined data + df_overall['evidence_code'] = df_overall.apply(determine_evidence_code_per_variant, axis=1) + df_overall = add_final_code_column(df_overall) + + return df_overall.sort_values('var_name') + + +def add_normalization_cols(df, strand_dict, processes=2): + TMP_HGVS_HG38 = 'tmp_hgvs_hg38' + TMP_VAR_OBJ_FIELD = 'tmp_var_obj' + TMP_GENE_SYMBOL = 'Gene_Symbol' + + hgvs_proc = hgvs_utils.HgvsWrapper() + + df[TMP_GENE_SYMBOL] = df['contigName'] + df[TMP_VAR_OBJ_FIELD] = df['var_name'].apply(lambda v: var_name_to_obj(v)) + df[TMP_HGVS_HG38] = df[TMP_VAR_OBJ_FIELD].apply( + lambda v: v.to_hgvs_obj(hgvs_proc.contig_maps[hgvs_utils.HgvsWrapper.GRCh38_Assem])) + + df = utils.parallelize_dataframe(df, _normalize_genomic_fnc(TMP_HGVS_HG38, + 'var_name_right', True, + strand_dict), processes) + df = utils.parallelize_dataframe(df, _normalize_genomic_fnc(TMP_HGVS_HG38, + 'var_name_left', False, + strand_dict), processes) + + def _convert_to_name(converted): + return converted.apply(lambda h: var_obj_to_name(variant_utils.VCFVariant.from_hgvs_obj(h))) + + df['var_name_right'] = _convert_to_name(df['var_name_right']) + df['var_name_left'] = _convert_to_name(df['var_name_left']) + + return df.drop(columns=[TMP_HGVS_HG38, TMP_VAR_OBJ_FIELD, TMP_GENE_SYMBOL]) + + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input_path", help="input directory") + parser.add_argument("-o", "--output_path", help="output directory") + parser.add_argument("-g", "--gene_config_path", help="genes for analysis") + parser.add_argument("-r", "--resource_dir", help="resource dir for lift over (same as for the main pipeline)") + args = parser.parse_args() + return(args) + + +def main(): + args = parse_args() + data_dir = Path(args.input_path) + cfg_df = config.load_config(args.gene_config_path) + df_cov2 = pd.read_parquet(data_dir / 'df_cov_v2.parquet') + df_cov3 = pd.read_parquet(data_dir / 'df_cov_v3.parquet') + + df_var2 = pd.read_parquet(data_dir / 'df_var_v2.parquet') + df_var3 = pd.read_parquet(data_dir / 'df_var_v3.parquet') + + read_depth_thresh = 30 + + df = extract_variant_scoring_data(df_cov2, df_cov3, df_var2, df_var3, read_depth_thresh, Path(args.resource_dir)) + def analyze_across_datasets(code_v2, faf_v2, faf_popmax_v2, in_v2, diff --git a/pipeline/requirements.txt b/pipeline/requirements.txt index 8828be0e9..bf8772350 100644 --- a/pipeline/requirements.txt +++ b/pipeline/requirements.txt @@ -1,7 +1,7 @@ appdirs==1.4.3 appnope==0.1.0 argcomplete==1.10.0 -attrs==19.3.0 +attrs==22.2.0 backcall==0.1.0 backports-abc==0.5 beautifulsoup4==4.8.2 @@ -22,6 +22,7 @@ decorator==4.4.2 distro==1.4.0 docutils==0.16 fake-useragent==0.1.11 +fastparquet==2024.5.0 ga4gh.common==0.0.7 ga4gh.vrs==0.8.4 hgvs==1.5.4 diff --git a/pipeline/workflow/CompileVCFFiles.py b/pipeline/workflow/CompileVCFFiles.py index 8684711e5..48d2ed3d7 100644 --- a/pipeline/workflow/CompileVCFFiles.py +++ b/pipeline/workflow/CompileVCFFiles.py @@ -13,7 +13,7 @@ luigi.auto_namespace(scope=__name__) -from workflow import bayesdel_processing, esp_processing, gnomad_processing, pipeline_common, pipeline_utils +from workflow import analysis, esp_processing, gnomad_processing, pipeline_common, pipeline_utils from workflow.pipeline_common import DefaultPipelineTask, clinvar_method_dir, lovd_method_dir, \ functional_assays_method_dir, data_merging_method_dir, priors_method_dir, priors_filter_method_dir, \ utilities_method_dir, vr_method_dir, splice_ai_method_dir, field_metadata_path, field_metadata_path_additional @@ -773,7 +773,7 @@ def run(self): self.output().path) -@requires(bayesdel_processing.AddBayesdelScores) +@requires(analysis.runPopfreqAssessment) class FindMissingReports(DefaultPipelineTask): def output(self): return luigi.LocalTarget(os.path.join(self.artifacts_dir, "missing_reports.log")) @@ -790,7 +790,7 @@ def run(self): pipeline_utils.check_file_for_contents(self.output().path) -@requires(bayesdel_processing.AddBayesdelScores) +@requires(analysis.runBioinfoPred) class RunDiffAndAppendChangeTypesToOutput(DefaultPipelineTask): def _extract_release_date(self, version_json): with open(version_json, 'r') as f: diff --git a/pipeline/workflow/analysis.py b/pipeline/workflow/analysis.py new file mode 100644 index 000000000..a8d412701 --- /dev/null +++ b/pipeline/workflow/analysis.py @@ -0,0 +1,63 @@ +import os +import subprocess +from pathlib import Path + +import luigi +from luigi.util import requires + +import workflow.pipeline_utils as pipeline_utils +from workflow.pipeline_common import DefaultPipelineTask, data_merging_method_dir, analysis_method_dir +from workflow import bayesdel_processing + + +@requires(bayesdel_processing.AddBayesdelScores) +class DownloadResources(DefaultPipelineTask): + def output(self): + return(luigi.LocalTarget(Path(self.cfg.output_dir) / 'release' / 'artifacts' / 'analysis')) + + + def run(self): + download_url = "https://brcaexchange.org/backend/downloads/variant_scoring/" + self.inputfiles = [ "df_cov_v2.parquet", "df_cov_v3.parquet", + "brca.gnomAD.2.1.1.hg19.flags.tsv", + "brca.gnomAD.3.1.1.hg38.flags.tsv" + ] + self.wdir = self.output().path + if not os.path.exists(self.wdir): + os.mkdir(self.wdir) + os.chdir(self.wdir) + for this_file in self.inputfiles: + pathname = download_url + "/" + this_file + pipeline_utils.download_file_and_display_progress(pathname) + + +@requires(DownloadResources) +class runPopfreqAssessment(DefaultPipelineTask): + def output(self): + return luigi.LocalTarget(Path(self.cfg.output_dir)/'release'/'artifacts'/'built_with_popfreq.tsv') + + def run(self): + os.chdir(analysis_method_dir) + args = ["python", "popfreq_assessment.py", + "--input", bayesdel_processing.AddBayesdelScores().output().path, + "--data_dir", DownloadResources().output().path, + "--output", self.output().path] + pipeline_utils.run_process(args) + pipeline_utils.check_file_for_contents(self.output().path) + +@requires(runPopfreqAssessment) +class runBioinfoPred(DefaultPipelineTask): + def output(self): + return luigi.LocalTarget(Path(self.cfg.output_dir)/'release'/'artifacts'/'built_with_bioinfo_pred.tsv') + + def run(self): + os.chdir(analysis_method_dir) + args = ["python", "add_bioinfo_pred.py", + "--input", runPopfreqAssessment().output().path, + "--output", self.output().path] + pipeline_utils.run_process(args) + pipeline_utils.check_file_for_contents(self.output().path) + + + + diff --git a/pipeline/workflow/pipeline_common.py b/pipeline/workflow/pipeline_common.py index 47f24c4c1..d0d0a5515 100644 --- a/pipeline/workflow/pipeline_common.py +++ b/pipeline/workflow/pipeline_common.py @@ -24,6 +24,7 @@ utilities_method_dir = os.path.abspath('../utilities') vr_method_dir = os.path.abspath('../vr') splice_ai_method_dir = os.path.abspath('../splice_ai') +analysis_method_dir = os.path.abspath('../analysis') field_metadata_path = os.path.abspath(os.path.join('..', 'field_metadata.tsv')) field_metadata_path_additional = os.path.abspath(os.path.join('..', 'field_metadata_additional_fields_variant_output_file.tsv')) diff --git a/pipeline/workflow/tarball_files_discard_list.txt b/pipeline/workflow/tarball_files_discard_list.txt index 4860ac24e..8bf9605df 100644 --- a/pipeline/workflow/tarball_files_discard_list.txt +++ b/pipeline/workflow/tarball_files_discard_list.txt @@ -30,6 +30,7 @@ ./release/artifacts/GnomADv3.vcf ./release/artifacts/LOVDready.vcf ./release/artifacts/LOVD.vcf +./release/artifacts/ready_for_priors.tsv ./release/artifacts/releaseDiff.log ./release/artifacts/right1000_Genomes ./release/artifacts/rightBIC @@ -141,3 +142,5 @@ ./release/artifacts/victor_wdir/slurm.annotate.run_1.start ./release/artifacts/victor_wdir/slurm.annotate.run_1.stop ./release/artifacts/victor_wdir/slurm.annotate.run_1.version +./release/artifacts/built_with_popfreq.tsv +./release/artifacts/built_with_bioinfo_pred.tsv diff --git a/website/content/help/research/acmg-variant-evidence-codes-computational-prediction.md b/website/content/help/research/acmg-variant-evidence-codes-computational-prediction.md new file mode 100644 index 000000000..cc491a929 --- /dev/null +++ b/website/content/help/research/acmg-variant-evidence-codes-computational-prediction.md @@ -0,0 +1,6 @@ +(Coming Soon) + +* #### Provisionally Assigned ((Provisional_Evidence_Code_Bioinfo)) + (Coming Soon) +* #### Description ((Provisional_Evidence_Description_Bioinfo)) + (Coming Soon) diff --git a/website/content/help/research/acmg-variant-evidence-codes-population-frequency.md b/website/content/help/research/acmg-variant-evidence-codes-population-frequency.md new file mode 100644 index 000000000..9f28d28dd --- /dev/null +++ b/website/content/help/research/acmg-variant-evidence-codes-population-frequency.md @@ -0,0 +1,6 @@ +The ClinGen ENIGMA _BRCA1_ and _BRCA2_ Variant Curation Expert Panel (VCEP) has published the definitive [rules](https://cspec.genome.network/cspec/ui/svi/affiliation/50087) for curating BRCA variants. These rules are based on the ACMG/AMP framework but further specialized for the nuances of the BRCA genes and Hereditary Breast and Ovarian Cancer syndrome. One portion of these rules describes how and when population frequencies can translate to specific ACMG evidence for variant curation, considering the frequencies, the type and location of the variant, and technical data that describe the reliability of the observed population frequencies, such as read depth and heuristic flags. To aid in the curation of BRCA variants, by the VCEP as well as other curators, we have derived the **provisional** ACMG population frequency evidence codes for all variants in BRCA Exchange. Note that by ClinGen's procedures, all variant evidence must be reviewed by specialized biocurators as part of the variant curation process. Most of the evidence codes shown here have not yet been reviewed, therefore they are marked as provisional and should be viewed with appropriate caveats. Anyone wishing to verify a provisional evidence code can review the original data from gnomAD together with the [VCEP rules](https://cspec.genome.network/cspec/ui/svi/affiliation/50087) for population frequencies. + +* #### Provisionally Assigned ((Provisional_Evidence_Code_Popfreq)) + The provisional ACMG population frequency evidence code, as estimated according to rules of the ClinGen Variant Curation Expert Panel. +* #### Description ((Provisional_Evidence_Description_Popfreq)) + Details on how the provisional ACMG population frequency evidence code was determined. diff --git a/website/content/help/research/acmg-variant-evidence-codes-provisional-assignment.md b/website/content/help/research/acmg-variant-evidence-codes-provisional-assignment.md new file mode 100644 index 000000000..8eb0b712e --- /dev/null +++ b/website/content/help/research/acmg-variant-evidence-codes-provisional-assignment.md @@ -0,0 +1 @@ +Under the [ACMG/AMP Standards and Guidelines](https://pubmed.ncbi.nlm.nih.gov/25741868/), variant curators can assign specific evidence codes to variants according to their frequency in reference populations. These evidence codes contribute to the curation of the variants as Benign, Pathogenic, Likely Benign, or Likely Pathogenic. diff --git a/website/content/help/research/allele-frequency-gnomad.md b/website/content/help/research/allele-frequency-gnomad.md index 259824ccd..8b27cffb8 100644 --- a/website/content/help/research/allele-frequency-gnomad.md +++ b/website/content/help/research/allele-frequency-gnomad.md @@ -46,7 +46,7 @@ The BRCA Exchange displays the primary subsets used by ExAC and gnomAD (i.e. “ Overall allele frequency (for genomes) according to all studied populations, per gnomAD 3.1 * #### Popmax Filtering Allele Frequency (95% confidence) ((faf95_popmax_genome_GnomADv3)) The 95% confidence bound on the most significant gnomAD genome population frequency (gnomAD 3.1) -* #### AAllele frequency African ((Allele_frequency_genome_AFR_GnomADv3)) +* #### Allele frequency African ((Allele_frequency_genome_AFR_GnomADv3)) Allele Frequency in genomes of African/African American Populations (gnomAD 3.1) * #### Allele frequency Latino ((Allele_frequency_genome_AMR_GnomADv3)) Allele Frequency in genomes of Latino Populations (gnomAD 3.1) diff --git a/website/content/help/research/allele-frequency-reference-sets.md b/website/content/help/research/allele-frequency-reference-sets.md index aaaa70fb2..a0b73ccd3 100644 --- a/website/content/help/research/allele-frequency-reference-sets.md +++ b/website/content/help/research/allele-frequency-reference-sets.md @@ -1,3 +1 @@ -The Allele Frequency Reference Sets tile shows the frequency of a _BRCA1_ or _BRCA2_ variant in a reference population. Allele Frequencies can be helpful in understanding the frequency at which a variant appears in the population, and whether there is a population in which the variant is more common and therefore not likely to increase the risk of disease. Different sets are shown in different subtiles. To view or collapse all subtiles, click the arrows at the top right of the tile on the Variant Details Page. - -BRCA Exchange has standardized on gnomAD as its source of allele frequency reference data. The earlier allele frequency reference sets, ExAC, ESP, and 1000 Genomes, have mostly been subsumed by gnomAD. We have retired the display of these earlier datasets, but still retain their data in our database for historical puposes. +The Allele Frequency Reference Sets tile shows the frequency of a _BRCA1_ or _BRCA2_ variant in a reference population from the gnomAD resource. Allele Frequencies can be helpful in understanding the frequency at which a variant appears in the population, and whether there is a population in which the variant is so common that it's not likely to increase the risk of a much less common disease. Different sets (exome, genome) are shown in different subtiles. To view or collapse all subtiles, click the arrows at the top right of the tile on the Variant Details Page. diff --git a/website/content/help/research/clinical-significance-bic.md b/website/content/help/research/clinical-significance-bic.md deleted file mode 100644 index 5fb9dff0a..000000000 --- a/website/content/help/research/clinical-significance-bic.md +++ /dev/null @@ -1,14 +0,0 @@ -* #### Patient Nationality ((Patient_nationality_BIC)) - The field that indicates nationality of the patient associated with the variant. -* #### Ethnicity ((Ethnicity_BIC)) - The field that provides the ethnicities of the observed patients. -* #### Family members carrying this variant ((Number_of_family_member_carrying_mutation_BIC)) - Number of family members carrying this variant. -* #### Literature Reference ((Literature_citation_BIC)) - Literature Reference\(s\). From BIC. -* #### Allele Origin ((Germline_or_Somatic_BIC)) - Origin of the allele, Germline \(G\) or Somatic \(S\). - -

Variants are classified as either germline or somatic, depending on how they are acquired. Germline variants are genetic changes that we inherit from our parents. Somatic variants are DNA changes that we acquire over our lifetime, often through exposure to pollutants, toxins, radiation and other carcinogens.

- -

Please note that the BIC database is no longer actively curated. A copy of all BIC data has been shared with several other variation databases. Though the National Human Genome Research Institute recommends using ClinVar or BRCA Exchange for updated information on BRCA1 and BRCA2 variants, the BIC database will be maintained to allow historical studies and other uses.

diff --git a/website/django/data/migrations/0058_add_provisional_evidence_fields_to_variant.py b/website/django/data/migrations/0058_add_provisional_evidence_fields_to_variant.py new file mode 100644 index 000000000..5b94b14c7 --- /dev/null +++ b/website/django/data/migrations/0058_add_provisional_evidence_fields_to_variant.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.13 on 2024-04-21 19:06 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('data', '0057_add_splice_ai_fields_and_update_empties'), + ] + + operations = [ + migrations.AddField( + model_name='variant', + name='Provisional_Evidence_Code_Bioinfo', + field=models.TextField(default=''), + ), + migrations.AddField( + model_name='variant', + name='Provisional_Evidence_Code_Popfreq', + field=models.TextField(default=''), + ), + migrations.AddField( + model_name='variant', + name='Provisional_Evidence_Description_Bioinfo', + field=models.TextField(default=''), + ), + migrations.AddField( + model_name='variant', + name='Provisional_Evidence_Description_Popfreq', + field=models.TextField(default=''), + ), + migrations.RunSQL( + """ + DROP MATERIALIZED VIEW IF EXISTS currentvariant; + CREATE MATERIALIZED VIEW currentvariant AS ( + SELECT * FROM "variant" WHERE ( + "id" IN ( SELECT DISTINCT ON ("Genomic_Coordinate_hg38") "id" FROM "variant" ORDER BY "Genomic_Coordinate_hg38" ASC, "Data_Release_id" DESC ) + ) + ); + """ + ) + ] diff --git a/website/django/data/models.py b/website/django/data/models.py index 801c82d63..033e44ce1 100644 --- a/website/django/data/models.py +++ b/website/django/data/models.py @@ -74,6 +74,7 @@ class Variant(models.Model): Variant_in_GnomAD = models.BooleanField(default=False) Variant_in_GnomADv3 = models.BooleanField(default=False) + # These are regular columns Source = models.TextField() URL_ENIGMA = models.TextField() Condition_ID_type_ENIGMA = models.TextField() @@ -506,6 +507,10 @@ class Variant(models.Model): DP_DL_spliceAI = models.TextField(null=True) result_spliceai = models.TextField(null=True) BayesDel_nsfp33a_noAF = models.TextField(null=True) + Provisional_Evidence_Code_Popfreq = models.TextField(default='') + Provisional_Evidence_Description_Popfreq = models.TextField(default='') + Provisional_Evidence_Code_Bioinfo = models.TextField(default='') + Provisional_Evidence_Description_Bioinfo = models.TextField(default='') # Data Versioning Data_Release = models.ForeignKey(DataRelease) @@ -990,6 +995,7 @@ class CurrentVariant(models.Model): Variant_in_GnomAD = models.BooleanField(default=False) Variant_in_GnomADv3 = models.BooleanField(default=False) + # These are regular columns Source = models.TextField() URL_ENIGMA = models.TextField() Condition_ID_type_ENIGMA = models.TextField() @@ -1420,6 +1426,10 @@ class CurrentVariant(models.Model): DP_DL_spliceAI = models.TextField(null=True) result_spliceai = models.TextField(null=True) BayesDel_nsfp33a_noAF = models.TextField(null=True) + Provisional_Evidence_Code_Popfreq = models.TextField(default='') + Provisional_Evidence_Description_Popfreq = models.TextField(default='') + Provisional_Evidence_Code_Bioinfo = models.TextField(default='') + Provisional_Evidence_Description_Bioinfo = models.TextField(default='') # Data Versioning Data_Release = models.ForeignKey(DataRelease) diff --git a/website/js/Releases.js b/website/js/Releases.js index 17fbf90cc..57c8f0b86 100644 --- a/website/js/Releases.js +++ b/website/js/Releases.js @@ -36,10 +36,10 @@ var Releases = React.createClass({ Version {release.name} {moment(release.date, "YYYY-MM-DDTHH:mm:ss").format("DD MMMM YYYY")} {this.getSourceRepresentations(release.sources)} - {release['variants_added']} - {release['variants_classified']} - {release['variants_modified']} - {release['variants_deleted']} + {release['variants_added']} + {release['variants_classified']} + {release['variants_modified']} + {release['variants_deleted']} )); return ( @@ -114,10 +114,10 @@ var Release = React.createClass({

-

{release['variants_added']} new variant{s(release['variants_added'])}

-

{release['variants_classified']} new classification{s(release['variants_classified'])}

-

{release['variants_modified']} changed/updated variant{s(release['variants_modified'])}

-

{release['variants_deleted']} removed variant{s(release['variants_deleted'])}

+

{release['variants_added']} new variant{s(release['variants_added'])}

+

{release['variants_classified']} new classification{s(release['variants_classified'])}

+

{release['variants_modified']} changed/updated variant{s(release['variants_modified'])}

+

{release['variants_deleted']} removed variant{s(release['variants_deleted'])}

diff --git a/website/js/VariantTable.js b/website/js/VariantTable.js index 3ab0bfc7e..4555c5b55 100644 --- a/website/js/VariantTable.js +++ b/website/js/VariantTable.js @@ -169,14 +169,6 @@ const researchModeGroups = [ } }, - {groupTitle: 'Clinical Significance (BIC)', internalGroupName: 'Significance (BIC)', innerCols: [ - {title: 'Patient Nationality', prop: 'Patient_nationality_BIC'}, - {title: 'Ethnicity', prop: 'Ethnicity_BIC'}, - {title: 'Family members carrying this variant', prop: 'Number_of_family_member_carrying_mutation_BIC'}, - {title: 'Literature Reference', prop: 'Literature_citation_BIC', core: true}, - {title: 'Allele Origin', prop: 'Germline_or_Somatic_BIC'}, - ]}, - {groupTitle: 'Computational Predictions', internalGroupName: 'Computational Predictions', innerGroups: [ { source: "BayesDel", @@ -244,6 +236,25 @@ const researchModeGroups = [ ] }, + {groupTitle: 'ACMG Variant Evidence Codes, Provisional Assignment', + innerGroups: [ + { + source: "Population Frequency", + data: [ + {title: "Provisionally Assigned", prop: "Provisional_Evidence_Code_Popfreq", core: true}, + {title: "Description", prop: "Provisional_Evidence_Description_Popfreq", core: true} + ] + }, + { + source: "Computational Prediction", + data: [ + {title: "Provisionally Assigned", prop: "Provisional_Evidence_Code_Bioinfo", core: true}, + {title: "Description", prop: "Provisional_Evidence_Description_Bioinfo", core: true} + ] + }] + }, + + {groupTitle: 'Multifactorial Likelihood Analysis', internalGroupName: 'Multifactorial Likelihood Analysis', innerCols: [ {title: 'Posterior probability of pathogenicity', prop: 'Posterior_probability_exLOVD', core: true}, {title: 'Prior probability of pathogenicity', prop: 'Combined_prior_probablility_exLOVD', core: true}, diff --git a/website/js/components/AlleleFrequenciesTile.js b/website/js/components/AlleleFrequenciesTile.js index 82270a100..918b05e84 100644 --- a/website/js/components/AlleleFrequenciesTile.js +++ b/website/js/components/AlleleFrequenciesTile.js @@ -9,7 +9,8 @@ import GroupHelpButton from './GroupHelpButton'; const _ = require('underscore'); - +// Includes the Allele Frequency fields. +// when true, the field starts in the expanded state. const fieldsOfInterest = { 'gnomAD V2.1 Exomes, Non-Cancer (Graphical)': true, 'gnomAD V2.1 Exomes, Non-Cancer (Numerical)': false, @@ -66,8 +67,7 @@ export default class AlleleFrequenciesTile extends React.Component { return dd.source === "GnomADv3 Genomes"; }).data, "gnomAD V3.1 Genomes, Non-Cancer (Numerical)"]; - const alleleFrequencyFields = [gnomadExomeGraph, gnomadExomeData, gnomadGenomeGraph, - gnomadGenomeData]; + const alleleFrequencyFields = [gnomadExomeGraph, gnomadExomeData, gnomadGenomeGraph, gnomadGenomeData]; const renderedAlleleFrequencyFields = alleleFrequencyFields.map((field, idx) => { let fieldValue = field[0]; diff --git a/website/js/components/DonationBar.js b/website/js/components/DonationBar.js index 24167090b..cfd522573 100644 --- a/website/js/components/DonationBar.js +++ b/website/js/components/DonationBar.js @@ -9,7 +9,7 @@ const DonationBar = React.createClass({ return (

- We are pleased to introduce a new feature: persistent URLs for the variants in BRCA Exchange! More information is available here on the BRCA Exchange blog. + We are excited to introduce provisional assignments of the ACMG evidence codes for all variants in BRCA Exchange! Please see our latest blog post for more information

); diff --git a/website/js/components/ProvisionalEvidenceTile.js b/website/js/components/ProvisionalEvidenceTile.js new file mode 100644 index 000000000..b789f54f0 --- /dev/null +++ b/website/js/components/ProvisionalEvidenceTile.js @@ -0,0 +1,106 @@ +'use strict'; +import util from '../util'; +import React from 'react'; +import {Table} from "react-bootstrap"; +import * as _ from 'lodash'; +import classNames from "classnames"; +import CollapsibleTile from "./collapsible/CollapsibleTile"; +import CollapsibleSection from "./collapsible/CollapsibleSection"; + +// ACMG Variant Evidence Codes, Provisional Assignment +export default class ProvisionalEvidenceTile extends React.Component { + + // For the sub-tile headers that have a result value displayed in the header, + // Add css so that the value floats to the right + generateHeader(result) { + return ( +
+ {result} +
+
+ ); + } + + // Given a group source (ie groupname), data (list of dicts containing title:, prop:), and variant object + // generate table rows to be displayed in that group's section + // retrieving the requested prop values from the variant + // and applying the corresponding help tooltips + getRowsAndDetermineIfEmpty(source, data, variant) { + const rows = _.map(data, (rowDescriptor) => { + //let {prop, title, noHelpLink} = rowDescriptor; + let {prop, title} = rowDescriptor; + + const rowItem = util.getFormattedFieldByProp(prop, variant); + let rowClasses = classNames({ + 'variantfield-empty': false, // placeholder until this supports hiding when empty + // 'variantfield-empty': (isEmptyValue && this.props.hideEmptyItems), + }); + return ( + + {title} + {rowItem} + + ); + }); + return rows; + } + + render() { + const {variant, innerGroups} = this.props; + let allEmpty = true; + + // innerGroup are: Population Frequency, Computational Prediction + let sections = _.map(innerGroups, (group) => { + let groupSource = group.source; + let groupData = group.data; + let renderedRows = this.getRowsAndDetermineIfEmpty(groupSource, groupData, variant); + + // TEMPORARY - hide ComputationalPrediction subsection until data is populated + if (groupSource === "Computational Prediction") { + return (
); + } + + // Attempt to retrieve the prop name of the provisional code so we can put the value in the subsection header + let extraHeaderProp = false ; + for ( const dataItem of groupData ) { + if (dataItem.title === "Provisionally Assigned" ) { + extraHeaderProp = dataItem.prop ; + } + } + // If we found a prop, pull it from the variant. If not, leave as a dash. + let extraHeaderValue = extraHeaderProp ? util.getFormattedFieldByProp(extraHeaderProp, variant) : "-"; + + return ( + + + { renderedRows } + +
+
); + }); + + return ( + +
+ The ClinGen + ENIGMA Variant Curation Expert Panel (VCEP) rules (Version 1.1.0, dated 2023-11-22) + define how the evidence on a variant can contribute to the variant being curated as + Pathogenic or Benign, following the + ACMG/AMP standards and guidelines + . We have evaluated the following categories of evidence + for each variant against the VCEP rules, to evaluate which evidence codes the variant meets. + These are provisional evidence code assignments. In general, they have not yet been reviewed by the + VCEP biocuration team. If the variant has been curated by the VCEP, then the + Variant Curation Expert Panel tile will contain information on the evidence used for curation. +
+ {sections} +
+ ); + } +}; + diff --git a/website/js/content.js b/website/js/content.js index ef2ec7851..95b79bad4 100644 --- a/website/js/content.js +++ b/website/js/content.js @@ -224,11 +224,6 @@ const helpContentResearch = [ id: "clinical-significance-lovd", contents: require("../content/help/research/clinical-significance-lovd.md") }, - { - name: "BIC", - id: "clinical-significance-bic", - contents: require("../content/help/research/clinical-significance-bic.md") - }, ] }, { @@ -243,16 +238,33 @@ const helpContentResearch = [ // reference: "https://www.ncbi.nlm.nih.gov/pubmed/21990134" }, { - name: "What information is provided in the Allele Frequency Reference Sets Tile?", + name: "What information is provided in the Allele Frequency Reference Sets tile?", id: "allele-frequency-reference-sets", contents: require("../content/help/research/allele-frequency-reference-sets.md"), list: [ { - name: "gnomAD: Genome Aggregation Database (non-cancer cohort)", + name: "gnomAD: Genome Aggregation Database (non-cancer cohort)", contents: require("../content/help/research/allele-frequency-gnomad.md") }, ] }, + { + name: "What information is provided in the ACMG Variant Evidence Codes, Provisional Assignment tile?", + id: "acmg-variant-evidence-codes-provisional-assignment", + contents: require("../content/help/research/acmg-variant-evidence-codes-provisional-assignment.md"), + list: [ + { + name: "Population Frequency", + id: "acmg-variant-evidence-codes-population-frequency", + contents: require("../content/help/research/acmg-variant-evidence-codes-population-frequency.md") + }, + { + name: "Computational Prediction", + id: "acmg-variant-evidence-codes-computational-prediction", + contents: require("../content/help/research/acmg-variant-evidence-codes-computational-prediction.md") + }, + ] + }, { name: "What sources are available in the Functional Assay Results tile?", id: "functional-assay-results", diff --git a/website/js/index.js b/website/js/index.js index 4248ef74d..3dd314f99 100644 --- a/website/js/index.js +++ b/website/js/index.js @@ -9,6 +9,7 @@ import LiteratureTable from "./components/LiteratureTable"; import SilicoPredTile from "./components/insilicopred/SilicoPredTile"; import FunctionalAssayTile from "./components/functionalassay/FunctionalAssayTile"; import ComputationalPredictionTile from "./components/computationalprediction/ComputationalPredictionTile"; +import ProvisionalEvidenceTile from "./components/ProvisionalEvidenceTile"; import MupitStructure from './MupitStructure'; // shims for older browsers @@ -227,7 +228,7 @@ var Home = React.createClass({ {notCurrentSupporterLogoItems} -

Currently Supported By:

+ {currentSupporterLogoItems.length ? (

Currently Supported By:

) : '' } {currentSupporterLogoItems}
@@ -273,7 +274,7 @@ var About = React.createClass({ {notCurrentSupporterLogoItems} -

Currently Supported By:

+ {currentSupporterLogoItems.length ? (

Currently Supported By:

) : '' } {currentSupporterLogoItems}
@@ -816,6 +817,7 @@ var VariantDetail = React.createClass({ }; }); }, + // render for VariantDetail render: function () { const {data, error} = this.state; if (!data) { @@ -963,13 +965,19 @@ var VariantDetail = React.createClass({ ); } - // remove the BIC classification and importance fields unless the classification is 1 or 5 - if (groupTitle === 'Clinical Significance (BIC)') { - const bicClass = variant['Clinical_classification_BIC']; + if (groupTitle === "ACMG Variant Evidence Codes, Provisional Assignment") { + return ( + - if (bicClass !== 'Class 1' && bicClass !== 'Class 5') { - innerCols = innerCols.filter(x => x.prop !== 'Clinical_classification_BIC' && x.prop !== 'Clinical_importance_BIC'); - } + ); } // now map the group's columns to a list of row objects @@ -1067,28 +1075,6 @@ var VariantDetail = React.createClass({ ); - const bicTileTable = ( -
-
-
- Please note that the BIC database is no longer actively curated. A copy of all BIC data has - been shared with several other variation databases. -
-
- Though the National Human Genome Research Institute recommends using ClinVar or BRCA Exchange for - updated information on BRCA1 and BRCA2 variants, the BIC database will be maintained to allow - historical studies and other uses. -
-
- - - - {rows} - -
-
- ); - return (
- {groupTitle === "Clinical Significance (BIC)" ? bicTileTable : tileTable} + {tileTable} diff --git a/website/js/logos.js b/website/js/logos.js index 82499dfa1..64bd33b57 100644 --- a/website/js/logos.js +++ b/website/js/logos.js @@ -61,7 +61,7 @@ module.exports = [ logo: require('./img/brotman_baty_institute_logo.jpg'), url: 'https://brotmanbaty.org/', currentSupporter: false - }, + }/*, { id: 'NIH', logo: require('./img/nih_bd2k_horizontal.png'), @@ -79,5 +79,5 @@ module.exports = [ logo: require('./img/nci_itcr.png'), url: 'https://itcr.cancer.gov/', currentSupporter: true - } + }*/ ]; diff --git a/website/package-lock.json b/website/package-lock.json index e593f60b2..731113177 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -1,6 +1,6 @@ { "name": "brcaexchange", - "version": "3.116.0-alpha.0", + "version": "3.133.0", "lockfileVersion": 1, "requires": true, "dependencies": { diff --git a/website/package.json b/website/package.json index c2cb539c0..7d4bd8b3c 100644 --- a/website/package.json +++ b/website/package.json @@ -1,6 +1,6 @@ { "name": "brcaexchange", - "version": "3.130.0-alpha.0", + "version": "3.133.0", "description": "brca variant browser", "main": "index.js", "scripts": {