From 3785132ca45d49bac4db22594543b328b1bfdae9 Mon Sep 17 00:00:00 2001 From: melissacline Date: Mon, 14 Aug 2023 15:20:53 -0700 Subject: [PATCH 01/31] switched from click to argparse for cmdline arguments --- .../gnomad/variant_scoring/variant_scoring.py | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) mode change 100644 => 100755 pipeline/gnomad/variant_scoring/variant_scoring.py diff --git a/pipeline/gnomad/variant_scoring/variant_scoring.py b/pipeline/gnomad/variant_scoring/variant_scoring.py old mode 100644 new mode 100755 index 086b52370..3162598af --- a/pipeline/gnomad/variant_scoring/variant_scoring.py +++ b/pipeline/gnomad/variant_scoring/variant_scoring.py @@ -4,7 +4,7 @@ import itertools from pathlib import Path -import click +import argparse import numpy as np import pandas as pd @@ -264,14 +264,21 @@ def _convert_to_name(converted): return df.drop(columns=[TMP_HGVS_HG38, TMP_VAR_OBJ_FIELD, TMP_GENE_SYMBOL]) -@click.command() -@click.argument('data_dir', type=click.Path(readable=True)) -@click.argument('output_path', type=click.Path(writable=True)) -@click.option('--resource-dir', type=click.Path(readable=True), help="resource dir for lift over (same as for the main pipeline)") -@click.option('--gene-config-path', type=click.Path(readable=True)) -def main(data_dir, output_path, resource_dir, gene_config_path): - data_dir = Path(data_dir) - cfg_df = config.load_config(gene_config_path) + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input_path", help="input directory") + parser.add_argument("-o", "--output_path", help="output directory") + parser.add_argument("-g", "--gene_config_path", help="genes for analysis") + parser.add_argument("-r", "--resource_dir", help="resource dir for lift over (same as for the main pipeline)") + args = parser.parse_args() + return(args) + + +def main(): + args = parse_args() + data_dir = Path(args.input_path) + cfg_df = config.load_config(args.gene_config_path) df_cov2 = pd.read_parquet(data_dir / 'df_cov_v2.parquet') df_cov3 = pd.read_parquet(data_dir / 'df_cov_v3.parquet') @@ -280,13 +287,13 @@ def main(data_dir, output_path, resource_dir, gene_config_path): read_depth_thresh = 30 - df = extract_variant_scoring_data(df_cov2, df_cov3, df_var2, df_var3, read_depth_thresh, Path(resource_dir)) + df = extract_variant_scoring_data(df_cov2, df_cov3, df_var2, df_var3, read_depth_thresh, Path(args.resource_dir)) # add var_name columns with different normalization to join data with other sources (e.g. brca exchange output data) strand_dict = { int(r['chr']) : r[config.STRAND_COL] for _, r in cfg_df.iterrows() } df = add_normalization_cols(df, strand_dict) - df.to_parquet(output_path) + df.to_parquet(args.output_path) if __name__ == "__main__": From 4869476158b5e6e8b6db3ce782f7bd625f5ef095 Mon Sep 17 00:00:00 2001 From: melissacline Date: Tue, 15 Aug 2023 12:53:06 -0700 Subject: [PATCH 02/31] Chnaging the seqrepo dir to use the absolute pathname for the latest seqrepo dir rather than the 'latest' symlink, it's not clear why the symlink isn't working... --- pipeline/gnomad/variant_scoring/run_pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/gnomad/variant_scoring/run_pipeline.sh b/pipeline/gnomad/variant_scoring/run_pipeline.sh index a1247f4ed..eac58d788 100755 --- a/pipeline/gnomad/variant_scoring/run_pipeline.sh +++ b/pipeline/gnomad/variant_scoring/run_pipeline.sh @@ -20,7 +20,7 @@ UTA_PORT=50828 UTA_RELEASE_DATE=20171026 SEQ_REPO_DIR_DOCKER=/mnt/seq_repo -COMMON_DOCKER_OPT=" --rm -v ${DATA_DIR}:/mnt --network host -v /data/variant_scoring/code/pipeline:/opt/variant_scoring -v /data/seqrepo:${SEQ_REPO_DIR_DOCKER} -e UTA_DB_URL=postgresql://anonymous@0.0.0.0:${UTA_PORT}/uta/uta_${UTA_RELEASE_DATE} -e HGVS_SEQREPO_DIR=${SEQ_REPO_DIR_DOCKER}/latest" +COMMON_DOCKER_OPT=" --rm -v ${DATA_DIR}:/mnt --network host -v /data/variant_scoring/code/pipeline:/opt/variant_scoring -v /data/seqrepo:${SEQ_REPO_DIR_DOCKER} -e UTA_DB_URL=postgresql://anonymous@0.0.0.0:${UTA_PORT}/uta/uta_${UTA_RELEASE_DATE} -e HGVS_SEQREPO_DIR=${SEQ_REPO_DIR_DOCKER}/2021-01-29" # run process_gnomad_data.py docker run ${COMMON_DOCKER_OPT} variant_scoring bash -c "python /opt/variant_scoring/gnomad/variant_scoring/process_gnomad_data.py --cores 16 --mem-per-core-mb 2400 --gene-config-path /opt/variant_scoring/workflow/gene_config_brca_only.txt /mnt/original_gnomad_data /mnt/processed_brca" From 1c51d69a1c27928c2d0ec7af50279af6b651172f Mon Sep 17 00:00:00 2001 From: melissacline Date: Mon, 6 May 2024 17:31:50 -0700 Subject: [PATCH 03/31] Switched off the local UTA installation --- pipeline/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pipeline/Makefile b/pipeline/Makefile index ba5a3ec64..78b20de62 100755 --- a/pipeline/Makefile +++ b/pipeline/Makefile @@ -75,7 +75,6 @@ endif COMMON_DOCKER_ARGS = --rm -u `id -u ${USER}`:$(DOCKER_GRP) \ -e "DATA_DATE=$(DATA_DATE)" \ - -e "UTA_DB_URL=postgresql://anonymous@0.0.0.0:$(UTA_PORT)/uta/uta_$(UTA_RELEASE_DATE)" \ -e "HGVS_SEQREPO_DIR=$(SEQ_REPO_DIR_DOCKER)/latest" \ -e "PYTHONPATH=/opt/brca-exchange/pipeline" \ --network host \ @@ -142,7 +141,7 @@ test-coverage: ## Running pipeline unit tests with coverage information docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) bash -c 'cd /opt/brca-exchange/pipeline/data && bash getdata && cd /opt/brca-exchange/pipeline && pytest --cov --ignore=splicing/ && coverage html --include="/opt/brca-exchange/pipeline/*" --omit="*/test_*"' -build-release: start-local-uta checkout build-docker setup-files setup-lovd download-resources download-seqrepo start-seqrepo-rest-service run-pipeline variants-by-source ## create new data release +build-release: checkout build-docker setup-files setup-lovd download-resources download-seqrepo start-seqrepo-rest-service run-pipeline variants-by-source ## create new data release variants-by-source: ## postprocessing: compute statistics for changes with respect to the last release docker run $(COMMON_DOCKER_ARGS) $(PIPELINE_IMAGE) python /opt/brca-exchange/pipeline/utilities/variantsBySource.py -i /files/data/output/release/built_with_change_types.tsv -c true @@ -179,7 +178,7 @@ post-release-cmds: cleanup-failed include-release-notes push-docker tag-release setup-data-from-latest-release-tar: setup-files ## sets up brca output dir with data contained in release archive from last release (only data from variant merging onwards) tar -C $(OUT_DIR) -zxf $(PREVIOUS_RELEASE_PATH) -setup-dev-env: start-local-uta build-docker setup-files download-resources ## setup development environment +setup-dev-env: build-docker setup-files download-resources ## setup development environment clean-pyc: ## remove Python file artifacts find . -name '*.pyc' -exec rm -f {} + From 95eff895398d6497cda8b097ac5747bbc97aca06 Mon Sep 17 00:00:00 2001 From: E-T-K Date: Thu, 9 May 2024 23:00:04 +0000 Subject: [PATCH 04/31] feat: provisional evidence - add fields to django Add 4 new fields to the variant model, with migration to update the database --- ..._provisional_evidence_fields_to_variant.py | 45 +++++++++++++++++++ website/django/data/models.py | 10 +++++ 2 files changed, 55 insertions(+) create mode 100644 website/django/data/migrations/0058_add_provisional_evidence_fields_to_variant.py diff --git a/website/django/data/migrations/0058_add_provisional_evidence_fields_to_variant.py b/website/django/data/migrations/0058_add_provisional_evidence_fields_to_variant.py new file mode 100644 index 000000000..5b94b14c7 --- /dev/null +++ b/website/django/data/migrations/0058_add_provisional_evidence_fields_to_variant.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.13 on 2024-04-21 19:06 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('data', '0057_add_splice_ai_fields_and_update_empties'), + ] + + operations = [ + migrations.AddField( + model_name='variant', + name='Provisional_Evidence_Code_Bioinfo', + field=models.TextField(default=''), + ), + migrations.AddField( + model_name='variant', + name='Provisional_Evidence_Code_Popfreq', + field=models.TextField(default=''), + ), + migrations.AddField( + model_name='variant', + name='Provisional_Evidence_Description_Bioinfo', + field=models.TextField(default=''), + ), + migrations.AddField( + model_name='variant', + name='Provisional_Evidence_Description_Popfreq', + field=models.TextField(default=''), + ), + migrations.RunSQL( + """ + DROP MATERIALIZED VIEW IF EXISTS currentvariant; + CREATE MATERIALIZED VIEW currentvariant AS ( + SELECT * FROM "variant" WHERE ( + "id" IN ( SELECT DISTINCT ON ("Genomic_Coordinate_hg38") "id" FROM "variant" ORDER BY "Genomic_Coordinate_hg38" ASC, "Data_Release_id" DESC ) + ) + ); + """ + ) + ] diff --git a/website/django/data/models.py b/website/django/data/models.py index 801c82d63..033e44ce1 100644 --- a/website/django/data/models.py +++ b/website/django/data/models.py @@ -74,6 +74,7 @@ class Variant(models.Model): Variant_in_GnomAD = models.BooleanField(default=False) Variant_in_GnomADv3 = models.BooleanField(default=False) + # These are regular columns Source = models.TextField() URL_ENIGMA = models.TextField() Condition_ID_type_ENIGMA = models.TextField() @@ -506,6 +507,10 @@ class Variant(models.Model): DP_DL_spliceAI = models.TextField(null=True) result_spliceai = models.TextField(null=True) BayesDel_nsfp33a_noAF = models.TextField(null=True) + Provisional_Evidence_Code_Popfreq = models.TextField(default='') + Provisional_Evidence_Description_Popfreq = models.TextField(default='') + Provisional_Evidence_Code_Bioinfo = models.TextField(default='') + Provisional_Evidence_Description_Bioinfo = models.TextField(default='') # Data Versioning Data_Release = models.ForeignKey(DataRelease) @@ -990,6 +995,7 @@ class CurrentVariant(models.Model): Variant_in_GnomAD = models.BooleanField(default=False) Variant_in_GnomADv3 = models.BooleanField(default=False) + # These are regular columns Source = models.TextField() URL_ENIGMA = models.TextField() Condition_ID_type_ENIGMA = models.TextField() @@ -1420,6 +1426,10 @@ class CurrentVariant(models.Model): DP_DL_spliceAI = models.TextField(null=True) result_spliceai = models.TextField(null=True) BayesDel_nsfp33a_noAF = models.TextField(null=True) + Provisional_Evidence_Code_Popfreq = models.TextField(default='') + Provisional_Evidence_Description_Popfreq = models.TextField(default='') + Provisional_Evidence_Code_Bioinfo = models.TextField(default='') + Provisional_Evidence_Description_Bioinfo = models.TextField(default='') # Data Versioning Data_Release = models.ForeignKey(DataRelease) From 4a46d80e7444e32af2cc9b4901412b7d4d579cfb Mon Sep 17 00:00:00 2001 From: melissacline Date: Fri, 10 May 2024 11:55:04 -0700 Subject: [PATCH 05/31] Adding the variant assessment to the pipeline workflow --- pipeline/workflow/CompileVCFFiles.py | 4 +-- pipeline/workflow/pipeline_common.py | 1 + pipeline/workflow/variant_scoring.py | 48 ++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 pipeline/workflow/variant_scoring.py diff --git a/pipeline/workflow/CompileVCFFiles.py b/pipeline/workflow/CompileVCFFiles.py index 4a761dd12..f3e107bf4 100644 --- a/pipeline/workflow/CompileVCFFiles.py +++ b/pipeline/workflow/CompileVCFFiles.py @@ -13,7 +13,7 @@ luigi.auto_namespace(scope=__name__) -from workflow import bayesdel_processing, esp_processing, gnomad_processing, pipeline_common, pipeline_utils +from workflow import bayesdel_processing, esp_processing, gnomad_processing, pipeline_common, pipeline_utils, variant_scoring from workflow.pipeline_common import DefaultPipelineTask, clinvar_method_dir, lovd_method_dir, \ functional_assays_method_dir, data_merging_method_dir, priors_method_dir, priors_filter_method_dir, \ utilities_method_dir, vr_method_dir, splice_ai_method_dir, field_metadata_path, field_metadata_path_additional @@ -754,7 +754,7 @@ def run(self): self.output().path) -@requires(bayesdel_processing.AddBayesdelScores) +@requires(variant_scoring.AddVariantScores) class FindMissingReports(DefaultPipelineTask): def output(self): return luigi.LocalTarget(os.path.join(self.artifacts_dir, "missing_reports.log")) diff --git a/pipeline/workflow/pipeline_common.py b/pipeline/workflow/pipeline_common.py index 47f24c4c1..d0d0a5515 100644 --- a/pipeline/workflow/pipeline_common.py +++ b/pipeline/workflow/pipeline_common.py @@ -24,6 +24,7 @@ utilities_method_dir = os.path.abspath('../utilities') vr_method_dir = os.path.abspath('../vr') splice_ai_method_dir = os.path.abspath('../splice_ai') +analysis_method_dir = os.path.abspath('../analysis') field_metadata_path = os.path.abspath(os.path.join('..', 'field_metadata.tsv')) field_metadata_path_additional = os.path.abspath(os.path.join('..', 'field_metadata_additional_fields_variant_output_file.tsv')) diff --git a/pipeline/workflow/variant_scoring.py b/pipeline/workflow/variant_scoring.py new file mode 100644 index 000000000..662d2a89c --- /dev/null +++ b/pipeline/workflow/variant_scoring.py @@ -0,0 +1,48 @@ +import os +import subprocess +from pathlib import Path + +import luigi +from luigi.util import requires + +import workflow.pipeline_utils as pipeline_utils +from workflow.pipeline_common import DefaultPipelineTask, data_merging_method_dir, analysis_method_dir + + +@requires(bayesdel_processing.AddBayesdelScores) +class DownloadResources(DefaultPipelineTask): + def output(self): + wdir = Path(self.cfg.output_dir) / 'release' / 'artifacts' / 'variant_scoring_wdir' + return{'wdir': luigi.LocalTarget(wdir), + 'covV2': "df_cov_v2.parquet", + 'covV3': "df_cov_v2.parquet", + 'flagsV2': "brca.gnomAD.2.1.1.hg19.flags.tsv", + 'flagsV3': "brca.gnomAD.3.1.1.hg38.flags.tsv" + } + + def run(self): + wdir = self.output()['wdir'].path + if not os.path.exists(wdir): + os.mkdir(wdir) + os.chdir(wdir) + variant_scoring_data_url = "https://brcaexchange.org/backend/downloads/variant_scoring/*" + pipeline_utils.download_file_and_display_progress(brca1_data_url) + + +@requires(DownloadResources) +class runPopfreqAssessment(DefaultPipelineTask): + def output(self): + return luigi.LocalTarget(os.path.join(self.artifacts_dir, 'built_with_popfreq.tsv')) + + def run(self): + os.chdir(analysis_method_dir) + args = ["python", "popfreq_assessment.py", + "--input", "bayesdel_processing.AddBayesdelScores.output()", + "--data_dir", DownloadResources.output()["wdir"], + "--output", self.output().path] + pipeline_utils.run_process(args) + pipeline_utils.check_file_for_contents(self.output().path) + + + + From edeaddeeebebf4dc5a4772ed2efe4ec8ace05fca Mon Sep 17 00:00:00 2001 From: melissacline Date: Fri, 10 May 2024 14:19:53 -0700 Subject: [PATCH 06/31] Added popfreq_assessment --- pipeline/analysis/popfreq_assessment.py | 375 ++++++++++++++++++++++++ pipeline/workflow/variant_scoring.py | 1 + 2 files changed, 376 insertions(+) create mode 100755 pipeline/analysis/popfreq_assessment.py diff --git a/pipeline/analysis/popfreq_assessment.py b/pipeline/analysis/popfreq_assessment.py new file mode 100755 index 000000000..0f6381074 --- /dev/null +++ b/pipeline/analysis/popfreq_assessment.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python +# coding: utf-8 + +import itertools +import argparse +import csv +from collections import OrderedDict +import math +import numpy as np +import pandas as pd + +#import gnomad.variant_scoring.constants as cnts +#from common import config, hgvs_utils, variant_utils, utils +#from data_merging.brca_pseudonym_generator import _normalize_genomic_fnc + +POPFREQ_CODE_ID = "Provisional_Evidence_Code_Popfreq" +POPFREQ_CODE_DESCR = "Provisional_Evidence_Description_Popfreq" + + +BA1 = "BA1 (met)" +BS1 = "BS1 (met)" +BS1_SUPPORTING = "BS1_Supporting (met)" +NO_CODE = "No code met (below threshold)" +NO_CODE_NON_SNV = "No code met for population data (indel)" +PM2_SUPPORTING = "PM2_Supporting (met)" +FAIL_NOT_ASSAYED = "Fail_Not_Assayed" +FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG = "No code met (read depth, flags)" +FAIL_NEEDS_REVIEW = "No code met (needs review)" +FAIL_NEEDS_SOFTWARE_REVIEW = "No code met (needs software review)" + +READ_DEPTH_THRESHOLD_FREQUENT_VARIANT = 20 +READ_DEPTH_THRESHOLD_RARE_VARIANT = 25 + +BA1_MSG = "The highest non-cancer, non-founder population filter allele frequency in gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥20) or gnomAD v3.1 (non-cancer subset, read depth ≥20) is %s in the %s population, which is above the ENIGMA BRCA1/2 VCEP threshold (>0.001) for BA1 (BA1 met)." +BS1_MSG = "The highest non-cancer, non-founder population filter allele frequency in gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥20) or gnomAD v3.1 (non-cancer subset, read depth ≥20) is %s in the %s population, which is above the ENIGMA BRCA1/2 VCEP threshold (>0.0001) for BS1, and below the BA1 threshold (>0.001) (BS1 met)." +BS1_SUPPORTING_MSG = "The highest non-cancer, non-founder population filter allele frequency in gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥20) or gnomAD v3.1 f(non-cancer subset, read depth ≥20) is %s in the %s population which is within the ENIGMA BRCA1/2 VCEP threshold (>0.00002 to ≤ 0.0001) for BS1_Supporting (BS1_Supporting met)." +PM2_SUPPORTING_MSG = "This variant is absent from gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥25) and gnomAD v3.1 (non-cancer subset, read depth ≥25) (PM2_Supporting met)." +NO_CODE_MSG = "This variant is present in gnomAD v2.1 (exomes only, non-cancer subset) or gnomAD v3.1 (non-cancer subset) but is below the ENIGMA BRCA1/2 VCEP threshold >0.00002 for BS1_Supporting (PM2_Supporting, BS1, and BA1 are not met)." +NO_CODE_NON_SNV_MSG = "This [insertion/deletion/large genomic rearrangement] variant was not observed in gnomAD v2.1 (exomes only, non-cancer subset) or gnomAD v3.1 (non-cancer subset), but PM2_Supporting was not applied since recall is suboptimal for this type of variant (PM2_Supporting not met)." +FAIL_NOT_ASSAYED_MSG = "Variant not tested in this dataset" +FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG_MSG = "This variant is present in gnomAD v2.1 (exomes only, non-cancer subset) or gnomAD v3.1 (non-cancer subset) but is not meeting the specified read depths threshold ≥20 OR was flagged as suspect by gnomAD (PM2_Supporting, BS1, and BA1 are not met)." +FAIL_NEEDS_REVIEW_MSG = "No code is met (variant needs review)" +FAIL_NEEDS_SOFTWARE_REVIEW_MSG = "No code is met (variant needs software review)" + +MESSAGES_PER_CODE = { + BA1: BA1_MSG, + BS1: BS1_MSG, + BS1_SUPPORTING: BS1_SUPPORTING_MSG, + NO_CODE: NO_CODE_MSG, + NO_CODE_NON_SNV: NO_CODE_NON_SNV_MSG, + PM2_SUPPORTING: PM2_SUPPORTING_MSG, + FAIL_NOT_ASSAYED: FAIL_NOT_ASSAYED_MSG, + FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG: FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG_MSG, + FAIL_NEEDS_REVIEW: FAIL_NEEDS_REVIEW_MSG, + FAIL_NEEDS_SOFTWARE_REVIEW: FAIL_NEEDS_SOFTWARE_REVIEW_MSG + } + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input", default="built_final.tsv", + help="Input file with variant data") + parser.add_argument("-o", "--output", default="built_with_popfreq.tsv", + help="Output file with new columns") + parser.add_argument("-d", "--data_dir", default="./processed_brca", + help="Directory with the processed files") + args = parser.parse_args() + return(args) + + +def read_flags(flag_data): + flags = {} + for row in flag_data: + flags[row["ID"]] = row + return(flags) + + +def estimate_coverage(start, end, chrom, df_cov): + positions = list(range(start, end+1)) + coverage_this_chrom = df_cov.loc[df_cov["chrom"] == int(chrom)] + positions_this_variant = coverage_this_chrom[coverage_this_chrom["pos"].isin(positions)] + meanval = positions_this_variant["mean"].mean() + medianval = positions_this_variant["median"].median() + return(meanval, medianval) + + + + +def initialize_output_file(input_file, output_filename): + """ + Create an empty output file with the new columns + """ + new_columns = [POPFREQ_CODE_ID, POPFREQ_CODE_DESCR] + input_header_row = input_file.fieldnames + if input_header_row.index("change_type"): + idx = input_header_row.index("change_type") + output_header_row = input_header_row[:idx] + new_columns \ + + input_header_row[idx:] + else: + output_header_row = input_header_row + new_columns + output_file = csv.DictWriter(open(output_filename,"w"), + fieldnames=output_header_row, + delimiter = '\t') + output_file.writeheader() + return(output_file) + + +def field_defined(field): + """ + Return a binary value indicating whether or not this variant has the popmax FAF defined + """ + return(field != "-") + + +def is_variant_observable(start, end, chrom, coverage): + # + # Read the coverage at the variant position. If the coverage metrics returned + # are not a number, that means that the variant could not be tested in the + # indicated dataset. + (mean_read_depth, median_read_depth) = estimate_coverage(int(start),int(end), + int(chrom),coverage) + if pd.isna(mean_read_depth) and pd.isna(median_read_depth): + return(False) + else: + return(True) + + +def analyze_one_dataset(faf95_popmax_str, faf95_population, allele_count, is_snv, + read_depth, vcf_filter_flag, debug=True): + # + # Get the coverage data. Rule out error conditions: low coverage, VCF filter flag. + rare_variant = False + if field_defined(faf95_popmax_str): + faf = float(faf95_popmax_str) + if pd.isna(faf): + rare_variant = True + elif faf <= 0.00002: + rare_variant = True + else: + faf = None + rare_variant = True + if rare_variant and read_depth < READ_DEPTH_THRESHOLD_RARE_VARIANT: + return(FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG) + if (not rare_variant) and read_depth < READ_DEPTH_THRESHOLD_FREQUENT_VARIANT: + return(FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG) + if vcf_filter_flag: + return(FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG) + # + # Address the cases where FAF is defined, and the variant is a candidate for a + # evidence code for high population frequency (BA1, BS1, BS1_SUPPORTING) + if not rare_variant: + if faf > 0.001: + return(BA1) + elif faf > 0.0001: + return(BS1) + elif faf > 0.00002: + return(BS1_SUPPORTING) + if rare_variant: + if not field_defined(allele_count): + return(NO_CODE) + elif int(allele_count) > 0: + return(NO_CODE) + else: + if is_snv: + return(PM2_SUPPORTING) + else: + return(NO_CODE_NON_SNV) + return(NEEDS_REVIEW) + + +def analyze_across_datasets(code_v2, faf_v2, faf_popmax_v2, in_v2, + code_v3, faf_v3, faf_popmax_v3, in_v3, + debug=True): + """ + Given the per-dataset evidence codes, generate an overall evidence code + """ + benign_codes = [BA1, BS1, BS1_SUPPORTING] + pathogenic_codes = [PM2_SUPPORTING] + intermediate_codes = [ NO_CODE, NO_CODE_NON_SNV] + ordered_success_codes = benign_codes + intermediate_codes + pathogenic_codes + success_codes = set(ordered_success_codes) + failure_codes = set([FAIL_NOT_ASSAYED, FAIL_INSUFFICIENT_READ_DEPTH_OR_FILTER_FLAG]) + ordered_codes = ordered_success_codes + list(failure_codes) + + # + # First, rule out the case of outright contradictions + if code_v2 in benign_codes and code_v3 in pathogenic_codes \ + or code_v2 in pathogenic_codes and code_v3 in benign_codes: + return(FAIL_NEEDS_REVIEW, FAIL_NEEDS_REVIEW_MSG) + # + # Next, rule out the case where neither dataset has reliable data. + # Arbitrarily use the message for v3, as the newer and presumably more robust. + if code_v2 in failure_codes and code_v3 in failure_codes: + return(code_v3, MESSAGES_PER_CODE[code_v3]) + + # + # At this point, we can assume that at least one dataset has + # reliable data. + # + # Next, if both datasets point to a pathogenic effect, or + # if one points to a pathogenic effect and the other an error, + # then return the pathogenic effect. + if code_v2 in pathogenic_codes and code_v3 in pathogenic_codes: + return(code_v2, MESSAGES_PER_CODE[code_v2]) + elif code_v2 in pathogenic_codes and code_v3 in failure_codes: + return(code_v2, MESSAGES_PER_CODE[code_v2]) + elif code_v3 in pathogenic_codes and code_v2 in failure_codes: + return(code_v3, MESSAGES_PER_CODE[code_v3]) + + # + # Next, if either dataset has an intermediate effect and the other + # is not stronger (i.e. is also intermediate, or pathogenic, or failure), + # return the intermediate effect code. + if code_v2 in intermediate_codes and ordered_codes.index(code_v2) <= ordered_codes.index(code_v3): + return(code_v2, MESSAGES_PER_CODE[code_v2]) + elif code_v3 in intermediate_codes and ordered_codes.index(code_v3) <= ordered_codes.index(code_v2): + return(code_v3, MESSAGES_PER_CODE[code_v3]) + + # + # Now, at least one dataset must have a success code. We can also assume that + # neither is pathogenic (i.e. boht are benign, intermediate or failure). + # In this case, identify and return the stronger code. + if debug: + print("prior to assertions, codes are", code_v2, code_v3) + assert(code_v2 in benign_codes or code_v2 in intermediate_codes or code_v3 in benign_codes or code_v3 in intermediate_codes) + if code_v3 == BA1: + return(BA1, BA1_MSG % (faf_v3, faf_popmax_v3)) + elif code_v2 == BA1: + return(BA1, BA1_MSG % (faf_v2, faf_popmax_v2)) + elif code_v3 == BS1: + return(BS1, BS1_MSG % (faf_v3, faf_popmax_v3)) + elif code_v2 == BS1: + return(BS1, BS1_MSG % (faf_v2, faf_popmax_v2)) + elif code_v3 == BS1_SUPPORTING: + return(BS1_SUPPORTING, BS1_SUPPORTING_MSG % (faf_v3, faf_popmax_v3)) + elif code_v2 == BS1_SUPPORTING: + return(BS1_SUPPORTING, BS1_SUPPORTING_MSG % (faf_v2, faf_popmax_v2)) + elif code_v2 == NO_CODE: + return(code_v2, NO_CODE_MSG) + elif code_v3 == NO_CODE: + return(code_v3, NO_CODE_MSG) + # + # If we get here, there is a hole in the logic + return(FAIL_NEEDS_REVIEW, FAIL_NEEDS_SOFTWARE_REVIEW_MSG) + + +def variant_is_flagged(variant_id, flags): + assert(variant_id in flags) + variant_flagged = False + if flags[variant_id]["Filters"] != "PASS": + variant_flagged = True + return(variant_flagged) + + +def analyze_variant(variant, coverage_v2, coverage_v3, flags_v2, flags_v3, + debug=True): + """ + Analyze a single variant, adding the output columns + """ + # Initialize the output columns. First, check the read depth. If the + # read depth is defined at the variant position, this means that the + # variant could have been observed in principle; set the default to + # PM2_SUPPORTING (indicating that by default, the variant could have been + # observed, but wasn't). If no read depth is defined, this means that + # the variant could not have been observed in the dataset in question + # (for example, a deep intronic variant in an exome dataset). + # In such a case, set the default value to NOT_ASSAYED. + variant_v2_code_id = FAIL_NOT_ASSAYED + variant_v3_code_id = FAIL_NOT_ASSAYED + variant[POPFREQ_CODE_ID] = FAIL_NOT_ASSAYED + variant[POPFREQ_CODE_DESCR] = FAIL_NOT_ASSAYED_MSG + observable_in_v2 = False + observable_in_v3 = False + if is_variant_observable(int(variant["pyhgvs_Hg37_Start"]),int(variant["pyhgvs_Hg37_End"]), + int(variant["Chr"]),coverage_v2): + variant_v2_code_id = PM2_SUPPORTING + variant[POPFREQ_CODE_ID] = PM2_SUPPORTING + variant[POPFREQ_CODE_DESCR] = PM2_SUPPORTING_MSG + observable_in_v2 = True + if is_variant_observable(int(variant["Hg38_Start"]), + int(variant["Hg38_End"]), + int(variant["Chr"]), coverage_v3): + variant_v3_code_id = PM2_SUPPORTING + variant[POPFREQ_CODE_ID] = PM2_SUPPORTING + variant[POPFREQ_CODE_DESCR] = PM2_SUPPORTING_MSG + observable_in_v3 = True + is_snv = (variant["Hg38_Start"] == variant["Hg38_End"]) + if debug: + print("variant is snv:", is_snv) + # + # the gnomAD v2 variant ID is set when the variant is in the genome + # OR exome portion of gnomAD. Focus on variants that are in the exome + # data by making sure that the allele count is defined. The allele + # count is the total number of observations of the variant in the gnomAD + # dataset + variant_in_v2 = False + if (field_defined(variant["Variant_id_GnomAD"]) + and field_defined(variant["Allele_count_exome_GnomAD"]) + and observable_in_v2): + variant_in_v2 = True + (mean_read_depth, + median_read_depth) = estimate_coverage(int(variant["pyhgvs_Hg37_Start"]), + int(variant["pyhgvs_Hg37_End"]), + int(variant["Chr"]),coverage_v2) + read_depth_v2 = min(mean_read_depth, median_read_depth) + if pd.isna(read_depth_v2): + read_depth_v2 = 0 + variant_v2_code_id = analyze_one_dataset(variant["faf95_popmax_exome_GnomAD"], + variant["faf95_popmax_population_exome_GnomAD"], + variant["Allele_count_exome_GnomAD"], + is_snv, read_depth_v2, + variant_is_flagged(variant["Variant_id_GnomAD"], + flags_v2)) + if debug: + print("gnomAD V2 variant", variant["Variant_id_GnomAD"], + "popmax", variant["faf95_popmax_exome_GnomAD"], + "allele count", variant["Allele_count_exome_GnomAD"], + "mean read depth", mean_read_depth, + "median read depth", median_read_depth, "snv", is_snv, + "V2 code", variant_v2_code_id) + variant_in_v3 = False + if (field_defined(variant["Variant_id_GnomADv3"]) and observable_in_v3): + variant_in_v3 = True + (mean_read_depth, + median_read_depth) = estimate_coverage(int(variant["Hg38_Start"]), + int(variant["Hg38_End"]), + int(variant["Chr"]), + coverage_v3) + read_depth_v3 = min(mean_read_depth, median_read_depth) + if pd.isna(read_depth_v3): + read_depth_v3 = 0 + variant_v3_code_id = analyze_one_dataset(variant["faf95_popmax_genome_GnomADv3"], + variant["faf95_popmax_population_genome_GnomADv3"], + variant["Allele_count_genome_GnomADv3"], + is_snv, read_depth_v3, + variant_is_flagged(variant["Variant_id_GnomADv3"], + flags_v3)) + if debug: + print("gnomAD V3 variant", variant["Variant_id_GnomADv3"], + "popmax", variant["faf95_popmax_genome_GnomADv3"], + "allele count", variant["Allele_count_genome_GnomADv3"], + "mean read depth", mean_read_depth, + "median read depth", median_read_depth, "snv", is_snv, + "V3 code", variant_v3_code_id) + (variant[POPFREQ_CODE_ID], + variant[POPFREQ_CODE_DESCR]) = analyze_across_datasets(variant_v2_code_id,variant["faf95_popmax_exome_GnomAD"], + variant["faf95_popmax_population_exome_GnomAD"], + variant_in_v2, variant_v3_code_id, + variant["faf95_popmax_genome_GnomADv3"], + variant["faf95_popmax_population_genome_GnomADv3"], + variant_in_v3) + if debug: + print("consensus code:", variant[POPFREQ_CODE_ID], "msg", + variant[POPFREQ_CODE_DESCR]) + return() + + + +def main(): + args = parse_args() + df_cov2 = pd.read_parquet(args.data_dir + '/df_cov_v2.parquet') + df_cov3 = pd.read_parquet(args.data_dir + '/df_cov_v3.parquet') + flags_v2 = read_flags(csv.DictReader(open(args.data_dir + "/brca.gnomAD.2.1.1.hg19.flags.tsv"), + delimiter = "\t")) + flags_v3 = read_flags(csv.DictReader(open(args.data_dir + "/brca.gnomAD.3.1.1.hg38.flags.tsv"), + delimiter = "\t")) + input_file = csv.DictReader(open(args.input), delimiter = "\t") + output_file = initialize_output_file(input_file, args.output) + for variant in input_file: + analyze_variant(variant, df_cov2, df_cov3, flags_v2, flags_v3) + output_file.writerow(variant) + + +if __name__ == "__main__": + main() + diff --git a/pipeline/workflow/variant_scoring.py b/pipeline/workflow/variant_scoring.py index 662d2a89c..212f0c780 100644 --- a/pipeline/workflow/variant_scoring.py +++ b/pipeline/workflow/variant_scoring.py @@ -9,6 +9,7 @@ from workflow.pipeline_common import DefaultPipelineTask, data_merging_method_dir, analysis_method_dir + @requires(bayesdel_processing.AddBayesdelScores) class DownloadResources(DefaultPipelineTask): def output(self): From 8b0a6837a714eeaf0e8b93698b5bdab1f6d98a89 Mon Sep 17 00:00:00 2001 From: melissacline Date: Mon, 13 May 2024 18:27:00 -0700 Subject: [PATCH 07/31] Renamed variant_scoring to anaysis --- pipeline/workflow/CompileVCFFiles.py | 4 ++-- pipeline/workflow/{variant_scoring.py => analysis.py} | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) rename pipeline/workflow/{variant_scoring.py => analysis.py} (97%) diff --git a/pipeline/workflow/CompileVCFFiles.py b/pipeline/workflow/CompileVCFFiles.py index 1d6b8b224..e31a5c821 100644 --- a/pipeline/workflow/CompileVCFFiles.py +++ b/pipeline/workflow/CompileVCFFiles.py @@ -13,7 +13,7 @@ luigi.auto_namespace(scope=__name__) -from workflow import bayesdel_processing, esp_processing, gnomad_processing, pipeline_common, pipeline_utils, variant_scoring +from workflow import bayesdel_processing, esp_processing, gnomad_processing, pipeline_common, pipeline_utils, analysis from workflow.pipeline_common import DefaultPipelineTask, clinvar_method_dir, lovd_method_dir, \ functional_assays_method_dir, data_merging_method_dir, priors_method_dir, priors_filter_method_dir, \ utilities_method_dir, vr_method_dir, splice_ai_method_dir, field_metadata_path, field_metadata_path_additional @@ -773,7 +773,7 @@ def run(self): self.output().path) -@requires(variant_scoring.AddVariantScores) +@requires(analysis.runPopfreqAssessment) class FindMissingReports(DefaultPipelineTask): def output(self): return luigi.LocalTarget(os.path.join(self.artifacts_dir, "missing_reports.log")) diff --git a/pipeline/workflow/variant_scoring.py b/pipeline/workflow/analysis.py similarity index 97% rename from pipeline/workflow/variant_scoring.py rename to pipeline/workflow/analysis.py index 212f0c780..c566ae64a 100644 --- a/pipeline/workflow/variant_scoring.py +++ b/pipeline/workflow/analysis.py @@ -7,7 +7,7 @@ import workflow.pipeline_utils as pipeline_utils from workflow.pipeline_common import DefaultPipelineTask, data_merging_method_dir, analysis_method_dir - +from workflow import bayesdel_processing @requires(bayesdel_processing.AddBayesdelScores) From 35eec874aea40fcd32588404d6f57880cfcd560a Mon Sep 17 00:00:00 2001 From: melissacline Date: Tue, 14 May 2024 18:54:10 -0700 Subject: [PATCH 08/31] Moved needle on automation of latest pipeline task --- pipeline/workflow/analysis.py | 35 ++++++++++++++-------------- pipeline/workflow/pipeline_common.py | 2 +- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/pipeline/workflow/analysis.py b/pipeline/workflow/analysis.py index c566ae64a..ce33f27bc 100644 --- a/pipeline/workflow/analysis.py +++ b/pipeline/workflow/analysis.py @@ -10,36 +10,37 @@ from workflow import bayesdel_processing -@requires(bayesdel_processing.AddBayesdelScores) +#@requires(bayesdel_processing.AddBayesdelScores) class DownloadResources(DefaultPipelineTask): def output(self): - wdir = Path(self.cfg.output_dir) / 'release' / 'artifacts' / 'variant_scoring_wdir' - return{'wdir': luigi.LocalTarget(wdir), - 'covV2': "df_cov_v2.parquet", - 'covV3': "df_cov_v2.parquet", - 'flagsV2': "brca.gnomAD.2.1.1.hg19.flags.tsv", - 'flagsV3': "brca.gnomAD.3.1.1.hg38.flags.tsv" - } + return(luigi.LocalTarget(Path(self.cfg.output_dir) / 'release' / 'artifacts' / 'analysis')) + def run(self): - wdir = self.output()['wdir'].path - if not os.path.exists(wdir): - os.mkdir(wdir) - os.chdir(wdir) - variant_scoring_data_url = "https://brcaexchange.org/backend/downloads/variant_scoring/*" - pipeline_utils.download_file_and_display_progress(brca1_data_url) + download_url = "https://brcaexchange.org/backend/downloads/variant_scoring/" + self.inputfiles = [ "df_cov_v2.parquet", "df_cov_v3.parquet", + "brca.gnomAD.2.1.1.hg19.flags.tsv", + "brca.gnomAD.3.1.1.hg38.flags.tsv" + ] + self.wdir = self.output().path + if not os.path.exists(self.wdir): + os.mkdir(self.wdir) + os.chdir(self.wdir) + for this_file in self.inputfiles: + pathname = download_url + "/" + this_file + pipeline_utils.download_file_and_display_progress(pathname) @requires(DownloadResources) class runPopfreqAssessment(DefaultPipelineTask): def output(self): - return luigi.LocalTarget(os.path.join(self.artifacts_dir, 'built_with_popfreq.tsv')) + return luigi.LocalTarget(Path(self.cfg.output_dir)/'release'/'artifacts'/'built_with_popfreq.tsv') def run(self): os.chdir(analysis_method_dir) args = ["python", "popfreq_assessment.py", - "--input", "bayesdel_processing.AddBayesdelScores.output()", - "--data_dir", DownloadResources.output()["wdir"], + "--input", bayesdel_processing.AddBayesdelScores().output().path, + "--data_dir", DownloadResources().output().path, "--output", self.output().path] pipeline_utils.run_process(args) pipeline_utils.check_file_for_contents(self.output().path) diff --git a/pipeline/workflow/pipeline_common.py b/pipeline/workflow/pipeline_common.py index d0d0a5515..d70957248 100644 --- a/pipeline/workflow/pipeline_common.py +++ b/pipeline/workflow/pipeline_common.py @@ -24,7 +24,7 @@ utilities_method_dir = os.path.abspath('../utilities') vr_method_dir = os.path.abspath('../vr') splice_ai_method_dir = os.path.abspath('../splice_ai') -analysis_method_dir = os.path.abspath('../analysis') +analysis_method_dir = os.path.abspath('../pipeline/analysis') field_metadata_path = os.path.abspath(os.path.join('..', 'field_metadata.tsv')) field_metadata_path_additional = os.path.abspath(os.path.join('..', 'field_metadata_additional_fields_variant_output_file.tsv')) From d0794a3ffe49212a2ec4245aa7a2921a8f4a18b3 Mon Sep 17 00:00:00 2001 From: melissacline Date: Tue, 4 Jun 2024 13:32:41 -0700 Subject: [PATCH 09/31] First cut of popfreq_assessment --- pipeline/analysis/popfreq_assessment.py | 14 +++++++------- pipeline/requirements.txt | 1 + pipeline/workflow/pipeline_common.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pipeline/analysis/popfreq_assessment.py b/pipeline/analysis/popfreq_assessment.py index 0f6381074..d020c4e29 100755 --- a/pipeline/analysis/popfreq_assessment.py +++ b/pipeline/analysis/popfreq_assessment.py @@ -9,9 +9,6 @@ import numpy as np import pandas as pd -#import gnomad.variant_scoring.constants as cnts -#from common import config, hgvs_utils, variant_utils, utils -#from data_merging.brca_pseudonym_generator import _normalize_genomic_fnc POPFREQ_CODE_ID = "Provisional_Evidence_Code_Popfreq" POPFREQ_CODE_DESCR = "Provisional_Evidence_Description_Popfreq" @@ -63,6 +60,8 @@ def parse_args(): help="Output file with new columns") parser.add_argument("-d", "--data_dir", default="./processed_brca", help="Directory with the processed files") + parser.add_argument("--debug", action="store_true", default=False, + help="Print debugging info") args = parser.parse_args() return(args) @@ -91,7 +90,7 @@ def initialize_output_file(input_file, output_filename): """ new_columns = [POPFREQ_CODE_ID, POPFREQ_CODE_DESCR] input_header_row = input_file.fieldnames - if input_header_row.index("change_type"): + if "change_type" in input_header_row: idx = input_header_row.index("change_type") output_header_row = input_header_row[:idx] + new_columns \ + input_header_row[idx:] @@ -169,7 +168,7 @@ def analyze_one_dataset(faf95_popmax_str, faf95_population, allele_count, is_snv def analyze_across_datasets(code_v2, faf_v2, faf_popmax_v2, in_v2, code_v3, faf_v3, faf_popmax_v3, in_v3, - debug=True): + debug=False): """ Given the per-dataset evidence codes, generate an overall evidence code """ @@ -252,7 +251,7 @@ def variant_is_flagged(variant_id, flags): def analyze_variant(variant, coverage_v2, coverage_v3, flags_v2, flags_v3, - debug=True): + debug=False): """ Analyze a single variant, adding the output columns """ @@ -366,7 +365,8 @@ def main(): input_file = csv.DictReader(open(args.input), delimiter = "\t") output_file = initialize_output_file(input_file, args.output) for variant in input_file: - analyze_variant(variant, df_cov2, df_cov3, flags_v2, flags_v3) + analyze_variant(variant, df_cov2, df_cov3, flags_v2, flags_v3, + debug=args.debug) output_file.writerow(variant) diff --git a/pipeline/requirements.txt b/pipeline/requirements.txt index 8828be0e9..7b92a0f34 100644 --- a/pipeline/requirements.txt +++ b/pipeline/requirements.txt @@ -22,6 +22,7 @@ decorator==4.4.2 distro==1.4.0 docutils==0.16 fake-useragent==0.1.11 +fastparquet==2024.5.0 ga4gh.common==0.0.7 ga4gh.vrs==0.8.4 hgvs==1.5.4 diff --git a/pipeline/workflow/pipeline_common.py b/pipeline/workflow/pipeline_common.py index d70957248..d0d0a5515 100644 --- a/pipeline/workflow/pipeline_common.py +++ b/pipeline/workflow/pipeline_common.py @@ -24,7 +24,7 @@ utilities_method_dir = os.path.abspath('../utilities') vr_method_dir = os.path.abspath('../vr') splice_ai_method_dir = os.path.abspath('../splice_ai') -analysis_method_dir = os.path.abspath('../pipeline/analysis') +analysis_method_dir = os.path.abspath('../analysis') field_metadata_path = os.path.abspath(os.path.join('..', 'field_metadata.tsv')) field_metadata_path_additional = os.path.abspath(os.path.join('..', 'field_metadata_additional_fields_variant_output_file.tsv')) From 4dc22b3cad25f4e10a38b2552d291162bc856856 Mon Sep 17 00:00:00 2001 From: melissacline Date: Tue, 4 Jun 2024 16:12:07 -0700 Subject: [PATCH 10/31] Added a stub for bioinformatic prediction --- pipeline/analysis/add_bioinfo_pred.py | 204 ++++++++++++++++++++++++++ pipeline/workflow/CompileVCFFiles.py | 4 +- pipeline/workflow/analysis.py | 15 +- 3 files changed, 220 insertions(+), 3 deletions(-) create mode 100755 pipeline/analysis/add_bioinfo_pred.py diff --git a/pipeline/analysis/add_bioinfo_pred.py b/pipeline/analysis/add_bioinfo_pred.py new file mode 100755 index 000000000..a6922dcb1 --- /dev/null +++ b/pipeline/analysis/add_bioinfo_pred.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python + +import argparse +import csv +import re +import sys + +BIOINFO_CODE_ID = "Provisional_Evidence_Code_Bioinfo" +BIOINFO_CODE_DESCR = "Provisional_Evidence_Description_Bioinfo" + + +NO_CODE = "NO_CODE" +PP3 = "PP3" +BP4_BP7 = "BP4,BP7" +BP4 = "BP4" +BP1_STRONG = "BP1_STRONG" +PVS1_CODE = "PVS1" + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input", default="build_final.tsv", + help="built_final") + parser.add_argument("-o", "--output", default="built_with_bioinfo.tsv", + help="version of input file with new columns added") + parser.add_argument("-d", "--debug", action="store_true", default=False, + help="Print debugging info") + args = parser.parse_args() + return(args) + + +def initialize_output_file(input_file, output_filename): + """ + Create an empty output file with the new columns + """ + new_columns = [BIOINFO_CODE_ID, BIOINFO_CODE_DESCR] + input_header_row = input_file.fieldnames + if "change_type" in input_header_row: + idx = input_header_row.index("change_type") + output_header_row = input_header_row[:idx] + new_columns \ + + input_header_row[idx:] + else: + output_header_row = input_header_row + new_columns + output_file = csv.DictWriter(open(output_filename,"w"), + fieldnames=output_header_row, + delimiter = '\t') + output_file.writeheader() + return(output_file) + + +def extract_protein_coordinate(variant): + coordinate = None + hit = re.search("[0-9]+", variant["Protein_Change"]) + if hit: + token = variant["Protein_Change"][hit.start():hit.end()] + pos = int(token) + print("from", variant["Protein_Change"], "derived", pos) + return(pos) + +def inside_functional_domain(variant): + inside_domain = False + pos = extract_protein_coordinate(variant) + if pos: + if variant["Gene_Symbol"] == "BRCA1": + if pos >= 2 and pos <= 99: + inside_domain = True + elif pos >= 503 and pos <= 508: + inside_domain = True + elif pos >= 607 and pos <= 614: + inside_domain = True + elif pos >= 651 and pos <= 656: + inside_domain = True + elif pos >= 1391 and pos <= 1424: + inside_domain = True + elif pos >= 1650 and pos <= 1863: + inside_domain = True + elif variant["Gene_Symbol"] == "BRCA2": + if pos >= 10 and pos <= 40: + inside_domain = True + elif pos >= 1002 and pos <= 1036: + inside_domain = True + elif pos >= 1212 and pos <= 1246: + inside_domain = True + elif pos >= 1422 and pos <= 1453: + inside_domain = True + elif pos >= 1518 and pos <= 1549: + inside_domain = True + elif pos >= 1665 and pos <= 1696: + inside_domain = True + elif pos >= 1837 and pos <= 1871: + inside_domain = True + elif pos >= 1971 and pos <= 2005: + inside_domain = True + elif pos >= 2051 and pos <= 2085: + inside_domain = True + elif pos >= 2481 and pos <= 3186: + inside_domain = True + elif pos >= 3263 and pos <= 3269: + inside_domain = True + elif pos >= 3265 and pos <= 3330: + inside_domain = True + elif pos >= 3381 and pos <= 3385: + inside_domain = True + return(inside_domain) + + + +def estimate_bioinfo_code(variant): + effect = "unknown" + bioinfo_code = NO_CODE + if re.search("=\)$", variant["pyhgvs_Protein"]): + effect = "synonymous_variant" + elif re.search("[A-Z]+[0-9]+[A-Z]+", variant["Protein_Change"]): + effect = "missense_variant" + elif re.search("c\.[0-9]+[+]", variant["pyhgvs_cDNA"]): + effect = "intron_variant" + elif re.search("c\.[0-9]+[-]", variant["pyhgvs_cDNA"]): + effect = "intron_variant" + print("variant", variant["pyhgvs_cDNA"], "protein change", variant["Protein_Change"], variant["pyhgvs_Protein"], "effect", effect) + if variant["result_spliceai"] == "-": + splicing_effect = False + no_splicing_effect = True + else: + splicing_effect = (float(variant["result_spliceai"]) > 0.2) + no_splicing_effect = (float(variant["result_spliceai"]) < 0.1) + if variant["Gene_Symbol"] == "BRCA1": + if variant["BayesDel_nsfp33a_noAF"] == "-": + protein_effect = False + no_protein_effect = True + elif float(variant["BayesDel_nsfp33a_noAF"]) > 0.28: + protein_effect = True + no_prptein_effect = False + elif float(variant["BayesDel_nsfp33a_noAF"]) < 0.15: + protein_effect = False + no_protein_effect = True + else: + protein_effect = False + no_protein_effect = False + if variant["Gene_Symbol"] == "BRCA2": + if variant["BayesDel_nsfp33a_noAF"] == "-": + protein_effect = False + no_protein_effect = True + elif float(variant["BayesDel_nsfp33a_noAF"]) > 0.30: + protein_effect = True + no_prptein_effect = False + elif float(variant["BayesDel_nsfp33a_noAF"]) < 0.18: + protein_effect = False + no_protein_effect = True + else: + protein_effect = False + no_protein_effect = False + inside_domain = inside_functional_domain(variant) + print("effect", effect, "splicing effect", splicing_effect, "inside domain", inside_domain) + if effect == "synonymous_variant": + if splicing_effect: + bioinfo_code = PP3 + elif inside_domain: + bioinfo_code = BP4_BP7 + else: + bioinfo_code = BP1_STRONG + elif effect == "intron_variant": + if splicing_effect: + bioinfo_code = PP3 + else: + bioinfo_code = BP4 + elif effect == "missense_variant": + if splicing_effect: + bioinfo_code = PP3 + elif no_splicing_effect: + if not inside_domain: + bioinfo_code = BP1_STRONG + elif protein_effect: + bioinfo_code = PP3 + elif no_protein_effect: + bioinfo_code = BP4 + else: + if inside_domain and protein_effect: + bioinfo_code = PP3 + return(bioinfo_code) + + +def apply_pvs1_code(variant): + pvs1_code = NO_CODE + protein_hgvs = variant["pyhgvs_Protein"] + stop_added = re.search("Ter", protein_hgvs) + if stop_added: + pvs1_code = PVS1_CODE + return(pvs1_code) + + +def main(): + csv.field_size_limit(sys.maxsize) + args = parse_args() + with open(args.input, 'r') as input_fp: + input_reader = csv.DictReader(input_fp, delimiter = "\t") + writer = initialize_output_file(input_reader, args.output) + for variant in input_reader: + #variant[BIOINFO_CODE_ID] = estimate_bioinfo_code(variant, debug=args.debug) + #pvs1_code = apply_pvs1_code(variant) + variant[BIOINFO_CODE_ID] = "" + variant[BIOINFO_CODE_DESCR] = "" + writer.writerow(variant) + +if __name__ == "__main__": + main() diff --git a/pipeline/workflow/CompileVCFFiles.py b/pipeline/workflow/CompileVCFFiles.py index e31a5c821..48d2ed3d7 100644 --- a/pipeline/workflow/CompileVCFFiles.py +++ b/pipeline/workflow/CompileVCFFiles.py @@ -13,7 +13,7 @@ luigi.auto_namespace(scope=__name__) -from workflow import bayesdel_processing, esp_processing, gnomad_processing, pipeline_common, pipeline_utils, analysis +from workflow import analysis, esp_processing, gnomad_processing, pipeline_common, pipeline_utils from workflow.pipeline_common import DefaultPipelineTask, clinvar_method_dir, lovd_method_dir, \ functional_assays_method_dir, data_merging_method_dir, priors_method_dir, priors_filter_method_dir, \ utilities_method_dir, vr_method_dir, splice_ai_method_dir, field_metadata_path, field_metadata_path_additional @@ -790,7 +790,7 @@ def run(self): pipeline_utils.check_file_for_contents(self.output().path) -@requires(bayesdel_processing.AddBayesdelScores) +@requires(analysis.runBioinfoPred) class RunDiffAndAppendChangeTypesToOutput(DefaultPipelineTask): def _extract_release_date(self, version_json): with open(version_json, 'r') as f: diff --git a/pipeline/workflow/analysis.py b/pipeline/workflow/analysis.py index ce33f27bc..a8d412701 100644 --- a/pipeline/workflow/analysis.py +++ b/pipeline/workflow/analysis.py @@ -10,7 +10,7 @@ from workflow import bayesdel_processing -#@requires(bayesdel_processing.AddBayesdelScores) +@requires(bayesdel_processing.AddBayesdelScores) class DownloadResources(DefaultPipelineTask): def output(self): return(luigi.LocalTarget(Path(self.cfg.output_dir) / 'release' / 'artifacts' / 'analysis')) @@ -45,6 +45,19 @@ def run(self): pipeline_utils.run_process(args) pipeline_utils.check_file_for_contents(self.output().path) +@requires(runPopfreqAssessment) +class runBioinfoPred(DefaultPipelineTask): + def output(self): + return luigi.LocalTarget(Path(self.cfg.output_dir)/'release'/'artifacts'/'built_with_bioinfo_pred.tsv') + + def run(self): + os.chdir(analysis_method_dir) + args = ["python", "add_bioinfo_pred.py", + "--input", runPopfreqAssessment().output().path, + "--output", self.output().path] + pipeline_utils.run_process(args) + pipeline_utils.check_file_for_contents(self.output().path) + From ae1f2f108285b8a01978562f1e6a3ffc932de805 Mon Sep 17 00:00:00 2001 From: melissacline Date: Tue, 4 Jun 2024 16:31:23 -0700 Subject: [PATCH 11/31] Added two more files to the tarball discard list --- pipeline/workflow/tarball_files_discard_list.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipeline/workflow/tarball_files_discard_list.txt b/pipeline/workflow/tarball_files_discard_list.txt index 4860ac24e..104b74d3f 100644 --- a/pipeline/workflow/tarball_files_discard_list.txt +++ b/pipeline/workflow/tarball_files_discard_list.txt @@ -141,3 +141,5 @@ ./release/artifacts/victor_wdir/slurm.annotate.run_1.start ./release/artifacts/victor_wdir/slurm.annotate.run_1.stop ./release/artifacts/victor_wdir/slurm.annotate.run_1.version +./release/artifacts/built_with_popfreq.tsv +./release/artifacts/built_with_bioinfo_pred.tsv From fc8b079e9625dc17bb54f8dba9b2c59cbb316397 Mon Sep 17 00:00:00 2001 From: melissacline Date: Tue, 4 Jun 2024 17:16:22 -0700 Subject: [PATCH 12/31] Bumped the docker-compose version from 1.12.0 to 1.29.2 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7bba53737..7ccb090fb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -95,7 +95,7 @@ jobs: apk add --no-cache \ py-pip=9.0.0-r1 pip install \ - docker-compose==1.12.0 + docker-compose==1.29.2 - restore_cache: keys: - v1-{{ .Branch }} From 018d2385b04c1c3fe5b848e21c2964b4d2bfac79 Mon Sep 17 00:00:00 2001 From: melissacline Date: Tue, 4 Jun 2024 17:55:32 -0700 Subject: [PATCH 13/31] Added new fields to the fields metadata --- pipeline/field_metadata.tsv | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pipeline/field_metadata.tsv b/pipeline/field_metadata.tsv index a64b8604e..f4715623b 100644 --- a/pipeline/field_metadata.tsv +++ b/pipeline/field_metadata.tsv @@ -465,3 +465,7 @@ DP_DL_spliceAI Evidence #COPY "The location of donor loss relative to variant po result_spliceai Evidence #COPY "The maximum of four delta scores (DS_AG, DS_AL, DS_DG, DS_DL) is used as basis for spliceogenicity predictions" 0.0 BayesDel_nsfp33a_noAF Evidence bayesdel_noaf "BayesDel score, computed without allele frequency" 0.55 change_type Internal #SKIP "type of change with respect to last release" "changed_information" +Provisional_Evidence_Code_Popfreq Evidence #COPY "The provisional ACMG population frequency evidence code, by the VCEP rules" "PM2_Supporting (met)" +Provisional_Evidence_Description_Popfreq #Copy "Details on how the provisional ACMG popfreq evidence code was determined" "This variant is absent from gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥25) and gnomAD v3.1 (non-cancer subset, read depth ≥25) (PM2_Supporting met)" +Provisional_Evidence_Code_Bioinfo Evidence #SKIP "The provisional ACMG bioinformatic evidence code, by the VCEP rules" N/A +Provisional_Evidence_Description_Bioinfo Evidence #SKIP "Details on how the provisional ACMG bioinformatic evidence code was determined" NA \ No newline at end of file From 0e208d7cb2a0d788633af203e7e4ddaa10095471 Mon Sep 17 00:00:00 2001 From: melissacline Date: Tue, 4 Jun 2024 18:24:12 -0700 Subject: [PATCH 14/31] Added one more file to the tarball discard list --- pipeline/workflow/tarball_files_discard_list.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/pipeline/workflow/tarball_files_discard_list.txt b/pipeline/workflow/tarball_files_discard_list.txt index 104b74d3f..8bf9605df 100644 --- a/pipeline/workflow/tarball_files_discard_list.txt +++ b/pipeline/workflow/tarball_files_discard_list.txt @@ -30,6 +30,7 @@ ./release/artifacts/GnomADv3.vcf ./release/artifacts/LOVDready.vcf ./release/artifacts/LOVD.vcf +./release/artifacts/ready_for_priors.tsv ./release/artifacts/releaseDiff.log ./release/artifacts/right1000_Genomes ./release/artifacts/rightBIC From 0ee92ea2234e4060c679b2c620e1baa775a48fea Mon Sep 17 00:00:00 2001 From: E-T-K Date: Mon, 17 Jun 2024 05:34:14 +0000 Subject: [PATCH 15/31] Provisional Evidence - frontend First implementation of the provisional evidence frontend gui --- .../js/components/AlleleFrequenciesTile.js | 22 +++- ...AlleleFrequencyProvisionalEvidenceField.js | 107 ++++++++++++++++++ 2 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 website/js/components/AlleleFrequencyProvisionalEvidenceField.js diff --git a/website/js/components/AlleleFrequenciesTile.js b/website/js/components/AlleleFrequenciesTile.js index 82270a100..58267f571 100644 --- a/website/js/components/AlleleFrequenciesTile.js +++ b/website/js/components/AlleleFrequenciesTile.js @@ -5,12 +5,14 @@ import React from "react"; import {Panel} from 'react-bootstrap'; import {AlleleFrequencyField} from "./AlleleFrequencyField"; +import {AlleleFreqProvEvidField} from "./AlleleFrequencyProvisionalEvidenceField"; //ETK import GroupHelpButton from './GroupHelpButton'; const _ = require('underscore'); - +// Includes the Allele Frequency fields plus the Provisional Evidence. const fieldsOfInterest = { + 'Provisional ACMG Variant Evidence': true, 'gnomAD V2.1 Exomes, Non-Cancer (Graphical)': true, 'gnomAD V2.1 Exomes, Non-Cancer (Numerical)': false, 'gnomAD V3.1 Genomes, Non-Cancer (Graphical)': true, @@ -47,6 +49,7 @@ export default class AlleleFrequenciesTile extends React.Component { } render() { + console.log(this.props); // ETK const variant = this.props.variant; const data = this.props.alleleFrequencyData; @@ -90,6 +93,22 @@ export default class AlleleFrequenciesTile extends React.Component { ); }); + let fieldNameACMG = 'Provisional ACMG Variant Evidence'; + let expandedACMG = this.state[fieldNameACMG]; + let provisionalEvidence = { + codePopfreq: variant.Provisional_Evidence_Code_Popfreq, + descriptionPopfreq: variant.Provisional_Evidence_Description_Popfreq + }; + const renderedProvEvidenceField = ( + + ); + // TODO: figure out how to determine if everything is empty even though variant is in GnomAD const allEmpty = !variant.Variant_in_GnomAD; @@ -151,6 +170,7 @@ export default class AlleleFrequenciesTile extends React.Component { to assess pathogenicity represent individuals not affected by cancer. + {renderedProvEvidenceField} {renderedAlleleFrequencyFields} diff --git a/website/js/components/AlleleFrequencyProvisionalEvidenceField.js b/website/js/components/AlleleFrequencyProvisionalEvidenceField.js new file mode 100644 index 000000000..c3dd2332d --- /dev/null +++ b/website/js/components/AlleleFrequencyProvisionalEvidenceField.js @@ -0,0 +1,107 @@ +/*eslint-env browser */ +/*global require: false, module */ +'use strict'; + +import React from "react"; +import ReactDOM from 'react-dom'; +import {Collapse, Table} from "react-bootstrap"; +import KeyInline from './KeyInline'; + + +const AlleleFreqProvEvidField = React.createClass({ + + getInitialState: function () { + // identifies which subpopulation groups are expanded + return {}; + }, + + getCollapsableDOMNode: function() { + return ReactDOM.findDOMNode(this.refs.panel); + }, + + getCollapsableDimensionValue: function() { + return ReactDOM.findDOMNode(this.refs.panel).scrollHeight; + }, + + handleToggle: function(e, fieldName) { + e.preventDefault(); + + // ask our parent to toggle us + this.props.onFieldToggled(fieldName); + }, + + fieldToggled: function(field) { + // handles toggling of subpopulation groups in gnomad numerical tables + this.setState({ + [field]: !this.state[field] + }); + }, + + generateHeader: function(field, fieldName) { + console.log("ETK rendered acmg header for ." + field + "." + fieldName + ". expanded:" + this.props.expanded + "."); + return ( +
this.handleToggle(e, fieldName)}> +
+ { + this.props.expanded + ?