Skip to content

Commit

Permalink
issue #1215 T2T - gnomAD liftover contains duplicate entries - pick h…
Browse files Browse the repository at this point in the history
…ighest
  • Loading branch information
davmlaw committed Dec 18, 2024
1 parent f5cfde5 commit 7375613
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 1 deletion.
16 changes: 15 additions & 1 deletion annotation/vcf_files/bulk_vep_vcf_annotation_inserter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from library.django_utils.django_file_utils import get_import_processing_filename, get_import_processing_dir
from library.genomics import overlap_fraction, Range, parse_gnomad_coord
from library.log_utils import log_traceback
from library.utils import invert_dict
from library.utils import invert_dict, split_dict_multi_values
from snpdb.models import GenomeBuild, VariantCoordinate
from upload.vcf.sql_copy_files import write_sql_copy_csv, sql_copy_csv

Expand Down Expand Up @@ -426,6 +426,19 @@ def _get_transcript_id_and_version(self, vep_transcript_data: TranscriptData) ->
logging.warning(f"Could not find transcript: '{t_id}'")
return transcript_id, transcript_version_id

def _fix_multiple_values(self, transcript_data: TranscriptData):
# T2T gnomad liftover has dupes, sometimes we get multiple entries in which case all will be dupes
# We are going to pick the HIGHEST, as if there was a liftover error this will cause false positives after
# gnomAD filter rather than false negatives (potentially throwing away real rare disease causing variants)
if "&" in transcript_data.get("gnomad_af", ""):
gnomad = {k: v for k, v in transcript_data.items() if k.startswith("gnomad")}
gnomad_list = split_dict_multi_values(gnomad, sep='&')
highest_af = sorted(gnomad_list, key=operator.itemgetter("gnomad_af"), reverse=True)[0]
# Copy back overwriting old multi-value data with single values
for k, v in highest_af.items():
transcript_data[k] = v


def add_calculated_transcript_columns(self, variant_coordinate: Optional[VariantCoordinate], transcript_data: TranscriptData):
""" variant_coordinate - will only be set for symbolics """

Expand Down Expand Up @@ -587,6 +600,7 @@ def process_entry(self, v: VCFVariant):
if representative_transcript:
variant_data = self.vep_to_db_dict(vep_transcript_data, self.variant_only_columns)
variant_data.update(transcript_data)
self._fix_multiple_values(variant_data)
self.add_calculated_variant_annotation_columns(variant_coordinate, variant_data)
# If we're using custom COSMIC vcf, merge with those from VEP existing variation
if custom_vcf_cosmic_ids := vep_transcript_data.get("COSMIC"):
Expand Down
10 changes: 10 additions & 0 deletions library/utils/text_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,16 @@ def join_with_commas_and_ampersand(items: list[str], final_sep: str = "&") -> st
return f" {final_sep} ".join([comma_sep, items[-1]])


def split_dict_multi_values(data: dict[str, str], sep: str) -> list[dict[str, str]]:
any_value = next(iter(data.values()))
num_records = len(any_value.split(sep))
dict_list = [{} for _ in range(num_records)]
for k, v in data.items():
for i, v_part in enumerate(v.split(sep)):
dict_list[i][k] = v_part
return dict_list


def limit_str(text: str, limit: int) -> str:
if len(text) > limit:
text = text[:limit] + "..."
Expand Down

0 comments on commit 7375613

Please sign in to comment.