diff --git a/VariantValidator/modules/complex_descriptions.py b/VariantValidator/modules/complex_descriptions.py index be6c947a..95cd42b0 100644 --- a/VariantValidator/modules/complex_descriptions.py +++ b/VariantValidator/modules/complex_descriptions.py @@ -1,3 +1,6 @@ +import re +import vvhgvs.exceptions + # Errors class FuzzyPositionError(Exception): pass @@ -7,6 +10,13 @@ class FuzzyRangeError(Exception): pass +class InvalidRangeError(Exception): + pass + + +class IncompatibleTypeError(Exception): + pass + def fuzzy_ends(my_variant): """ :param my_variant: @@ -27,6 +37,60 @@ def fuzzy_ends(my_variant): return False + +def uncertain_positions(my_variant, validator): + + print("AWOOO") + print(dir(my_variant)) + print(my_variant.quibble) + + print("Try it") + + # Formats like NC_000005.9:g.(90136803_90144453)_(90159675_90261231)dup + if ")_(" in my_variant.quibble: + accession_and_type, positions_and_edit = my_variant.quibble.split(".(") + position_1, position_2 = positions_and_edit.split(")_(") + position_2, variation = position_2.split(")") + position_1 = position_1.replace(")", "") + position_2 = position_2.replace("(", "") + print(accession_and_type) + print(position_1) + print(position_2) + print(variation) + v1 = f"{accession_and_type}.{position_1}{variation}" + v2 = f"{accession_and_type}.{position_2}{variation}" + my_variant.reftype = f":{accession_and_type.split(':')[1]}." + print(my_variant.reftype) + try: + parsed_v1 = validator.hp.parse(v1) + validator.vr.validate(parsed_v1) + except vvhgvs.exceptions.HGVSError as e: + if "is not known to be compatible with variant type" in str(e): + raise IncompatibleTypeError(str(e)) + import traceback + traceback.print_exc() + raise InvalidRangeError(f"{position_1} is an invlaid range for " + f"accession {accession_and_type.split(':')[0]}") + try: + parsed_v2 = validator.hp.parse(v2) + validator.vr.validate(parsed_v2) + except vvhgvs.exceptions.HGVSError as e: + if "is not known to be compatible with variant type" in str(e): + raise IncompatibleTypeError(str(e)) + import traceback + raise InvalidRangeError(f"{position_1} is an invlaid range for " + f"accession {accession_and_type.split(':')[0]}") + + my_variant.warnings = ["Uncertain positions are not fully supported, however the syntax is valid"] + if "NC_" in my_variant.quibble: + my_variant.hgvs_genomic = my_variant.quibble + if "NM_" in my_variant.quibble or "NR_" in my_variant.quibble or "ENST" in my_variant.quibble: + my_variant.hgvs_coding = my_variant.quibble + my_variant.hgvs_transcript_variant = my_variant.quibble + + + return + # # Copyright (C) 2016-2023 VariantValidator Contributors # diff --git a/VariantValidator/modules/format_converters.py b/VariantValidator/modules/format_converters.py index 76dcf38e..09899178 100644 --- a/VariantValidator/modules/format_converters.py +++ b/VariantValidator/modules/format_converters.py @@ -2,10 +2,11 @@ import vvhgvs.exceptions import copy import logging -from .variant import Variant -from . import seq_data -from . import utils as fn +from VariantValidator.modules.variant import Variant +from VariantValidator.modules import seq_data +from VariantValidator.modules import utils as fn import VariantValidator.modules.rna_formatter +from VariantValidator.modules import complex_descriptions, use_checking logger = logging.getLogger(__name__) @@ -37,7 +38,7 @@ def initial_format_conversions(variant, validator, select_transcripts_dict_plus_ return True # Uncertain positions - toskip = uncertain_pos(variant) + toskip = uncertain_pos(variant, validator) if toskip: return True @@ -1090,7 +1091,7 @@ def rna(variant, validator): return False -def uncertain_pos(variant): +def uncertain_pos(variant, validator): """ check for uncertain positions in the variant and return unsupported warning """ @@ -1104,9 +1105,22 @@ def uncertain_pos(variant): return False elif ":r.(" in to_check: return False - error = 'Uncertain positions are not currently supported' - variant.warnings.append(error) - logger.warning(str(error)) + else: + try: + complex_descriptions.uncertain_positions(variant, validator) + except complex_descriptions.IncompatibleTypeError: + print("YYAYAYAY") + + use_checking.refseq_common_mistakes(variant) + + + + import traceback + traceback.print_exc() + + # error = 'Uncertain positions are not currently supported' + # variant.warnings.append(error) + # logger.warning(str(error)) return True else: return False diff --git a/VariantValidator/modules/use_checking.py b/VariantValidator/modules/use_checking.py index 2c44abaf..993e4e94 100644 --- a/VariantValidator/modules/use_checking.py +++ b/VariantValidator/modules/use_checking.py @@ -13,6 +13,10 @@ def refseq_common_mistakes(variant): """ Evolving list of common mistakes, see sections below """ + + print("IN IT") + print(variant.quibble) + # NM_ .g if (variant.quibble.startswith('NM_') or variant.quibble.startswith('NR_')) and variant.reftype == ':g.': suggestion = variant.quibble.replace(':g.', ':c.') @@ -36,6 +40,7 @@ def refseq_common_mistakes(variant): 'Did you mean ' + suggestion + '?' variant.warnings.append(error) logger.warning(error) + print("YahBoo") return True # NM_ NC_ NG_ NR_ p. diff --git a/VariantValidator/modules/vvMixinCore.py b/VariantValidator/modules/vvMixinCore.py index 29db7ca6..fa4985c9 100644 --- a/VariantValidator/modules/vvMixinCore.py +++ b/VariantValidator/modules/vvMixinCore.py @@ -276,6 +276,16 @@ def validate(self, continue if toskip: + print("Out we go") + print(my_variant.warnings) + print(my_variant.hgvs_genomic) + if my_variant.warnings is not None and my_variant.hgvs_genomic is not None: + if "NC_" in my_variant.hgvs_genomic and "Uncertain positions" in str(my_variant.warnings): + my_variant.primary_assembly_loci = {my_variant.primary_assembly.lower(): + {"hgvs_genomic_description": + my_variant.hgvs_genomic}} + print(my_variant.primary_assembly_loci) + continue # INITIAL USER INPUT FORMATTING @@ -1355,7 +1365,6 @@ def validate(self, variant.stable_gene_ids = stable_gene_ids variant.annotations = reference_annotations - variant.hgvs_transcript_variant = tx_variant variant.genome_context_intronic_sequence = genome_context_transcript_variant variant.refseqgene_context_intronic_sequence = refseqgene_context_transcript_variant variant.hgvs_refseqgene_variant = refseqgene_variant @@ -1363,7 +1372,12 @@ def validate(self, variant.hgvs_lrg_transcript_variant = lrg_transcript_variant variant.hgvs_lrg_variant = lrg_variant variant.alt_genomic_loci = alt_genomic_dicts - variant.primary_assembly_loci = primary_genomic_dicts + if "Uncertain positions are not fully supported, however the syntax is valid" \ + not in str(variant.warnings): + variant.primary_assembly_loci = primary_genomic_dicts + variant.hgvs_transcript_variant = tx_variant + if variant.hgvs_transcript_variant is None: + variant.hgvs_transcript_variant = tx_variant variant.reference_sequence_records = '' variant.validated = True @@ -1372,107 +1386,111 @@ def validate(self, if ref_records != {}: variant.reference_sequence_records = ref_records - # Liftover intergenic positions genome to genome - if (variant.output_type_flag == 'intergenic' and liftover_level is not None) or \ - (('grch37' not in variant.primary_assembly_loci.keys() or - 'grch38' not in variant.primary_assembly_loci.keys() or - 'hg38' not in variant.primary_assembly_loci.keys() or - 'hg19' not in variant.primary_assembly_loci.keys()) - and liftover_level is not None): - - # Simple cache - lo_cache = {} - - # Attempt to liftover between genome builds - # Note: pyliftover uses the UCSC liftOver tool. - # https://pypi.org/project/pyliftover/ - genomic_position_info = variant.primary_assembly_loci - - for g_p_key in list(genomic_position_info.keys()): - build_to = '' - build_from = '' - - # Identify the current build and hgvs_genomic description - if 'hg' in g_p_key: - # incoming_vcf = genomic_position_info[g_p_key]['vcf'] - # set builds - if g_p_key == 'hg38': - build_to = 'hg19' - build_from = 'hg38' - if g_p_key == 'hg19': - build_to = 'hg38' - build_from = 'hg19' - elif 'grc' in g_p_key: - # incoming_vcf = genomic_position_info[g_p_key]['vcf'] - # set builds - if g_p_key == 'grch38': - build_to = 'GRCh37' - build_from = 'GRCh38' - if g_p_key == 'grch37': - build_to = 'GRCh38' - build_from = 'GRCh37' - - # Genome to Genome liftover if tx not annotated to the build - g_to_g = False - if variant.output_type_flag != 'intergenic': - g_to_g = True - - # Lift-over - if (genomic_position_info[g_p_key]['hgvs_genomic_description'] not in lo_cache.keys()) or ( - "NC_012920.1" in genomic_position_info[g_p_key]['hgvs_genomic_description'] - and build_from == "hg38" and build_to == "hg19"): - lifted_response = liftover(genomic_position_info[g_p_key]['hgvs_genomic_description'], - build_from, - build_to, variant.hn, variant.reverse_normalizer, - variant.evm, - self, - specify_tx=False, - liftover_level=liftover_level, - g_to_g=g_to_g) - - if "NC_012920.1" in genomic_position_info[g_p_key]['hgvs_genomic_description'] or \ - "NC_001807.4:" in genomic_position_info[g_p_key]['hgvs_genomic_description']: - capture_corrected_response = False - for key, val in lifted_response.items(): - if "grch38" in key: - capture_corrected_response = val - if val == {} and "grch37" in key: - if capture_corrected_response is not False: - lifted_response[key] = capture_corrected_response - for key, val in lifted_response.items(): - if "grch38" in key: - capture_corrected_response = val - if val == {} and "grch37" in key: - if capture_corrected_response is not False: - lifted_response[key] = capture_corrected_response - - lo_cache[genomic_position_info[g_p_key]['hgvs_genomic_description']] = lifted_response - else: - lifted_response = lo_cache[genomic_position_info[g_p_key]['hgvs_genomic_description']] + # Loop out uncertain position variants + if "Uncertain positions are not fully supported, however the syntax is valid" \ + not in str(variant.warnings): + + # Liftover intergenic positions genome to genome + if (variant.output_type_flag == 'intergenic' and liftover_level is not None) or \ + (('grch37' not in variant.primary_assembly_loci.keys() or + 'grch38' not in variant.primary_assembly_loci.keys() or + 'hg38' not in variant.primary_assembly_loci.keys() or + 'hg19' not in variant.primary_assembly_loci.keys()) + and liftover_level is not None): + + # Simple cache + lo_cache = {} + + # Attempt to liftover between genome builds + # Note: pyliftover uses the UCSC liftOver tool. + # https://pypi.org/project/pyliftover/ + genomic_position_info = variant.primary_assembly_loci + + for g_p_key in list(genomic_position_info.keys()): + build_to = '' + build_from = '' + + # Identify the current build and hgvs_genomic description + if 'hg' in g_p_key: + # incoming_vcf = genomic_position_info[g_p_key]['vcf'] + # set builds + if g_p_key == 'hg38': + build_to = 'hg19' + build_from = 'hg38' + if g_p_key == 'hg19': + build_to = 'hg38' + build_from = 'hg19' + elif 'grc' in g_p_key: + # incoming_vcf = genomic_position_info[g_p_key]['vcf'] + # set builds + if g_p_key == 'grch38': + build_to = 'GRCh37' + build_from = 'GRCh38' + if g_p_key == 'grch37': + build_to = 'GRCh38' + build_from = 'GRCh37' + + # Genome to Genome liftover if tx not annotated to the build + g_to_g = False + if variant.output_type_flag != 'intergenic' and variant.output_type_flag != "other": + g_to_g = True + + # Lift-over + if (genomic_position_info[g_p_key]['hgvs_genomic_description'] not in lo_cache.keys()) or ( + "NC_012920.1" in genomic_position_info[g_p_key]['hgvs_genomic_description'] + and build_from == "hg38" and build_to == "hg19"): + lifted_response = liftover(genomic_position_info[g_p_key]['hgvs_genomic_description'], + build_from, + build_to, variant.hn, variant.reverse_normalizer, + variant.evm, + self, + specify_tx=False, + liftover_level=liftover_level, + g_to_g=g_to_g) + + if "NC_012920.1" in genomic_position_info[g_p_key]['hgvs_genomic_description'] or \ + "NC_001807.4:" in genomic_position_info[g_p_key]['hgvs_genomic_description']: + capture_corrected_response = False + for key, val in lifted_response.items(): + if "grch38" in key: + capture_corrected_response = val + if val == {} and "grch37" in key: + if capture_corrected_response is not False: + lifted_response[key] = capture_corrected_response + for key, val in lifted_response.items(): + if "grch38" in key: + capture_corrected_response = val + if val == {} and "grch37" in key: + if capture_corrected_response is not False: + lifted_response[key] = capture_corrected_response + + lo_cache[genomic_position_info[g_p_key]['hgvs_genomic_description']] = lifted_response + else: + lifted_response = lo_cache[genomic_position_info[g_p_key]['hgvs_genomic_description']] - # Sort the respomse into primary assembly and ALT - primary_assembly_loci = {} - alt_genomic_loci = [] + # Sort the respomse into primary assembly and ALT + primary_assembly_loci = {} + alt_genomic_loci = [] - for build_key, accession_dict in list(lifted_response.items()): - try: - accession_key = list(accession_dict.keys())[0] - if 'NC_' in accession_dict[accession_key]['hgvs_genomic_description']: - primary_assembly_loci[build_key.lower()] = accession_dict[accession_key] - else: - alt_genomic_loci.append({build_key.lower(): accession_dict[accession_key]}) + for build_key, accession_dict in list(lifted_response.items()): + try: + accession_key = list(accession_dict.keys())[0] + if 'NC_' in accession_dict[accession_key]['hgvs_genomic_description']: + primary_assembly_loci[build_key.lower()] = accession_dict[accession_key] + else: + alt_genomic_loci.append({build_key.lower(): accession_dict[accession_key]}) - # KeyError if the dicts are empty - except KeyError: - continue - except IndexError: - continue + # KeyError if the dicts are empty + except KeyError: + continue + except IndexError: + continue - # Add the dictionaries from lifted response to the output - if primary_assembly_loci != {}: - variant.primary_assembly_loci = primary_assembly_loci - if alt_genomic_loci: - variant.alt_genomic_loci = alt_genomic_loci + # Add the dictionaries from lifted response to the output + if primary_assembly_loci != {}: + variant.primary_assembly_loci = primary_assembly_loci + if alt_genomic_loci: + variant.alt_genomic_loci = alt_genomic_loci # Add exon numbering information see issue # if variant.coding != "": diff --git a/tests/test_warnings.py b/tests/test_warnings.py index f9558b98..e776feab 100644 --- a/tests/test_warnings.py +++ b/tests/test_warnings.py @@ -869,6 +869,34 @@ def test_p_with_tc_ref(self): "variation is not HGVS compliant. Please select an appropriate protein reference sequence (NP_)" in \ results['validation_warning_1']['validation_warnings'] + def test_uncertain_1(self): + variant = 'NC_000005.9:g.(90136803_90144453)_(90159675_90261231)dup' + results = self.vv.validate(variant, 'GRCh38', 'all').format_as_dict(test=True) + print(results) + assert "Uncertain positions are not fully supported, however the syntax is valid" in \ + results['validation_warning_1']['validation_warnings'] + assert results['validation_warning_1']['primary_assembly_loci'] == { + "grch38": { + "hgvs_genomic_description": "NC_000005.9:g.(90136803_90144453)_(90159675_90261231)dup" + }} + + def test_uncertain_2(self): + variant = 'NM_006138.4:n.(1_20)_(30_36)del' + results = self.vv.validate(variant, 'GRCh38', 'all').format_as_dict(test=True) + print(results) + assert "Coding transcript reference sequence input as non-coding transcript (n.) reference sequence. " \ + "Did you mean NM_006138.4:c.(1_20)_(30_36)del?" in \ + results['validation_warning_1']['validation_warnings'] + + def test_uncertain_3(self): + variant = 'NM_006138.4:c.(1_20)_(30_36)del' + results = self.vv.validate(variant, 'GRCh38', 'all').format_as_dict(test=True) + print(results) + assert "Uncertain positions are not fully supported, however the syntax is valid" in \ + results['validation_warning_1']['validation_warnings'] + assert results['validation_warning_1']['hgvs_transcript_variant'] == "NM_006138.4:c.(1_20)_(30_36)del" + + # # Copyright (C) 2016-2023 VariantValidator Contributors #