Skip to content

Commit

Permalink
Fix variant region for ins and dup on intron-exon boundary
Browse files Browse the repository at this point in the history
  • Loading branch information
b0d0nne11 committed Dec 6, 2023
1 parent 0f1d05d commit ed861bd
Show file tree
Hide file tree
Showing 10 changed files with 177 additions and 21 deletions.
22 changes: 22 additions & 0 deletions src/hgvs/assemblymapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import logging

from bioutils.sequences import reverse_complement

import hgvs
import hgvs.normalizer
from hgvs.exceptions import (
Expand Down Expand Up @@ -241,6 +243,26 @@ def _alt_ac_for_tx_ac(self, tx_ac):
assert len(alt_acs) == 1, "Should have exactly one alignment at this point"
return alt_acs[0]

def _replace_reference(self, var):
if (getattr(var.posedit.pos.start, 'offset', 0) != 0 or getattr(var.posedit.pos.end, 'offset', 0) != 0):
if var.type != 'g':
alt_ac = self._alt_ac_for_tx_ac(var.ac)
if var.type == 'c':
var_g = super(AssemblyMapper, self).c_to_g(var, alt_ac, alt_aln_method=self.alt_aln_method)
if var.type == 'n':
var_g = super(AssemblyMapper, self).n_to_g(var, alt_ac, alt_aln_method=self.alt_aln_method)
if var.type == 't':
var_g = super(AssemblyMapper, self).t_to_g(var, alt_ac, alt_aln_method=self.alt_aln_method)
super(AssemblyMapper, self)._replace_reference(var_g)
ref = var_g.posedit.edit.ref
if self._fetch_AlignmentMapper(tx_ac=var.ac).strand == -1:
ref = reverse_complement(ref)
var.posedit.edit.ref = ref
_logger.debug("Replaced reference sequence in {var} with {ref}".format(var=var, ref=var_g.posedit.edit.ref))
return var

return super(AssemblyMapper, self)._replace_reference(var)

def _fetch_AlignmentMapper(self, tx_ac, alt_ac=None, alt_aln_method=None):
"""convenience version of VariantMapper._fetch_AlignmentMapper that
derives alt_ac from transcript, assembly, and alt_aln_method
Expand Down
20 changes: 17 additions & 3 deletions src/hgvs/utils/altseqbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import logging
import math

from hgvs.exceptions import HGVSError

import six
from bioutils.sequences import reverse_complement, translate_cds

Expand Down Expand Up @@ -192,8 +194,17 @@ def _get_variant_region(self):
and self._var_c.posedit.pos.end.datum == Datum.CDS_END
):
result = self.WHOLE_GENE
elif self._var_c.posedit.edit.type == "ins" and self._var_c.posedit.pos.start.offset == -1 and self._var_c.posedit.pos.end.offset == 0:
# ins at intron-exon boundary
result = self.EXON
elif self._var_c.posedit.edit.type == "ins" and self._var_c.posedit.pos.start.offset == 0 and self._var_c.posedit.pos.end.offset == 1:
# ins at exon-intron boundary
result = self.EXON
elif self._var_c.posedit.edit.type == "dup" and self._var_c.posedit.pos.start.offset <= -1 and self._var_c.posedit.pos.end.offset >= -1:
# dup at intron-exon boundary
result = self.EXON
elif self._var_c.posedit.pos.start.offset != 0 or self._var_c.posedit.pos.end.offset != 0:
# leave out anything intronic for now
# leave out anything else intronic for now
result = self.INTRON
else: # anything else that contains an exon
result = self.EXON
Expand Down Expand Up @@ -263,7 +274,10 @@ def _incorporate_dup(self):
"""Incorporate dup into sequence"""
seq, cds_start, cds_stop, start, end = self._setup_incorporate()

dup_seq = seq[start:end]
if not self._var_c.posedit.edit.ref:
raise HGVSError('Duplication variant is missing reference sequence')

dup_seq = self._var_c.posedit.edit.ref
seq[end:end] = dup_seq

is_frameshift = len(dup_seq) % 3 != 0
Expand Down Expand Up @@ -326,7 +340,7 @@ def _setup_incorporate(self):
result = cds_start - 1
else: # cds/intron
if pos.offset <= 0:
result = (cds_start - 1) + pos.base - 1
result = (cds_start - 1) + pos.base + pos.offset - 1
else:
result = (cds_start - 1) + pos.base
elif pos.datum == Datum.CDS_END: # 3' UTR
Expand Down
14 changes: 7 additions & 7 deletions src/hgvs/variantmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,7 @@ def c_to_p(self, var_c, pro_ac=None):
raise HGVSInvalidVariantError("Expected a cDNA (c.) variant; got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
self._replace_reference(var_c)
reference_data = RefTranscriptData(self.hdp, var_c.ac, pro_ac)
builder = altseqbuilder.AltSeqBuilder(var_c, reference_data)

Expand Down Expand Up @@ -468,11 +469,10 @@ def _replace_reference(self, var):
return var

pos = var.posedit.pos
if (isinstance(pos.start, hgvs.location.BaseOffsetPosition) and pos.start.offset != 0) or (
isinstance(pos.end, hgvs.location.BaseOffsetPosition) and pos.end.offset != 0
):
_logger.info("Can't update reference sequence for intronic variant {}".format(var))
return var
if (getattr(pos.start, 'offset', 0) != 0 or getattr(pos.end, 'offset', 0) != 0):
if var.type != "g":
_logger.info("Can't update reference sequence for intronic variant {}".format(var))
return var

# For c. variants, we need coords on underlying sequences
if var.type == "c":
Expand All @@ -483,8 +483,8 @@ def _replace_reference(self, var):
else:
pos = var.posedit.pos

seq_start = pos.start.base - 1
seq_end = pos.end.base
seq_start = pos.start.base + getattr(pos.start, 'offset', 0) - 1
seq_end = pos.end.base + getattr(pos.end, 'offset', 0)

# When strict_bounds is False and an error occurs, return
# variant as-is
Expand Down
Binary file modified tests/data/cache-py3.hdp
Binary file not shown.
20 changes: 10 additions & 10 deletions tests/data/sanity_cp.tsv
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
accession transcript_sequence cds_start_i cds_end_i
NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39
NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39
NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39
NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39
NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42
NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45
NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39
NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39
NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40
accession transcript_sequence cds_start_i cds_end_i lengths
NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39 [10,25,30]
NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39 [10,25,30]
NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42 [10,25,30]
NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45 [10,25,30]
NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40 [10,25,30]
3 changes: 2 additions & 1 deletion tests/support/mock_input_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def fetch_transcript_info(self, ac):
result = None
data = self._mock_data.get(ac)
if data: # interbase coordinates
result = {"cds_start_i": data["cds_start_i"], "cds_end_i": data["cds_end_i"]}
result = {"cds_start_i": data["cds_start_i"], "cds_end_i": data["cds_end_i"], "lengths": data["lengths"]}
return result

def get_tx_identity_info(self, ac):
Expand Down Expand Up @@ -74,6 +74,7 @@ def _read_input(self, in_file):
"transcript_sequence": row["transcript_sequence"],
"cds_start_i": int(row["cds_start_i"]),
"cds_end_i": int(row["cds_end_i"]),
"lengths": eval(row["lengths"]),
}

return result
Expand Down
51 changes: 51 additions & 0 deletions tests/test_hgvs_assemblymapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,57 @@ def test_c_to_p_with_stop_gain(self):

self.assertEqual(str(var_p), hgvs_p)

def test_map_of_dup_intron_exon_boundary(self):
hgvs_c = "NM_004380.2:c.3251-1dup"
hgvs_p = "NP_004371.2:p.(Ile1084SerfsTer3)"

var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.am.c_to_p(var_c)

self.assertEqual(str(var_p), hgvs_p)

hgvs_c = "NM_024529.4:c.132-2_132-1dup"
hgvs_p = "NP_078805.3:p.(Thr45GlyfsTer65)"

var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.am.c_to_p(var_c)

self.assertEqual(str(var_p), hgvs_p)

hgvs_c = "NM_004985.4:c.112-1_113dup"
hgvs_p = "NP_004976.2:p.(Glu37dup)"

var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.am.c_to_p(var_c)

self.assertEqual(str(var_p), hgvs_p)

hgvs_c = "NM_004380.2:c.3251dup"
hgvs_p = "NP_004371.2:p.(Phe1085LeufsTer2)"

var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.am.c_to_p(var_c)

self.assertEqual(str(var_p), hgvs_p)

def test_map_of_dup_exon_intron_boundary(self):
hgvs_c = "NM_021140.3:c.619dup"
hgvs_p = "NP_066963.2:p.(Ile207AsnfsTer10)"

var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.am.c_to_p(var_c)

self.assertEqual(str(var_p), hgvs_p)

def test_map_of_dup_intron_exon_boundary_rc(self):
hgvs_c = "NM_004985.4:c.112-1_112dup"
hgvs_p = "NP_004976.2:p.(Asp38GlyfsTer8)"

var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.am.c_to_p(var_c)

self.assertEqual(str(var_p), hgvs_p)


class Test_RefReplacement(unittest.TestCase):
test_cases = [
Expand Down
45 changes: 45 additions & 0 deletions tests/test_hgvs_variantmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,51 @@ def test_map_of_c_out_of_reference_bound(self):
with pytest.raises(HGVSError, match="coordinate is out of bounds"):
self.vm.c_to_p(var_c)

def test_map_of_ins_intron_exon_boundary(self):
hgvs_c = "NM_004380.2:c.3251-1_3251insA"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.vm.c_to_p(var_c)
self.assertEqual(str(var_p), "NP_004371.2:p.(Ile1084AsnfsTer3)")

def test_map_of_ins_exon_intron_boundary(self):
hgvs_c = "NM_004380.2:c.3250_3250+1insT"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.vm.c_to_p(var_c)
self.assertEqual(str(var_p), "NP_004371.2:p.(Phe1085LeufsTer2)")

def test_map_of_dup_intron_exon_boundary(self):
hgvs_c = "NM_004380.2:c.3251-1dup"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
with self.assertRaises(HGVSError):
var_p = self.vm.c_to_p(var_c)

hgvs_c = "NM_024529.4:c.132-2_132-1dup"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
with self.assertRaises(HGVSError):
var_p = self.vm.c_to_p(var_c)

hgvs_c = "NM_004985.4:c.112-1_113dup"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
with self.assertRaises(HGVSError):
var_p = self.vm.c_to_p(var_c)

hgvs_c = "NM_004380.2:c.3251dup"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.vm.c_to_p(var_c)
self.assertEqual(str(var_p), "NP_004371.2:p.(Phe1085LeufsTer2)")

def test_map_of_dup_exon_intron_boundary(self):
hgvs_c = "NM_021140.3:c.619dup"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.vm.c_to_p(var_c)
self.assertEqual(str(var_p), "NP_066963.2:p.(Ile207AsnfsTer10)")

def test_map_of_dup_intron_exon_boundary_rc(self):
hgvs_c = "NM_004985.4:c.112-1_112dup"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
with self.assertRaises(HGVSError):
var_p = self.vm.c_to_p(var_c)


if __name__ == "__main__":
unittest.main()
Expand Down
1 change: 1 addition & 0 deletions tests/test_hgvs_variantmapper_cp_altseqbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def test_sequence_with_length_that_is_not_divisible_by_3(self):
def _run_comparison(self, hgvsc, expected_sequence):
ac_p = "DUMMY"
var = self._parser.parse_hgvs_variant(hgvsc)
var.fill_ref(self._datasource)
transcript_data = RefTranscriptData(hdp=self._datasource, tx_ac=var.ac, pro_ac=ac_p)

builder = altseqbuilder.AltSeqBuilder(var, transcript_data)
Expand Down
22 changes: 22 additions & 0 deletions tests/test_hgvs_variantmapper_cp_sanity.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import hgvs.parser
import hgvs.variantmapper as variantmapper
from hgvs.exceptions import HGVSError


class TestHgvsCToP(unittest.TestCase):
Expand Down Expand Up @@ -154,6 +155,27 @@ def test_intron(self):
hgvsp_expected = "MOCK:p.?"
self._run_conversion(hgvsc, hgvsp_expected)

def test_ins_intron_exon_boundary(self):
hgvsc = "NM_999999.1:c.9-1_9insG"
hgvsp_expected = "MOCK:p.(Lys4GlufsTer?)"
self._run_conversion(hgvsc, hgvsp_expected)

def test_ins_exon_intron_boundary(self):
hgvsc = "NM_999999.1:c.39_39+1insC"
hgvsp_expected = "MOCK:p.(=)"
self._run_conversion(hgvsc, hgvsp_expected)

def test_dup_intron_exon_boundary(self):
hgvsc = "NM_999999.1:c.9-1dup"
hgvsp_expected = "MOCK:p.?"
with self.assertRaises(HGVSError):
self._run_conversion(hgvsc, hgvsp_expected)

def test_dup_exon_intron_boundary(self):
hgvsc = "NM_999999.1:c.39+1dup"
hgvsp_expected = "MOCK:p.?"
self._run_conversion(hgvsc, hgvsp_expected)

def test_five_prime_utr(self):
hgvsc = "NM_999999.1:c.-2A>G"
hgvsp_expected = "MOCK:p.?"
Expand Down

0 comments on commit ed861bd

Please sign in to comment.