Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix variant region for ins and dup on intron-exon boundary #709

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions src/hgvs/assemblymapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import logging

from bioutils.sequences import reverse_complement

import hgvs
import hgvs.normalizer
from hgvs.exceptions import (
Expand Down Expand Up @@ -239,6 +241,27 @@ def _alt_ac_for_tx_ac(self, tx_ac):
assert len(alt_acs) == 1, "Should have exactly one alignment at this point"
return alt_acs[0]

def _replace_reference(self, var):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to me that we should have only one _replace_reference. It's currently in variantmapper. Is there a reason that one doesn't work / can't be extended?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. In order to get the reference value for these variants we need to first map them back to the var_g form which requires an assembly mapper afaik. We don't actually call the *_to_g function from the AM because that includes additional normalization. Instead we use the AM to get the alt_ac and then call *_to_g from the VM with the alt_ac and alt_aln_method from the AM. In all other cases this method simply calls _replace_reference from the variant mapper instead. This is all so that we can get a portion of the reference sequence from the intronic section.

Copy link
Member

@andreasprlic andreasprlic Apr 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The AssemblyMapper has VariantMapper as a parent class. In that light, do we need to have a method of the same name? I think this is confusing, since this one does something different. Also, the AM has a normalize property. Could that get used to prevent what seem to be an undesired side-effects? I assume the intent is not to call _maybe_normalize?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @andreasprlic,

I don't think I understand your first question. This _replace_reference method overrides the method of the same name from the parent VariantMapper class. It has the same purpose and type signature. The only difference is that I've added logic to handle c and n type variants with non-zero offsets.

I thought about disabling self.normalize temporarily in AssemblyMapper to avoid calling the _maybe_normalize method but I would also have to make sure the original value of self.normalize is replaced afterward, probably with a try...finally block. I thought it was simpler to call the underlying functions directly but I can rework this if you'd prefer.

Thanks!

if (
var.type in "cn"
and var.posedit.pos is not None
and (var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0)
):
alt_ac = self._alt_ac_for_tx_ac(var.ac)
if var.type == 'c':
var_g = super(AssemblyMapper, self).c_to_g(var, alt_ac, alt_aln_method=self.alt_aln_method)
if var.type == 'n':
var_g = super(AssemblyMapper, self).n_to_g(var, alt_ac, alt_aln_method=self.alt_aln_method)
super(AssemblyMapper, self)._replace_reference(var_g)
ref = var_g.posedit.edit.ref
if self._fetch_AlignmentMapper(tx_ac=var.ac).strand == -1:
ref = reverse_complement(ref)
var.posedit.edit.ref = ref
_logger.debug("Replaced reference sequence in {var} with {ref}".format(var=var, ref=var_g.posedit.edit.ref))
return var

return super(AssemblyMapper, self)._replace_reference(var)

def _fetch_AlignmentMapper(self, tx_ac, alt_ac=None, alt_aln_method=None):
"""convenience version of VariantMapper._fetch_AlignmentMapper that
derives alt_ac from transcript, assembly, and alt_aln_method
Expand Down
27 changes: 22 additions & 5 deletions src/hgvs/utils/altseqbuilder.py
b0d0nne11 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import logging
import math

from hgvs.exceptions import HGVSError

import six
from bioutils.sequences import reverse_complement, translate_cds

Expand Down Expand Up @@ -195,8 +197,20 @@ def _get_variant_region(self):
and self._var_c.posedit.pos.end.datum == Datum.CDS_END
):
result = self.WHOLE_GENE
elif self._var_c.posedit.edit.type == "ins" and self._var_c.posedit.pos.start.offset == -1 and self._var_c.posedit.pos.end.offset == 0:
# ins at intron-exon boundary
result = self.EXON
elif self._var_c.posedit.edit.type == "ins" and self._var_c.posedit.pos.start.offset == 0 and self._var_c.posedit.pos.end.offset == 1:
# ins at exon-intron boundary
result = self.EXON
elif self._var_c.posedit.edit.type == "dup" and self._var_c.posedit.pos.end.offset == -1:
# dup at intron-exon boundary
result = self.EXON
elif self._var_c.posedit.edit.type == "dup" and self._var_c.posedit.pos.start.offset == 1:
# dup at exon-intron boundary
result = self.EXON
elif self._var_c.posedit.pos.start.offset != 0 or self._var_c.posedit.pos.end.offset != 0:
# leave out anything intronic for now
# leave out anything else intronic for now
result = self.INTRON
else: # anything else that contains an exon
result = self.EXON
Expand Down Expand Up @@ -266,7 +280,10 @@ def _incorporate_dup(self):
"""Incorporate dup into sequence"""
seq, cds_start, cds_stop, start, end = self._setup_incorporate()

dup_seq = seq[start:end]
if not self._var_c.posedit.edit.ref:
raise HGVSError('Duplication variant is missing reference sequence')

dup_seq = self._var_c.posedit.edit.ref
seq[end:end] = dup_seq

is_frameshift = len(dup_seq) % 3 != 0
Expand Down Expand Up @@ -328,10 +345,10 @@ def _setup_incorporate(self):
if pos.base < 0: # 5' UTR
result = cds_start - 1
else: # cds/intron
if pos.offset <= 0:
result = (cds_start - 1) + pos.base - 1
if pos.offset < 0:
result = (cds_start - 1) + pos.base - 2
else:
result = (cds_start - 1) + pos.base
result = (cds_start - 1) + pos.base - 1
elif pos.datum == Datum.CDS_END: # 3' UTR
result = cds_stop + pos.base - 1
else:
Expand Down
11 changes: 8 additions & 3 deletions src/hgvs/variantmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,7 @@ def c_to_p(self, var_c, pro_ac=None):
raise HGVSInvalidVariantError("Expected a cDNA (c.) variant; got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
self._replace_reference(var_c)
reference_data = RefTranscriptData(self.hdp, var_c.ac, pro_ac)
builder = altseqbuilder.AltSeqBuilder(var_c, reference_data)

Expand Down Expand Up @@ -465,9 +466,10 @@ def _replace_reference(self, var):
# these types have no reference sequence (zero-width), so return as-is
return var

pos = var.posedit.pos
if (isinstance(pos.start, hgvs.location.BaseOffsetPosition) and pos.start.offset != 0) or (
isinstance(pos.end, hgvs.location.BaseOffsetPosition) and pos.end.offset != 0
if (
var.type in "cnr"
and var.posedit.pos is not None
and (var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0)
):
_logger.info("Can't update reference sequence for intronic variant {}".format(var))
return var
Expand All @@ -483,6 +485,9 @@ def _replace_reference(self, var):

seq_start = pos.start.base - 1
seq_end = pos.end.base
if var.type in "cnr":
seq_start += pos.start.offset
seq_end += pos.end.offset

# When strict_bounds is False and an error occurs, return
# variant as-is
Expand Down
Binary file modified tests/data/cache-py3.hdp
Binary file not shown.
20 changes: 10 additions & 10 deletions tests/data/sanity_cp.tsv
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
accession transcript_sequence cds_start_i cds_end_i
NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39
NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39
NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39
NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39
NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42
NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45
NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39
NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39
NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40
accession transcript_sequence cds_start_i cds_end_i lengths
NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39 [10,25,30]
NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39 [10,25,30]
NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42 [10,25,30]
NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45 [10,25,30]
NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40 [10,25,30]
3 changes: 2 additions & 1 deletion tests/support/mock_input_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def fetch_transcript_info(self, ac):
result = None
data = self._mock_data.get(ac)
if data: # interbase coordinates
result = {"cds_start_i": data["cds_start_i"], "cds_end_i": data["cds_end_i"]}
result = {"cds_start_i": data["cds_start_i"], "cds_end_i": data["cds_end_i"], "lengths": data["lengths"]}
return result

def get_tx_identity_info(self, ac):
Expand Down Expand Up @@ -72,6 +72,7 @@ def _read_input(self, in_file):
"transcript_sequence": row["transcript_sequence"],
"cds_start_i": int(row["cds_start_i"]),
"cds_end_i": int(row["cds_end_i"]),
"lengths": eval(row["lengths"]),
}

return result
Expand Down
60 changes: 60 additions & 0 deletions tests/test_hgvs_assemblymapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,66 @@ def test_c_to_p_with_stop_gain(self):

self.assertEqual(str(var_p), hgvs_p)

def test_map_of_dup_intron_exon_boundary(self):
hgvs_c = "NM_024529.4:c.132-1_132dup"
hgvs_p = "NP_078805.3:p.?"

var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.am.c_to_p(var_c)

self.assertEqual(str(var_p), hgvs_p)

hgvs_c = "NM_024529.4:c.132-2_132-1dup"
hgvs_p = "NP_078805.3:p.(Thr45GlyfsTer65)"

var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.am.c_to_p(var_c)

self.assertEqual(str(var_p), hgvs_p)

hgvs_c = "NM_004985.4:c.112-8_112-1dup"
hgvs_p = "NP_004976.2:p.(Asp38LeufsTer10)"

var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.am.c_to_p(var_c)

self.assertEqual(str(var_p), hgvs_p)

def test_map_of_dup_intron_exon_boundary_rc(self):
hgvs_c = "NM_004380.2:c.3251-1dup"
hgvs_p = "NP_004371.2:p.(Ile1084SerfsTer3)"

var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.am.c_to_p(var_c)

self.assertEqual(str(var_p), hgvs_p)

def test_map_of_dup_exon_intron_boundary(self):
hgvs_c = "NM_024529.4:c.130_131+1dup"
hgvs_p = "NP_078805.3:p.?"

var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.am.c_to_p(var_c)

self.assertEqual(str(var_p), hgvs_p)

hgvs_c = "NM_024529.4:c.131+1_131+3dup"
hgvs_p = "NP_078805.3:p.(Thr45Ter)"

var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.am.c_to_p(var_c)

self.assertEqual(str(var_p), hgvs_p)

def test_map_of_dup_exon_intron_boundary_rc(self):
hgvs_c = "NM_004985.4:c.111+1_111+4dup"
hgvs_p = "NP_004976.2:p.(Asp38ValfsTer11)"

var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.am.c_to_p(var_c)

self.assertEqual(str(var_p), hgvs_p)


class Test_RefReplacement(unittest.TestCase):
test_cases = [
Expand Down
51 changes: 51 additions & 0 deletions tests/test_hgvs_variantmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,57 @@ def test_map_of_dup_three_prime_utr(self):
var_p = self.vm.c_to_p(var_c)
self.assertEqual(str(var_p), "NP_694955.2:p.?")

def test_map_of_ins_intron_exon_boundary(self):
hgvs_c = "NM_004380.2:c.3251-1_3251insA"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.vm.c_to_p(var_c)
self.assertEqual(str(var_p), "NP_004371.2:p.(Ile1084AsnfsTer3)")

def test_map_of_ins_exon_intron_boundary(self):
hgvs_c = "NM_004380.2:c.3250_3250+1insT"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.vm.c_to_p(var_c)
self.assertEqual(str(var_p), "NP_004371.2:p.(Phe1085LeufsTer2)")

def test_map_of_dup_intron_exon_boundary(self):
hgvs_c = "NM_024529.4:c.132-1_132dup"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.vm.c_to_p(var_c)
self.assertEqual(str(var_p), "NP_078805.3:p.?")

hgvs_c = "NM_024529.4:c.132-2_132-1dup"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
with self.assertRaises(HGVSError):
var_p = self.vm.c_to_p(var_c)

hgvs_c = "NM_004985.4:c.112-8_112-1dup"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
with self.assertRaises(HGVSError):
var_p = self.vm.c_to_p(var_c)

def test_map_of_dup_intron_exon_boundary_rc(self):
hgvs_c = "NM_004380.2:c.3251-1dup"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
with self.assertRaises(HGVSError):
var_p = self.vm.c_to_p(var_c)

def test_map_of_dup_exon_intron_boundary(self):
hgvs_c = "NM_024529.4:c.130_131+1dup"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
var_p = self.vm.c_to_p(var_c)
self.assertEqual(str(var_p), "NP_078805.3:p.?")

hgvs_c = "NM_024529.4:c.131+1_131+3dup"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
with self.assertRaises(HGVSError):
var_p = self.vm.c_to_p(var_c)

def test_map_of_dup_exon_intron_boundary_rc(self):
hgvs_c = "NM_004985.4:c.111+1_111+4dup"
var_c = self.hp.parse_hgvs_variant(hgvs_c)
with self.assertRaises(HGVSError):
var_p = self.vm.c_to_p(var_c)


if __name__ == "__main__":
unittest.main()
Expand Down
1 change: 1 addition & 0 deletions tests/test_hgvs_variantmapper_cp_altseqbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def test_sequence_with_length_that_is_not_divisible_by_3(self):
def _run_comparison(self, hgvsc, expected_sequence):
ac_p = "DUMMY"
var = self._parser.parse_hgvs_variant(hgvsc)
var.fill_ref(self._datasource)
transcript_data = RefTranscriptData(hdp=self._datasource, tx_ac=var.ac, pro_ac=ac_p)

builder = altseqbuilder.AltSeqBuilder(var, transcript_data)
Expand Down
23 changes: 23 additions & 0 deletions tests/test_hgvs_variantmapper_cp_sanity.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import hgvs.parser
import hgvs.variantmapper as variantmapper
from hgvs.exceptions import HGVSError


class TestHgvsCToP(unittest.TestCase):
Expand Down Expand Up @@ -152,6 +153,28 @@ def test_intron(self):
hgvsp_expected = "MOCK:p.?"
self._run_conversion(hgvsc, hgvsp_expected)

def test_ins_intron_exon_boundary(self):
hgvsc = "NM_999999.1:c.9-1_9insG"
hgvsp_expected = "MOCK:p.(Lys4GlufsTer?)"
self._run_conversion(hgvsc, hgvsp_expected)

def test_ins_exon_intron_boundary(self):
hgvsc = "NM_999999.1:c.39_39+1insC"
hgvsp_expected = "MOCK:p.(=)"
self._run_conversion(hgvsc, hgvsp_expected)

def test_dup_intron_exon_boundary(self):
hgvsc = "NM_999999.1:c.9-1dup"
hgvsp_expected = "MOCK:p.?"
with self.assertRaises(HGVSError):
self._run_conversion(hgvsc, hgvsp_expected)

def test_dup_exon_intron_boundary(self):
hgvsc = "NM_999999.1:c.39+1dup"
hgvsp_expected = "MOCK:p.?"
with self.assertRaises(HGVSError):
self._run_conversion(hgvsc, hgvsp_expected)

def test_five_prime_utr(self):
hgvsc = "NM_999999.1:c.-2A>G"
hgvsp_expected = "MOCK:p.?"
Expand Down
Loading