diff --git a/src/hgvs/assemblymapper.py b/src/hgvs/assemblymapper.py index e14e07ba..28984c30 100644 --- a/src/hgvs/assemblymapper.py +++ b/src/hgvs/assemblymapper.py @@ -2,6 +2,8 @@ import logging +from bioutils.sequences import reverse_complement + import hgvs import hgvs.normalizer from hgvs.exceptions import ( @@ -239,6 +241,27 @@ def _alt_ac_for_tx_ac(self, tx_ac): assert len(alt_acs) == 1, "Should have exactly one alignment at this point" return alt_acs[0] + def _replace_reference(self, var): + if ( + var.type in "cn" + and var.posedit.pos is not None + and (var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0) + ): + alt_ac = self._alt_ac_for_tx_ac(var.ac) + if var.type == 'c': + var_g = super(AssemblyMapper, self).c_to_g(var, alt_ac, alt_aln_method=self.alt_aln_method) + if var.type == 'n': + var_g = super(AssemblyMapper, self).n_to_g(var, alt_ac, alt_aln_method=self.alt_aln_method) + super(AssemblyMapper, self)._replace_reference(var_g) + ref = var_g.posedit.edit.ref + if self._fetch_AlignmentMapper(tx_ac=var.ac).strand == -1: + ref = reverse_complement(ref) + var.posedit.edit.ref = ref + _logger.debug("Replaced reference sequence in {var} with {ref}".format(var=var, ref=var_g.posedit.edit.ref)) + return var + + return super(AssemblyMapper, self)._replace_reference(var) + def _fetch_AlignmentMapper(self, tx_ac, alt_ac=None, alt_aln_method=None): """convenience version of VariantMapper._fetch_AlignmentMapper that derives alt_ac from transcript, assembly, and alt_aln_method diff --git a/src/hgvs/utils/altseqbuilder.py b/src/hgvs/utils/altseqbuilder.py index 74e706c8..f1f012f5 100644 --- a/src/hgvs/utils/altseqbuilder.py +++ b/src/hgvs/utils/altseqbuilder.py @@ -9,6 +9,8 @@ import logging import math +from hgvs.exceptions import HGVSError + import six from bioutils.sequences import reverse_complement, translate_cds @@ -195,8 +197,20 @@ def _get_variant_region(self): and self._var_c.posedit.pos.end.datum == Datum.CDS_END ): result = self.WHOLE_GENE + elif self._var_c.posedit.edit.type == "ins" and self._var_c.posedit.pos.start.offset == -1 and self._var_c.posedit.pos.end.offset == 0: + # ins at intron-exon boundary + result = self.EXON + elif self._var_c.posedit.edit.type == "ins" and self._var_c.posedit.pos.start.offset == 0 and self._var_c.posedit.pos.end.offset == 1: + # ins at exon-intron boundary + result = self.EXON + elif self._var_c.posedit.edit.type == "dup" and self._var_c.posedit.pos.end.offset == -1: + # dup at intron-exon boundary + result = self.EXON + elif self._var_c.posedit.edit.type == "dup" and self._var_c.posedit.pos.start.offset == 1: + # dup at exon-intron boundary + result = self.EXON elif self._var_c.posedit.pos.start.offset != 0 or self._var_c.posedit.pos.end.offset != 0: - # leave out anything intronic for now + # leave out anything else intronic for now result = self.INTRON else: # anything else that contains an exon result = self.EXON @@ -266,7 +280,10 @@ def _incorporate_dup(self): """Incorporate dup into sequence""" seq, cds_start, cds_stop, start, end = self._setup_incorporate() - dup_seq = seq[start:end] + if not self._var_c.posedit.edit.ref: + raise HGVSError('Duplication variant is missing reference sequence') + + dup_seq = self._var_c.posedit.edit.ref seq[end:end] = dup_seq is_frameshift = len(dup_seq) % 3 != 0 @@ -328,10 +345,10 @@ def _setup_incorporate(self): if pos.base < 0: # 5' UTR result = cds_start - 1 else: # cds/intron - if pos.offset <= 0: - result = (cds_start - 1) + pos.base - 1 + if pos.offset < 0: + result = (cds_start - 1) + pos.base - 2 else: - result = (cds_start - 1) + pos.base + result = (cds_start - 1) + pos.base - 1 elif pos.datum == Datum.CDS_END: # 3' UTR result = cds_stop + pos.base - 1 else: diff --git a/src/hgvs/variantmapper.py b/src/hgvs/variantmapper.py index d73b98b7..08a11149 100644 --- a/src/hgvs/variantmapper.py +++ b/src/hgvs/variantmapper.py @@ -431,6 +431,7 @@ def c_to_p(self, var_c, pro_ac=None): raise HGVSInvalidVariantError("Expected a cDNA (c.) variant; got " + str(var_c)) if self._validator: self._validator.validate(var_c) + self._replace_reference(var_c) reference_data = RefTranscriptData(self.hdp, var_c.ac, pro_ac) builder = altseqbuilder.AltSeqBuilder(var_c, reference_data) @@ -465,9 +466,10 @@ def _replace_reference(self, var): # these types have no reference sequence (zero-width), so return as-is return var - pos = var.posedit.pos - if (isinstance(pos.start, hgvs.location.BaseOffsetPosition) and pos.start.offset != 0) or ( - isinstance(pos.end, hgvs.location.BaseOffsetPosition) and pos.end.offset != 0 + if ( + var.type in "cnr" + and var.posedit.pos is not None + and (var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0) ): _logger.info("Can't update reference sequence for intronic variant {}".format(var)) return var @@ -483,6 +485,9 @@ def _replace_reference(self, var): seq_start = pos.start.base - 1 seq_end = pos.end.base + if var.type in "cnr": + seq_start += pos.start.offset + seq_end += pos.end.offset # When strict_bounds is False and an error occurs, return # variant as-is diff --git a/tests/data/cache-py3.hdp b/tests/data/cache-py3.hdp index 1fbcd59c..454654af 100644 Binary files a/tests/data/cache-py3.hdp and b/tests/data/cache-py3.hdp differ diff --git a/tests/data/sanity_cp.tsv b/tests/data/sanity_cp.tsv index 81fc9dee..3d23a3d7 100644 --- a/tests/data/sanity_cp.tsv +++ b/tests/data/sanity_cp.tsv @@ -1,10 +1,10 @@ -accession transcript_sequence cds_start_i cds_end_i -NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39 -NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39 -NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39 -NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39 -NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42 -NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45 -NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 -NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 -NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40 +accession transcript_sequence cds_start_i cds_end_i lengths +NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39 [10,25,30] +NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39 [10,25,30] +NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30] +NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30] +NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42 [10,25,30] +NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45 [10,25,30] +NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30] +NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30] +NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40 [10,25,30] diff --git a/tests/support/mock_input_source.py b/tests/support/mock_input_source.py index 90a2dd6a..b646f809 100644 --- a/tests/support/mock_input_source.py +++ b/tests/support/mock_input_source.py @@ -31,7 +31,7 @@ def fetch_transcript_info(self, ac): result = None data = self._mock_data.get(ac) if data: # interbase coordinates - result = {"cds_start_i": data["cds_start_i"], "cds_end_i": data["cds_end_i"]} + result = {"cds_start_i": data["cds_start_i"], "cds_end_i": data["cds_end_i"], "lengths": data["lengths"]} return result def get_tx_identity_info(self, ac): @@ -72,6 +72,7 @@ def _read_input(self, in_file): "transcript_sequence": row["transcript_sequence"], "cds_start_i": int(row["cds_start_i"]), "cds_end_i": int(row["cds_end_i"]), + "lengths": eval(row["lengths"]), } return result diff --git a/tests/test_hgvs_assemblymapper.py b/tests/test_hgvs_assemblymapper.py index 460ea299..5456eb75 100644 --- a/tests/test_hgvs_assemblymapper.py +++ b/tests/test_hgvs_assemblymapper.py @@ -198,6 +198,66 @@ def test_c_to_p_with_stop_gain(self): self.assertEqual(str(var_p), hgvs_p) + def test_map_of_dup_intron_exon_boundary(self): + hgvs_c = "NM_024529.4:c.132-1_132dup" + hgvs_p = "NP_078805.3:p.?" + + var_c = self.hp.parse_hgvs_variant(hgvs_c) + var_p = self.am.c_to_p(var_c) + + self.assertEqual(str(var_p), hgvs_p) + + hgvs_c = "NM_024529.4:c.132-2_132-1dup" + hgvs_p = "NP_078805.3:p.(Thr45GlyfsTer65)" + + var_c = self.hp.parse_hgvs_variant(hgvs_c) + var_p = self.am.c_to_p(var_c) + + self.assertEqual(str(var_p), hgvs_p) + + hgvs_c = "NM_004985.4:c.112-8_112-1dup" + hgvs_p = "NP_004976.2:p.(Asp38LeufsTer10)" + + var_c = self.hp.parse_hgvs_variant(hgvs_c) + var_p = self.am.c_to_p(var_c) + + self.assertEqual(str(var_p), hgvs_p) + + def test_map_of_dup_intron_exon_boundary_rc(self): + hgvs_c = "NM_004380.2:c.3251-1dup" + hgvs_p = "NP_004371.2:p.(Ile1084SerfsTer3)" + + var_c = self.hp.parse_hgvs_variant(hgvs_c) + var_p = self.am.c_to_p(var_c) + + self.assertEqual(str(var_p), hgvs_p) + + def test_map_of_dup_exon_intron_boundary(self): + hgvs_c = "NM_024529.4:c.130_131+1dup" + hgvs_p = "NP_078805.3:p.?" + + var_c = self.hp.parse_hgvs_variant(hgvs_c) + var_p = self.am.c_to_p(var_c) + + self.assertEqual(str(var_p), hgvs_p) + + hgvs_c = "NM_024529.4:c.131+1_131+3dup" + hgvs_p = "NP_078805.3:p.(Thr45Ter)" + + var_c = self.hp.parse_hgvs_variant(hgvs_c) + var_p = self.am.c_to_p(var_c) + + self.assertEqual(str(var_p), hgvs_p) + + def test_map_of_dup_exon_intron_boundary_rc(self): + hgvs_c = "NM_004985.4:c.111+1_111+4dup" + hgvs_p = "NP_004976.2:p.(Asp38ValfsTer11)" + + var_c = self.hp.parse_hgvs_variant(hgvs_c) + var_p = self.am.c_to_p(var_c) + + self.assertEqual(str(var_p), hgvs_p) + class Test_RefReplacement(unittest.TestCase): test_cases = [ diff --git a/tests/test_hgvs_variantmapper.py b/tests/test_hgvs_variantmapper.py index df90a0a9..24cb3c47 100644 --- a/tests/test_hgvs_variantmapper.py +++ b/tests/test_hgvs_variantmapper.py @@ -128,6 +128,57 @@ def test_map_of_dup_three_prime_utr(self): var_p = self.vm.c_to_p(var_c) self.assertEqual(str(var_p), "NP_694955.2:p.?") + def test_map_of_ins_intron_exon_boundary(self): + hgvs_c = "NM_004380.2:c.3251-1_3251insA" + var_c = self.hp.parse_hgvs_variant(hgvs_c) + var_p = self.vm.c_to_p(var_c) + self.assertEqual(str(var_p), "NP_004371.2:p.(Ile1084AsnfsTer3)") + + def test_map_of_ins_exon_intron_boundary(self): + hgvs_c = "NM_004380.2:c.3250_3250+1insT" + var_c = self.hp.parse_hgvs_variant(hgvs_c) + var_p = self.vm.c_to_p(var_c) + self.assertEqual(str(var_p), "NP_004371.2:p.(Phe1085LeufsTer2)") + + def test_map_of_dup_intron_exon_boundary(self): + hgvs_c = "NM_024529.4:c.132-1_132dup" + var_c = self.hp.parse_hgvs_variant(hgvs_c) + var_p = self.vm.c_to_p(var_c) + self.assertEqual(str(var_p), "NP_078805.3:p.?") + + hgvs_c = "NM_024529.4:c.132-2_132-1dup" + var_c = self.hp.parse_hgvs_variant(hgvs_c) + with self.assertRaises(HGVSError): + var_p = self.vm.c_to_p(var_c) + + hgvs_c = "NM_004985.4:c.112-8_112-1dup" + var_c = self.hp.parse_hgvs_variant(hgvs_c) + with self.assertRaises(HGVSError): + var_p = self.vm.c_to_p(var_c) + + def test_map_of_dup_intron_exon_boundary_rc(self): + hgvs_c = "NM_004380.2:c.3251-1dup" + var_c = self.hp.parse_hgvs_variant(hgvs_c) + with self.assertRaises(HGVSError): + var_p = self.vm.c_to_p(var_c) + + def test_map_of_dup_exon_intron_boundary(self): + hgvs_c = "NM_024529.4:c.130_131+1dup" + var_c = self.hp.parse_hgvs_variant(hgvs_c) + var_p = self.vm.c_to_p(var_c) + self.assertEqual(str(var_p), "NP_078805.3:p.?") + + hgvs_c = "NM_024529.4:c.131+1_131+3dup" + var_c = self.hp.parse_hgvs_variant(hgvs_c) + with self.assertRaises(HGVSError): + var_p = self.vm.c_to_p(var_c) + + def test_map_of_dup_exon_intron_boundary_rc(self): + hgvs_c = "NM_004985.4:c.111+1_111+4dup" + var_c = self.hp.parse_hgvs_variant(hgvs_c) + with self.assertRaises(HGVSError): + var_p = self.vm.c_to_p(var_c) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_hgvs_variantmapper_cp_altseqbuilder.py b/tests/test_hgvs_variantmapper_cp_altseqbuilder.py index 21f16cb0..f3657ea4 100644 --- a/tests/test_hgvs_variantmapper_cp_altseqbuilder.py +++ b/tests/test_hgvs_variantmapper_cp_altseqbuilder.py @@ -116,6 +116,7 @@ def test_sequence_with_length_that_is_not_divisible_by_3(self): def _run_comparison(self, hgvsc, expected_sequence): ac_p = "DUMMY" var = self._parser.parse_hgvs_variant(hgvsc) + var.fill_ref(self._datasource) transcript_data = RefTranscriptData(hdp=self._datasource, tx_ac=var.ac, pro_ac=ac_p) builder = altseqbuilder.AltSeqBuilder(var, transcript_data) diff --git a/tests/test_hgvs_variantmapper_cp_sanity.py b/tests/test_hgvs_variantmapper_cp_sanity.py index 23e085db..7a5cbbfb 100644 --- a/tests/test_hgvs_variantmapper_cp_sanity.py +++ b/tests/test_hgvs_variantmapper_cp_sanity.py @@ -9,6 +9,7 @@ import hgvs.parser import hgvs.variantmapper as variantmapper +from hgvs.exceptions import HGVSError class TestHgvsCToP(unittest.TestCase): @@ -152,6 +153,28 @@ def test_intron(self): hgvsp_expected = "MOCK:p.?" self._run_conversion(hgvsc, hgvsp_expected) + def test_ins_intron_exon_boundary(self): + hgvsc = "NM_999999.1:c.9-1_9insG" + hgvsp_expected = "MOCK:p.(Lys4GlufsTer?)" + self._run_conversion(hgvsc, hgvsp_expected) + + def test_ins_exon_intron_boundary(self): + hgvsc = "NM_999999.1:c.39_39+1insC" + hgvsp_expected = "MOCK:p.(=)" + self._run_conversion(hgvsc, hgvsp_expected) + + def test_dup_intron_exon_boundary(self): + hgvsc = "NM_999999.1:c.9-1dup" + hgvsp_expected = "MOCK:p.?" + with self.assertRaises(HGVSError): + self._run_conversion(hgvsc, hgvsp_expected) + + def test_dup_exon_intron_boundary(self): + hgvsc = "NM_999999.1:c.39+1dup" + hgvsp_expected = "MOCK:p.?" + with self.assertRaises(HGVSError): + self._run_conversion(hgvsc, hgvsp_expected) + def test_five_prime_utr(self): hgvsc = "NM_999999.1:c.-2A>G" hgvsp_expected = "MOCK:p.?"