From 3e19d46031286b0fb5c7ce0ca3f1db520a048e07 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 27 Sep 2023 09:18:01 -0700 Subject: [PATCH] fix start/end positions edge cases --- tests/expected-results-edgy/orfs.json | 158 +++++++++++++------------- util/find_orf.py | 5 +- 2 files changed, 81 insertions(+), 82 deletions(-) diff --git a/tests/expected-results-edgy/orfs.json b/tests/expected-results-edgy/orfs.json index fe80c92..a68a56d 100644 --- a/tests/expected-results-edgy/orfs.json +++ b/tests/expected-results-edgy/orfs.json @@ -137,7 +137,7 @@ { "name": "gag", "start": 0, - "end": 19, + "end": 21, "subtype_start": 1, "subtype_end": 1498, "orientation": "forward", @@ -149,7 +149,7 @@ { "name": "pol", "start": 0, - "end": 19, + "end": 21, "subtype_start": 1290, "subtype_end": 4302, "orientation": "forward", @@ -161,7 +161,7 @@ { "name": "env", "start": 0, - "end": 19, + "end": 21, "subtype_start": 5430, "subtype_end": 8007, "orientation": "forward", @@ -173,7 +173,7 @@ { "name": "vif", "start": 0, - "end": 19, + "end": 21, "subtype_start": 4246, "subtype_end": 4822, "orientation": "forward", @@ -185,7 +185,7 @@ { "name": "vpr", "start": 0, - "end": 19, + "end": 21, "subtype_start": 4764, "subtype_end": 5052, "orientation": "forward", @@ -221,7 +221,7 @@ { "name": "vpu", "start": 0, - "end": 19, + "end": 21, "subtype_start": 5267, "subtype_end": 5513, "orientation": "forward", @@ -233,7 +233,7 @@ { "name": "nef", "start": 0, - "end": 19, + "end": 21, "subtype_start": 8008, "subtype_end": 8683, "orientation": "forward", @@ -244,7 +244,7 @@ }, { "name": "tat_exon2", - "start": 18, + "start": 19, "end": 19, "subtype_start": 7567, "subtype_end": 7663, @@ -252,11 +252,11 @@ "distance": 0.7834691501746216, "protein": "", "aminoacids": "", - "nucleotides": "T" + "nucleotides": "" }, { "name": "rev_exon2", - "start": 18, + "start": 19, "end": 19, "subtype_start": 7568, "subtype_end": 7865, @@ -264,14 +264,14 @@ "distance": 0.7811712165958367, "protein": "", "aminoacids": "", - "nucleotides": "T" + "nucleotides": "" } ], "singleton-sequence": [ { "name": "gag", "start": 0, - "end": 1, + "end": 3, "subtype_start": 1, "subtype_end": 1498, "orientation": "forward", @@ -283,7 +283,7 @@ { "name": "pol", "start": 0, - "end": 1, + "end": 3, "subtype_start": 1290, "subtype_end": 4302, "orientation": "forward", @@ -295,7 +295,7 @@ { "name": "env", "start": 0, - "end": 1, + "end": 3, "subtype_start": 5430, "subtype_end": 8007, "orientation": "forward", @@ -307,7 +307,7 @@ { "name": "vif", "start": 0, - "end": 1, + "end": 3, "subtype_start": 4246, "subtype_end": 4822, "orientation": "forward", @@ -319,7 +319,7 @@ { "name": "vpr", "start": 0, - "end": 1, + "end": 3, "subtype_start": 4764, "subtype_end": 5052, "orientation": "forward", @@ -331,7 +331,7 @@ { "name": "tat_exon1", "start": 0, - "end": 1, + "end": 3, "subtype_start": 5032, "subtype_end": 5248, "orientation": "forward", @@ -343,7 +343,7 @@ { "name": "rev_exon1", "start": 0, - "end": 1, + "end": 3, "subtype_start": 5171, "subtype_end": 5249, "orientation": "forward", @@ -355,7 +355,7 @@ { "name": "vpu", "start": 0, - "end": 1, + "end": 3, "subtype_start": 5267, "subtype_end": 5513, "orientation": "forward", @@ -365,8 +365,20 @@ "nucleotides": "A" }, { - "name": "tat_exon2", + "name": "nef", "start": 0, + "end": 3, + "subtype_start": 8008, + "subtype_end": 8683, + "orientation": "forward", + "distance": 0.7805933836772095, + "protein": "", + "aminoacids": "X", + "nucleotides": "A" + }, + { + "name": "tat_exon2", + "start": 1, "end": 1, "subtype_start": 7567, "subtype_end": 7663, @@ -374,11 +386,11 @@ "distance": 0.7834691501746216, "protein": "", "aminoacids": "", - "nucleotides": "A" + "nucleotides": "" }, { "name": "rev_exon2", - "start": 0, + "start": 1, "end": 1, "subtype_start": 7568, "subtype_end": 7865, @@ -386,25 +398,13 @@ "distance": 0.7811712165958367, "protein": "", "aminoacids": "", - "nucleotides": "A" - }, - { - "name": "nef", - "start": 0, - "end": 1, - "subtype_start": 8008, - "subtype_end": 8683, - "orientation": "forward", - "distance": 0.7805933836772095, - "protein": "", - "aminoacids": "X", - "nucleotides": "A" + "nucleotides": "" } ], "empty-sequence": [ { "name": "gag", - "start": -1, + "start": 0, "end": 0, "subtype_start": 1, "subtype_end": 1498, @@ -416,7 +416,7 @@ }, { "name": "pol", - "start": -1, + "start": 0, "end": 0, "subtype_start": 1290, "subtype_end": 4302, @@ -428,7 +428,7 @@ }, { "name": "env", - "start": -1, + "start": 0, "end": 0, "subtype_start": 5430, "subtype_end": 8007, @@ -440,7 +440,7 @@ }, { "name": "vif", - "start": -1, + "start": 0, "end": 0, "subtype_start": 4246, "subtype_end": 4822, @@ -452,7 +452,7 @@ }, { "name": "vpr", - "start": -1, + "start": 0, "end": 0, "subtype_start": 4764, "subtype_end": 5052, @@ -464,8 +464,8 @@ }, { "name": "tat_exon1", - "start": -1, - "end": 0, + "start": 0, + "end": 3, "subtype_start": 5032, "subtype_end": 5248, "orientation": "forward", @@ -476,8 +476,8 @@ }, { "name": "rev_exon1", - "start": -1, - "end": 0, + "start": 0, + "end": 3, "subtype_start": 5171, "subtype_end": 5249, "orientation": "forward", @@ -488,7 +488,7 @@ }, { "name": "vpu", - "start": -1, + "start": 0, "end": 0, "subtype_start": 5267, "subtype_end": 5513, @@ -500,7 +500,7 @@ }, { "name": "tat_exon2", - "start": -1, + "start": 0, "end": 0, "subtype_start": 7567, "subtype_end": 7663, @@ -512,7 +512,7 @@ }, { "name": "rev_exon2", - "start": -1, + "start": 0, "end": 0, "subtype_start": 7568, "subtype_end": 7865, @@ -524,7 +524,7 @@ }, { "name": "nef", - "start": -1, + "start": 0, "end": 0, "subtype_start": 8008, "subtype_end": 8683, @@ -538,7 +538,7 @@ "empty-sequence2": [ { "name": "gag", - "start": -1, + "start": 0, "end": 0, "subtype_start": 1, "subtype_end": 1498, @@ -550,7 +550,7 @@ }, { "name": "pol", - "start": -1, + "start": 0, "end": 0, "subtype_start": 1290, "subtype_end": 4302, @@ -562,7 +562,7 @@ }, { "name": "env", - "start": -1, + "start": 0, "end": 0, "subtype_start": 5430, "subtype_end": 8007, @@ -574,7 +574,7 @@ }, { "name": "vif", - "start": -1, + "start": 0, "end": 0, "subtype_start": 4246, "subtype_end": 4822, @@ -586,7 +586,7 @@ }, { "name": "vpr", - "start": -1, + "start": 0, "end": 0, "subtype_start": 4764, "subtype_end": 5052, @@ -598,8 +598,8 @@ }, { "name": "tat_exon1", - "start": -1, - "end": 0, + "start": 0, + "end": 3, "subtype_start": 5032, "subtype_end": 5248, "orientation": "forward", @@ -610,8 +610,8 @@ }, { "name": "rev_exon1", - "start": -1, - "end": 0, + "start": 0, + "end": 3, "subtype_start": 5171, "subtype_end": 5249, "orientation": "forward", @@ -622,7 +622,7 @@ }, { "name": "vpu", - "start": -1, + "start": 0, "end": 0, "subtype_start": 5267, "subtype_end": 5513, @@ -634,7 +634,7 @@ }, { "name": "tat_exon2", - "start": -1, + "start": 0, "end": 0, "subtype_start": 7567, "subtype_end": 7663, @@ -646,7 +646,7 @@ }, { "name": "rev_exon2", - "start": -1, + "start": 0, "end": 0, "subtype_start": 7568, "subtype_end": 7865, @@ -658,7 +658,7 @@ }, { "name": "nef", - "start": -1, + "start": 0, "end": 0, "subtype_start": 8008, "subtype_end": 8683, @@ -1001,7 +1001,7 @@ { "name": "tat_exon1", "start": 0, - "end": 12, + "end": 15, "subtype_start": 5032, "subtype_end": 5248, "orientation": "forward", @@ -1013,7 +1013,7 @@ { "name": "rev_exon1", "start": 0, - "end": 12, + "end": 15, "subtype_start": 5171, "subtype_end": 5249, "orientation": "forward", @@ -1048,7 +1048,7 @@ }, { "name": "tat_exon2", - "start": 11, + "start": 12, "end": 12, "subtype_start": 7567, "subtype_end": 7663, @@ -1056,11 +1056,11 @@ "distance": 0.7834691501746216, "protein": "", "aminoacids": "", - "nucleotides": "C" + "nucleotides": "" }, { "name": "rev_exon2", - "start": 11, + "start": 12, "end": 12, "subtype_start": 7568, "subtype_end": 7865, @@ -1068,7 +1068,7 @@ "distance": 0.7811712165958367, "protein": "", "aminoacids": "", - "nucleotides": "C" + "nucleotides": "" } ], "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455.BUT.WITH-BAD-SYMBOLS[REVERSE_COMPLEMENT]": [ @@ -1269,7 +1269,7 @@ { "name": "tat_exon1", "start": 0, - "end": 12, + "end": 15, "subtype_start": 5032, "subtype_end": 5248, "orientation": "forward", @@ -1281,7 +1281,7 @@ { "name": "rev_exon1", "start": 0, - "end": 12, + "end": 15, "subtype_start": 5171, "subtype_end": 5249, "orientation": "forward", @@ -1316,7 +1316,7 @@ }, { "name": "tat_exon2", - "start": 11, + "start": 12, "end": 12, "subtype_start": 7567, "subtype_end": 7663, @@ -1324,11 +1324,11 @@ "distance": 0.7834691501746216, "protein": "", "aminoacids": "", - "nucleotides": "C" + "nucleotides": "" }, { "name": "rev_exon2", - "start": 11, + "start": 12, "end": 12, "subtype_start": 7568, "subtype_end": 7865, @@ -1336,7 +1336,7 @@ "distance": 0.7811712165958367, "protein": "", "aminoacids": "", - "nucleotides": "C" + "nucleotides": "" } ], "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455.BUT.WITH-DASHES[REVERSE_COMPLEMENT]": [ @@ -1537,7 +1537,7 @@ { "name": "tat_exon1", "start": 0, - "end": 12, + "end": 15, "subtype_start": 5032, "subtype_end": 5248, "orientation": "forward", @@ -1549,7 +1549,7 @@ { "name": "rev_exon1", "start": 0, - "end": 12, + "end": 15, "subtype_start": 5171, "subtype_end": 5249, "orientation": "forward", @@ -1584,7 +1584,7 @@ }, { "name": "tat_exon2", - "start": 11, + "start": 12, "end": 12, "subtype_start": 7567, "subtype_end": 7663, @@ -1592,11 +1592,11 @@ "distance": 0.7834691501746216, "protein": "", "aminoacids": "", - "nucleotides": "C" + "nucleotides": "" }, { "name": "rev_exon2", - "start": 11, + "start": 12, "end": 12, "subtype_start": 7568, "subtype_end": 7865, @@ -1604,7 +1604,7 @@ "distance": 0.7811712165958367, "protein": "", "aminoacids": "", - "nucleotides": "C" + "nucleotides": "" } ] } \ No newline at end of file diff --git a/util/find_orf.py b/util/find_orf.py index d81a864..fbfb8f5 100644 --- a/util/find_orf.py +++ b/util/find_orf.py @@ -36,7 +36,6 @@ def find_candidate_positions(aligned_sequence, e): expected_protein = expected_aminoacids.strip("*") q_start_a = q_start // 3 q_end_a = q_end // 3 - n = len(aligned_sequence.this.seq) - 1 visited_set = set() query_aminoacids_table = get_query_aminoacids_table(aligned_sequence.this) @@ -52,8 +51,8 @@ def find_candidate_positions(aligned_sequence, e): else: visited_set.add(got_aminoacids) - closest_start = min(n, (closest_start_a * 3) + frame) - closest_end = min(n + 1, (closest_end_a * 3) + 3 + frame) + closest_start = (closest_start_a * 3) + frame + closest_end = (closest_end_a * 3) + 3 + frame got_protein = get_biggest_protein(has_start_codon(e), got_aminoacids) dist = detailed_aligner.align(got_protein, expected_protein).distance() orf = OriginalORF(