Skip to content
This repository has been archived by the owner on Mar 19, 2024. It is now read-only.

Commit

Permalink
use precise distance metric when searching for ORFs
Browse files (browse the repository at this point in the history)
  • Loading branch information
Donaim committed Sep 21, 2023
1 parent 6138ee8 commit 5fb3c0b
Show file tree
Hide file tree
Showing 13 changed files with 930 additions and 929 deletions.
13 changes: 8 additions & 5 deletions intact/intact.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import Bio
from Bio import AlignIO, Seq, SeqIO, SeqRecord
from scipy.stats import fisher_exact
from jarowinkler import jaro_similarity

import util.constants as const
import util.subtypes as st
Expand Down Expand Up @@ -550,13 +549,15 @@ def find_closest(aminoacids, start, direction, target):
def find_candidate_positions(e, q_start, q_end):
expected_nucleotides = e.nucleotides
expected_aminoacids = e.aminoacids
expected_protein = expected_aminoacids.strip("*")
q_start = coordinates_mapping[e.start]
q_end = coordinates_mapping[e.end - 1 if e.end == len(coordinates_mapping) else e.end]
got_nucleotides = sequence.seq[q_start:q_end]
got_aminoacids = translate(got_nucleotides)
q_start_a = q_start // 3
q_end_a = q_end // 3
n = len(sequence.seq) - 1
visited_set = set()

for frame in range(3):
aminoacids = query_aminoacids_table[frame]
Expand All @@ -565,10 +566,15 @@ def find_candidate_positions(e, q_start, q_end):
closest_start_a = q_start_a if not has_start_codon(e) else find_closest(aminoacids, q_start_a, start_direction, 'M')
closest_end_a = q_end_a if not has_stop_codon(e) else find_closest(aminoacids, q_end_a, end_direction, '*')
got_aminoacids = aminoacids[closest_start_a:closest_end_a + 1]
dist = 1 - jaro_similarity(got_aminoacids, expected_aminoacids)
if got_aminoacids in visited_set:
continue
else:
visited_set.add(got_aminoacids)

closest_start = min(n, (closest_start_a * 3) + frame)
closest_end = min(n + 1, (closest_end_a * 3) + 3 + frame)
got_protein = get_biggest_protein(has_start_codon(e), got_aminoacids)
dist = detailed_aligner.align(got_protein, expected_protein).distance()
yield CandidateORF(e.name, closest_start, closest_end, e.start, e.end,
"forward", dist, got_protein, got_aminoacids)

Expand Down Expand Up @@ -610,9 +616,6 @@ def get_indel_impact(alignment):
deletions = max(0, len(exp_protein) - len(got_protein)) * 3
insertions = max(0, len(got_protein) - len(exp_protein)) * 3

orf_alignment = detailed_aligner.align(exp_protein, got_protein)
best_match.distance = detailed_aligner.measure_distance(orf_alignment)

# Max deletion allowed in ORF exceeded
if deletions > e.deletion_tolerence:

Expand Down
2 changes: 0 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,13 @@ def run(self):
'biopython>=1.71',
'click>=6.7',
'scipy>=1.6.0',
'jarowinkler>=1.1.0',
'numpy>1.19.5'
],
setup_requires=[
'appdirs>=1.4.3',
'biopython>=1.71',
'click>=6.7',
'scipy>=1.6.0',
'jarowinkler>=1.1.0',
'numpy>=1.19.5'
]
)
Expand Down
42 changes: 21 additions & 21 deletions tests/expected-results-large-csv/errors.csv
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
sequence_name,error,message
KX505501.1,DeletionInOrf,"ORF pol at 2084-5096 can have maximum deletions 30, got 2892"
KX505501.1,DeletionInOrf,"ORF pol at 2084-5096 can have maximum deletions 30, got 2721"
KX505501.1,InternalStopInOrf,ORF env at 6224-8795 contains an internal stop codon at 6323
KX505501.1,InternalStopInOrf,Smaller ORF vif at 5040-5619 contains an internal stop codon at 5076
KX505501.1,DeletionInOrf,"Smaller ORF vpr at 5558-5850 can have maximum deletions 30, got 231"
KX505501.1,DeletionInOrf,"Smaller ORF tat_exon1 at 5830-6045 can have maximum deletions 30, got 213"
KX505501.1,DeletionInOrf,"Smaller ORF rev_exon1 at 5969-6045 can have maximum deletions 30, got 75"
KX505501.1,InsertionInOrf,"Smaller ORF vif at 5040-5619 can have maximum insertions 90, got 909"
KX505501.1,DeletionInOrf,"Smaller ORF vpr at 5558-5850 can have maximum deletions 30, got 84"
KX505501.1,InternalStopInOrf,Smaller ORF tat_exon1 at 5830-6045 contains an internal stop codon at 5893
KX505501.1,InternalStopInOrf,Smaller ORF rev_exon1 at 5969-6045 contains an internal stop codon at 6005
KX505501.1,DeletionInOrf,"Smaller ORF vpu at 6061-6310 can have maximum deletions 30, got 69"
KX505501.1,FrameshiftInOrf,Smaller ORF tat_exon2 at 8376-8469 contains out of frame indels that impact 40 positions.
KX505501.1,DeletionInOrf,"Smaller ORF rev_exon2 at 8377-8653 can have maximum deletions 30, got 96"
KX505501.1,InternalStopInOrf,Smaller ORF nef at 8796-9417 contains an internal stop codon at 8832
KX505501.1,InsertionInOrf,"Smaller ORF nef at 8796-9417 can have maximum insertions 90, got 1116"
KX505501.1,RevResponseElementDeletion,Query Sequence exceeds maximum deletion tolerance in RRE. Contains 265 deletions with max tolerance of 20 deletions.
KX505501.1,LongDeletion,Query sequence contains a long deletion.
KX505501.1,Scramble,Sequence is plus-scrambled.
Expand All @@ -22,8 +22,8 @@ MN692074,InsertionInOrf,"Smaller ORF vpr at 5558-5850 can have maximum insertion
MN692074,InternalStopInOrf,Smaller ORF tat_exon1 at 5830-6045 contains an internal stop codon at 5893
MN692074,DeletionInOrf,"Smaller ORF vpu at 6061-6310 can have maximum deletions 30, got 165"
MN692074,FrameshiftInOrf,Smaller ORF tat_exon2 at 8376-8469 contains out of frame indels that impact 76 positions.
MN692074,DeletionInOrf,"Smaller ORF rev_exon2 at 8377-8653 can have maximum deletions 30, got 192"
MN692074,InsertionInOrf,"Smaller ORF nef at 8796-9417 can have maximum insertions 90, got 126"
MN692074,DeletionInOrf,"Smaller ORF rev_exon2 at 8377-8653 can have maximum deletions 30, got 204"
MN692074,InsertionInOrf,"Smaller ORF nef at 8796-9417 can have maximum insertions 90, got 1131"
MN692074,RevResponseElementDeletion,Query Sequence exceeds maximum deletion tolerance in RRE. Contains 265 deletions with max tolerance of 20 deletions.
MN692074,LongDeletion,Query sequence contains a long deletion.
MN692145,FrameshiftInOrf,Smaller ORF vpr at 5558-5850 contains out of frame indels that impact 79 positions.
Expand All @@ -35,7 +35,7 @@ MN090335,PackagingSignalDeletion,Query Sequence exceeds maximum deletion toleran
MN090335,MajorSpliceDonorSiteMutated,"Query sequence has a mutated splice donor site, AT."
MN090335,Scramble,Sequence is minus-scrambled.
MN090335,InternalInversion,Sequence contains an internal inversion.
MN090376,InternalStopInOrf,ORF gag at 789-2292 contains an internal stop codon at 933
MN090376,InternalStopInOrf,ORF gag at 789-2292 contains an internal stop codon at 822
MN090376,FrameshiftInOrf,Smaller ORF vpr at 5558-5850 contains out of frame indels that impact 79 positions.
MN090376,InsertionInOrf,"Smaller ORF nef at 8796-9417 can have maximum insertions 90, got 252"
MN090376,PackagingSignalDeletion,Query Sequence exceeds maximum deletion tolerance in PSI. Contains 95 deletions with max tolerance of 10 deletions.
Expand Down Expand Up @@ -78,34 +78,34 @@ MK114705.1,FrameshiftInOrf,Smaller ORF nef at 8796-9417 contains out of frame in
MK114856.1,InternalStopInOrf,ORF gag at 789-2292 contains an internal stop codon at 873
MK114856.1,InternalStopInOrf,ORF pol at 2084-5096 contains an internal stop codon at 2213
MK114856.1,InternalStopInOrf,ORF env at 6224-8795 contains an internal stop codon at 6326
MK114856.1,InternalStopInOrf,Smaller ORF vif at 5040-5619 contains an internal stop codon at 5130
MK114856.1,InternalStopInOrf,Smaller ORF vif at 5040-5619 contains an internal stop codon at 5172
MK114856.1,InternalStopInOrf,Smaller ORF vpr at 5558-5850 contains an internal stop codon at 5594
MK114856.1,InternalStopInOrf,Smaller ORF tat_exon1 at 5830-6045 contains an internal stop codon at 5860
MK114856.1,InternalStopInOrf,Smaller ORF tat_exon1 at 5830-6045 contains an internal stop codon at 5920
MK114856.1,InternalStopInOrf,Smaller ORF vpu at 6061-6310 contains an internal stop codon at 6127
MK114856.1,InternalStopInOrf,Smaller ORF rev_exon2 at 8377-8653 contains an internal stop codon at 8434
MK114856.1,InternalStopInOrf,Smaller ORF nef at 8796-9417 contains an internal stop codon at 8970
MK114856.1,InternalStopInOrf,Smaller ORF nef at 8796-9417 contains an internal stop codon at 8853
MK114856.1,APOBECHypermutationDetected,Query sequence shows evidence of APOBEC3F/G-mediated hypermutation (p = 4.399685326687554e-65).
MK115009.1,InternalStopInOrf,ORF gag at 789-2292 contains an internal stop codon at 834
MK115009.1,InternalStopInOrf,ORF pol at 2084-5096 contains an internal stop codon at 2183
MK115009.1,InternalStopInOrf,ORF env at 6224-8795 contains an internal stop codon at 6350
MK115009.1,InternalStopInOrf,Smaller ORF vif at 5040-5619 contains an internal stop codon at 5079
MK115009.1,InternalStopInOrf,Smaller ORF vif at 5040-5619 contains an internal stop codon at 5373
MK115009.1,InternalStopInOrf,Smaller ORF vpr at 5558-5850 contains an internal stop codon at 5717
MK115009.1,DeletionInOrf,"Smaller ORF tat_exon1 at 5830-6045 can have maximum deletions 30, got 54"
MK115009.1,InternalStopInOrf,Smaller ORF tat_exon1 at 5830-6045 contains an internal stop codon at 5860
MK115009.1,InternalStopInOrf,Smaller ORF rev_exon2 at 8377-8653 contains an internal stop codon at 8434
MK115009.1,InternalStopInOrf,Smaller ORF nef at 8796-9417 contains an internal stop codon at 8982
MK115009.1,InternalStopInOrf,Smaller ORF nef at 8796-9417 contains an internal stop codon at 8874
MK115009.1,APOBECHypermutationDetected,Query sequence shows evidence of APOBEC3F/G-mediated hypermutation (p = 1.3731449337509935e-41).
MK115009.1,Scramble,Sequence is minus-scrambled.
MK115009.1,InternalInversion,Sequence contains an internal inversion.
MK115387.1,FrameshiftInOrf,Smaller ORF vpr at 5558-5850 contains out of frame indels that impact 79 positions.
MK115387.1,InsertionInOrf,"Smaller ORF nef at 8796-9417 can have maximum insertions 90, got 279"
MK115491.1,FrameshiftInOrf,Smaller ORF vpr at 5558-5850 contains out of frame indels that impact 80 positions.
MK115491.1,InsertionInOrf,"Smaller ORF nef at 8796-9417 can have maximum insertions 90, got 243"
MK116110.1,InternalStopInOrf,ORF gag at 140-1643 contains an internal stop codon at 194
MK116110.1,InternalStopInOrf,ORF gag at 140-1643 contains an internal stop codon at 185
MK116110.1,PackagingSignalDeletion,Query Sequence exceeds maximum deletion tolerance in PSI. Contains 22 deletions with max tolerance of 10 deletions.
MK116110.1,MajorSpliceDonorSiteMutated,"Query sequence has a mutated splice donor site, CC."
MK115527.1,FrameshiftInOrf,Smaller ORF vpr at 5558-5850 contains out of frame indels that impact 80 positions.
MK115527.1,InsertionInOrf,"Smaller ORF nef at 8796-9417 can have maximum insertions 90, got 243"
MK114997.1,InternalStopInOrf,ORF env at 6224-8795 contains an internal stop codon at 6356
MK114997.1,InternalStopInOrf,ORF env at 6224-8795 contains an internal stop codon at 6512
MK114997.1,FrameshiftInOrf,Smaller ORF vpr at 5558-5850 contains out of frame indels that impact 79 positions.
MK114997.1,InsertionInOrf,"Smaller ORF nef at 8796-9417 can have maximum insertions 90, got 270"
MK115518.1,FrameshiftInOrf,Smaller ORF vpr at 5558-5850 contains out of frame indels that impact 80 positions.
Expand All @@ -121,14 +121,14 @@ MK115464.1,InternalStopInOrf,ORF pol at 2084-5096 contains an internal stop codo
MK115464.1,InternalStopInOrf,ORF env at 6224-8795 contains an internal stop codon at 6425
MK115464.1,InternalStopInOrf,Smaller ORF vif at 5040-5619 contains an internal stop codon at 5247
MK115464.1,FrameshiftInOrf,Smaller ORF vpr at 5558-5850 contains out of frame indels that impact 79 positions.
MK115464.1,InternalStopInOrf,Smaller ORF tat_exon1 at 5830-6045 contains an internal stop codon at 5860
MK115464.1,FrameshiftInOrf,Smaller ORF tat_exon1 at 5830-6045 contains out of frame indels that impact 104 positions.
MK115464.1,InternalStopInOrf,Smaller ORF vpu at 6061-6310 contains an internal stop codon at 6127
MK115464.1,InternalStopInOrf,Smaller ORF rev_exon2 at 8377-8653 contains an internal stop codon at 8434
MK115464.1,FrameshiftInOrf,Smaller ORF nef at 8796-9417 contains out of frame indels that impact 149 positions.
MK115464.1,FrameshiftInOrf,Smaller ORF nef at 8796-9417 contains out of frame indels that impact 393 positions.
MK115464.1,APOBECHypermutationDetected,Query sequence shows evidence of APOBEC3F/G-mediated hypermutation (p = 5.391006513622446e-23).
MK115530.1,FrameshiftInOrf,Smaller ORF vpr at 5558-5850 contains out of frame indels that impact 80 positions.
MK115530.1,InsertionInOrf,"Smaller ORF nef at 8796-9417 can have maximum insertions 90, got 243"
MK115520.1,InternalStopInOrf,ORF pol at 2084-5096 contains an internal stop codon at 3284
MK115520.1,InternalStopInOrf,ORF pol at 2084-5096 contains an internal stop codon at 2198
MK115520.1,FrameshiftInOrf,Smaller ORF vpr at 5558-5850 contains out of frame indels that impact 80 positions.
MK115520.1,InsertionInOrf,"Smaller ORF nef at 8796-9417 can have maximum insertions 90, got 243"
MK115520.1,MajorSpliceDonorSiteMutated,"Query sequence has a mutated splice donor site, GA."
Expand All @@ -152,7 +152,7 @@ MK115095.1,InternalStopInOrf,ORF pol at 2084-5096 contains an internal stop codo
MK115095.1,InternalStopInOrf,ORF env at 6224-8795 contains an internal stop codon at 6551
MK115095.1,InternalStopInOrf,Smaller ORF vif at 5040-5619 contains an internal stop codon at 5151
MK115095.1,FrameshiftInOrf,Smaller ORF vpr at 5558-5850 contains out of frame indels that impact 79 positions.
MK115095.1,DeletionInOrf,"Smaller ORF tat_exon1 at 5830-6045 can have maximum deletions 30, got 54"
MK115095.1,InternalStopInOrf,Smaller ORF tat_exon1 at 5830-6045 contains an internal stop codon at 5860
MK115095.1,InternalStopInOrf,Smaller ORF vpu at 6061-6310 contains an internal stop codon at 6127
MK115095.1,InternalStopInOrf,Smaller ORF rev_exon2 at 8377-8653 contains an internal stop codon at 8434
MK115095.1,InsertionInOrf,"Smaller ORF nef at 8796-9417 can have maximum insertions 90, got 213"
Expand Down
Loading

0 comments on commit 5fb3c0b

Please sign in to comment.