-
Notifications
You must be signed in to change notification settings - Fork 0
/
helper_functions_CDS_present.py
173 lines (155 loc) · 7.86 KB
/
helper_functions_CDS_present.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
from typing import List
from operator import itemgetter
def check_stop_codon(transcript_as_list: List[List[str]], strand: str):
stop_codon = [
sub_list for sub_list in transcript_as_list if 'stop_codon' in sub_list]
if len(stop_codon) > 0:
if strand == '+':
stop_codon = int(stop_codon[0][0])
else:
stop_codon = int(stop_codon[0][1])
return stop_codon
else:
return None
def get_cds_end(cds: List[List[str]], strand: str) -> int:
stop_position_plus = 0
stop_position_minus = float('inf')
cds_length = 0
for partial_cds in cds:
if strand == '+':
three_prime = int(partial_cds[1])
five_prime = int(partial_cds[0])
if stop_position_plus < three_prime:
stop_position_plus = three_prime
cds_end = three_prime
#the cds length always need to be added, even if the stop position is not
#updated, because the entries are prefiltered for CDS features
cds_length += three_prime - five_prime + 1
else:
three_prime = int(partial_cds[0])
five_prime = int(partial_cds[1])
if stop_position_minus > three_prime:
stop_position_minus = three_prime
cds_end = three_prime
cds_length += five_prime - three_prime + 1
return cds_end, cds_length
def find_termination_codon(transcript_as_list: List[str], cds: List[List[str]]) -> int:
# get strand information and stop codon if annotated
strand = transcript_as_list[0][4]
# if stop codon annotated: return its first base position
stop_position = check_stop_codon(transcript_as_list, strand)
if stop_position is not None:
cds_end, cds_length = get_cds_end(cds, strand)
return stop_position, cds_length
# if stop codon not annotated, compose CDS and extract end poisition of CDS
else:
cds_end, cds_length = get_cds_end(cds, strand)
# get exons of transcript
exons = [sub_list for sub_list in transcript_as_list if 'exon' in sub_list]
# sort list according to start positions of exons
exons.sort(key=lambda elem: int(elem[0]))
#exons = sorted(exons, key=itemgetter(int(0)))
#not working!!!
if strand == '+':
# check whether the end of the CDS coincides with the end of an exon
exon_end_is_cds_end = [
exon for exon in exons if str(cds_end) == exon[1]]
# if not assign baseposition on genome after cds (CDs_end+1) as stop
if len(exon_end_is_cds_end) == 0:
stop_position = cds_end+1 # stop position first base will start right after the CDS
# if CDS end coincides with exon end
else:
# extract exon number of coinciding exon
exon_num = exons.index(exon_end_is_cds_end[0])
# annotate the start of the next exon as stop position
try:
next_exon = exons[exon_num+1]
# if cds ends with exon and there is a next exon, set stop position to first bp of next exon
stop_position = int(next_exon[0])
except IndexError:
# stop_position = cds_end-2
stop_position = cds_end
else:
exon_end_is_cds_end = [
exon for exon in exons if str(cds_end) == exon[0]]
if len(exon_end_is_cds_end) == 0:
stop_position = cds_end-1
else:
# extract exon number of coinciding exon
exon_num = exons.index(exon_end_is_cds_end[0])
# annotate the start of the next exon as stop position
try:
if exon_num != 0 and exon_num != len(exons)-1:
next_exon = exons[exon_num-1]
# if cds ends with exon and there is a next exon, set stop position to first bp of next exon
stop_position = int(next_exon[1])
else:
stop_position = cds_end
except IndexError:
# stop_position = cds_end+2
stop_position = cds_end
return stop_position, cds_length
def compose_transcript(transcript_as_list, stop_position):
# extract all exons of the transcript
exons = [sub_list for sub_list in transcript_as_list if 'exon' in sub_list]
strand = exons[0][4]
# dict to store transcript coordinates for exon junctions and stop_position
transcript_coordinates = {'length': 0, 'exon_jc': {}, 'stop_position': 0}
last_exon = 0
if strand == '+':
# sort list according to start positions of exons
exons.sort(key=lambda elem: int(elem[0]))
else:
exons.sort(key=lambda elem: int(elem[0]), reverse=True)
exon_index = 0
for exon in exons:
exon_num = exon[2]
if strand == '+':
five_prime = int(exon[0])
three_prime = int(exon[1])
length = three_prime - five_prime + 1
# if the stop position is inside this exon: calculate the stop position in transcript coordinates,
# length plus distance from stop to 5' end of exon
if stop_position >= five_prime and stop_position <= three_prime:
#print('curr_length', transcript_coordinates['length'])
#print('stop_pos', stop_position)
#print('five_prime', five_prime)
transcript_coordinates['stop_position'] = transcript_coordinates['length'] + \
stop_position - five_prime + 1
exon_containing_stop = exon_index
dist_stop_next_EJC = three_prime - stop_position
# add length of exon to total length of the transcript
transcript_coordinates['length'] += length
# note down the length of the transcript including current exon = EJC position
transcript_coordinates['exon_jc'][exon_num] = length
if last_exon < int(exon_num):
last_exon = int(exon_num)
else:
five_prime = int(exon[1])
three_prime = int(exon[0])
# for the negative strand the ordering is adjusted such that the first exon is the one with
# the largest genoic +strand cooridnates
# the five prime end is bigger than the 3 prime end (as this is in respect to the negative strand)
# plus 1 because the las coordinate is also part of the exon
length = five_prime - three_prime + 1
if stop_position <= five_prime and stop_position >= three_prime:
# as cooridnates for stop and five_prime are on +strand but we are calculating on the -strand
# distance from current (minus strand) five prime to PTC, is equivaluent to 5
# -stopp_position in +strand coordinates
#print('curr_length', transcript_coordinates['length'])
#print('stop_pos', stop_position)
#print('five_prime', five_prime)
transcript_coordinates['stop_position'] = transcript_coordinates['length'] - \
stop_position + five_prime + 1
exon_containing_stop = exon_index
dist_stop_next_EJC = five_prime - stop_position
transcript_coordinates['length'] += length
transcript_coordinates['exon_jc'][exon_num] = length
if last_exon < int(exon_num):
last_exon = int(exon_num)
exon_index += 1
trans_coord_last_ejc = transcript_coordinates['length'] - \
transcript_coordinates['exon_jc'][str(last_exon)]
exon_containing_stop_length = int(
exons[exon_containing_stop][1]) - int(exons[exon_containing_stop][0]) + 1
return transcript_coordinates, trans_coord_last_ejc, exon_containing_stop_length, dist_stop_next_EJC