# filesplitAllTranscriptsMANE.py
from Bio import SeqIO
import numpy as np
import matplotlib.pyplot as plt
'''
filter_unavailable - removes entries whose sequence is unavailable (a length check,
len(seq) < 15, can be added to also drop 3' UTR sequences shorter than 15 nt).
del_items collects the keys of the sequences that will be deleted.
'''
def filter_unavailable(dict_seq):
    del_items = [] #keys of sequences that will be deleted
    for gene in dict_seq:
        seq = str(dict_seq[gene].seq)
        if seq == "Sequenceunavailable": #BioMart writes "Sequence unavailable"; SeqIO drops the space when joining lines. Add "or len(seq) < 15" to also filter by length
            del_items.append(gene)
    filtered_dict = dict_seq.copy() #copy so the caller's dictionary is not mutated
    for key in del_items:
        del filtered_dict[key]
    return filtered_dict
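#Sketch of the behavior (hypothetical records; Seq and SeqRecord from Bio.Seq / Bio.SeqRecord):
#demo = {"GENE1|ENST1": SeqRecord(Seq("ACGTACGT")), "GENE2|ENST2": SeqRecord(Seq("Sequenceunavailable"))}
#filter_unavailable(demo) keeps only "GENE1|ENST1".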
#seqdict_to_strdict - convert a dictionary whose values are in Biopython's SeqRecord format to one with plain string values.
#Input: dictionary with SeqRecord values. Output: dictionary with string values.
def seqdict_to_strdict(dict1):
    strdict = {}
    for gene in dict1:
        sequence = dict1[gene].seq
        string_seq = str(sequence)
        strdict[gene] = string_seq
    return strdict
#find_genes - find the gene IDs that aren't in the FASTA dictionary.
#Input: CSV with the names of all genes we want data for, and a dictionary whose keys are FASTA headers
#in which the field before the first | is the gene name.
#Plan: convert both to lists and compare. Data for the last 19 genes, for which Ensembl has no MANE
#3' UTR information, will be retrieved manually.
#Output: writes the gene names that still need 3' UTR info to "Gene Names to add.txt".
def find_genes(dict1):
    with open("CSV of gene names of all 660 haploinsufficient genes.csv", "r") as filevar:
        first_line_with_names = filevar.readline()
    gene_list = first_line_with_names.strip().split(",") #strip the trailing newline so the last name compares correctly
    gene_list_dict = []
    for header in dict1:
        gene_name = header.split("|")[0]
        gene_list_dict.append(gene_name)
    genes_manually_add = []
    for gene in gene_list:
        if gene not in gene_list_dict:
            genes_manually_add.append(gene)
    with open("Gene Names to add.txt", "w") as filevar2:
        for gene in genes_manually_add:
            filevar2.write(gene)
            filevar2.write("\n")
#plot_hist - plots a histogram of the sequence lengths of all transcripts (assuming the dictionary values are sequences in string format)
def plot_hist(dict1):
    dict_hist = {}
    for transcript, utr in dict1.items():
        num_seq = len(utr)
        dict_hist[transcript] = num_seq
    values = list(dict_hist.values())
    min_utr_len = min(values)
    max_utr_len = max(values)
    #log-scale binning: https://stackoverflow.com/questions/6855710/how-to-have-logarithmic-bins-in-a-python-histogram ; 30 bins chosen arbitrarily
    #np.linspace returns evenly spaced numbers over a specified interval
    plt.hist(values, bins = 10 ** np.linspace(np.log10(min_utr_len), np.log10(max_utr_len), 30), histtype = "bar")
    plt.xscale('log')
    plt.xlabel("Sequence Length")
    plt.ylabel("Frequency")
    plt.show()
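#Quick sketch of the binning (hypothetical lengths): with min 10 nt and max 1000 nt,
#10 ** np.linspace(np.log10(10), np.log10(1000), 3) gives array([10., 100., 1000.]),
#i.e. bin edges evenly spaced in log space.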
#dict_with_270nt_seq - return a dictionary with keys in the format:
#Gene name | transcript id | 270 nt fragment number for this transcript id
#and the fragment sequence as the value. This is used to check for, and remove, duplicates in the
#next step, and afterwards to write the sequences to a file in FASTA format.
#Also returns a second dictionary with the number of 270 nt fragments needed to cover the 3' region
#of each gene, useful for summary statistics.
def dict_with_270nt_seq(dict1):
    dict_270nt_seq = {}
    dict_number_seq = {}
    for gene in dict1:
        gene_header_list = gene.split("|")
        gene_name = gene_header_list[0]
        gene_transcript_id = gene_header_list[1]
        string_seq = dict1[gene]
        #now handle this just like a string, and write to a file just like a string.
        num_lines = 1
        count = 0
        len_seq = len(string_seq)
        while count < len_seq:
            key = gene_name + "|" + gene_transcript_id + "|" + str(num_lines)
            value = string_seq[count:count+270]
            #for sequences > 270 nt: the last fragment is the final 270 nt of the sequence
            if count + 270 > len_seq and len_seq > 270:
                value = string_seq[len_seq-270:]
            dict_270nt_seq[key] = value
            #the 50 nt overlap comes from advancing 220 nt per iteration while writing 270 nt windows
            count = count + 220
            if count >= len_seq: #>= so a length that is an exact multiple of 220 is still recorded
                dict_number_seq[gene] = num_lines #summary statistics
            num_lines = num_lines + 1
    return dict_270nt_seq, dict_number_seq
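#Worked example (hypothetical 600 nt UTR): windows start at 0, 220, 440, giving fragments
#seq[0:270] and seq[220:490]; the final window would run past the end, so it becomes the
#last 270 nt, seq[330:600]. Adjacent windows overlap by 50 nt (the final fragment can overlap more).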
#write sequences before removal to file - debugging function
def write_seqs_before_dup_removal(dict1):
    with open("seqs_before_duplication_removal.fasta", "w") as filevar:
        for key in dict1:
            filevar.write(">" + key + "\n" + dict1[key] + "\n")
#write dict_number_seqs to a new file called seqs_per_transcript.txt. The analysis could be done in Python, but it's easier in Excel: just import this text file as a CSV to open it.
def write_seqs_per_transcript(dict1):
    with open("seqs_per_transcript.txt", "w") as filevar:
        filevar.write("Gene name | transcript id | # of sequences\n")
        for gene in dict1:
            filevar.write(gene + "|" + str(dict1[gene]) + "\n")
#remove_duplicates - given a dictionary that can have keys with duplicate values, return a dictionary
#with only one key per value; the other keys with duplicate values are removed. The removed keys are
#written to the file named by filename.
#Also returns a second dictionary for quality control testing: gene_name|transcript_id as key and the
#number of fragments kept for that transcript as value, to compare against the counts from before removal.
def remove_duplicates(dict1, filename):
    dict_270_nt_no_dupl = {}
    dict_qc_test = {}
    count_deleted_duplicates = 0
    count_original = 0
    with open(filename, "w") as filevar:
        filevar.write("Deleted Sequences\nGene Name | Transcript ID | Transcript #\n")
        non_duplicate_values = set() #sets make the membership tests below O(1)
        keys_not_deleted = set()
        for header, utr in dict1.items():
            if utr not in non_duplicate_values:
                non_duplicate_values.add(utr)
                keys_not_deleted.add(header)
                #qc part: rebuild gene_name|transcript_id, the same key format as the pre-removal dictionary
                split_list = header.split("|")
                gene_name = split_list[0]
                transcript_id = split_list[1]
                qc_key = gene_name + "|" + transcript_id
                if qc_key not in dict_qc_test:
                    dict_qc_test[qc_key] = 0
                dict_qc_test[qc_key] = dict_qc_test[qc_key] + 1
        for header in dict1:
            if header not in keys_not_deleted:
                filevar.write("\n" + header)
                count_deleted_duplicates = count_deleted_duplicates + 1
            else:
                dict_270_nt_no_dupl[header] = dict1[header]
                count_original = count_original + 1
    print("Number of duplicate sequences: " + str(count_deleted_duplicates))
    print("Number of non-duplicates: " + str(count_original))
    return dict_270_nt_no_dupl, dict_qc_test
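#Sketch of the QC dictionary (hypothetical headers): if fragments "GENE1|ENST1|1" and "GENE1|ENST1|2"
#are both kept, dict_qc_test["GENE1|ENST1"] == 2; a transcript whose fragments were all duplicates
#simply never appears as a key.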
#alternative way to remove duplicates: remove_duplicates2
def remove_duplicates2(dict1, filename):
    dict_270_nt_no_dupl = {} #note the inverted mapping: sequence -> key
    with open(filename, "w") as filevar:
        filevar.write("Deleted Sequences\nGene Name | Transcript ID | Fragment # | Replaced by Fragment\n")
        for key in dict1:
            if dict1[key] in dict_270_nt_no_dupl:
                #log the deleted fragment, the kept fragment that replaces it, and the sequence itself
                filevar.write(key + " | " + dict_270_nt_no_dupl[dict1[key]] + "\n" + dict1[key] + "\n")
            else:
                dict_270_nt_no_dupl[dict1[key]] = key #keep the first fragment seen for each sequence, matching remove_duplicates
    print("Number of non-duplicates remove_duplicates2: " + str(len(dict_270_nt_no_dupl)))
    return dict_270_nt_no_dupl
'''
make_fragment_deleted_table - write a file showing the number of fragments per transcript before
duplicate removal and after (inputs: the fragment counts from the original dictionary, and the counts
after removal). Output: a pipe-delimited file. Both dictionaries use the same keys when both contain the item.
Format:
Gene Name | Transcript ID | Number of fragments before | Number of fragments after
'''
def make_fragment_deleted_table(original_dict_nums, dict_after_removal_nums):
    #print("dict_after_removal_nums keys", list(dict_after_removal_nums.keys()))
    with open("deleted_transcripts_data.tsv", "w") as filevar:
        for key in original_dict_nums:
            filevar.write(key + "|" + str(original_dict_nums[key]) + "|")
            if key in dict_after_removal_nums:
                filevar.write(str(dict_after_removal_nums[key]))
            else:
                filevar.write("0")
            filevar.write("\n")
#filter 1: if one fragment is a substring of another, remove the smaller one.
def filter_subsequence(dict1):
    values_list = list(dict1.values())
    new_dict = {}
    for key in dict1:
        add_or_not = True
        for value in values_list:
            if dict1[key] != value and dict1[key] in value:
                add_or_not = False
        if add_or_not:
            new_dict[key] = dict1[key]
    print("Length of new dictionary: " + str(len(new_dict)))
    return new_dict
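#Sketch (hypothetical fragments): given {"A|T1|1": "ACGTACGT", "B|T2|1": "GTAC"}, the fragment
#"GTAC" is a substring of "ACGTACGT", so only "A|T1|1" survives the filter.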
#count_15 - count the number of fragments at or below particular sizes (currently 200 nt and 10 nt)
def count_15(dict1):
    count200 = 0
    count10 = 0
    for x in dict1:
        if len(dict1[x]) <= 200:
            count200 = count200 + 1
        if len(dict1[x]) <= 10:
            count10 = count10 + 1
    print("Number <= 200: ", count200, "\n")
    print("Number <= 10: ", count10, "\n")
#write_270_nt - write the fragments to a file in FASTA format:
#header: >Gene name | transcript id | 270 nt fragment number for this id
#sequence on the next line
def write_270_nt(dict1, filename):
    with open(filename, "w") as filevar:
        #filevar.write("Gene name | transcript id | 270 nt sequence number for this id | sequence\n")
        for gene in dict1:
            filevar.write(">" + gene + "\n" + dict1[gene] + "\n")
#add_GFP_to_270 - for 3' UTR sequences initially shorter than 270 nt, append GFP sequence to the end
#until they reach 270 nt. Return a new dictionary with all sequences already at least 270 nt unchanged,
#and the shorter ones padded to exactly 270 nt with the start of GFP.
def add_GFP_to_270(dict1):
    gfp = "ATGGTGAGCAAGGGCGCCGAGCTGTTCACCGGCATCGTGCCCATCCTGATCGAGCTGAATGGCGATGTGAATGGCCACAAGTTCAGCGTGAGCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCTGTGCCCTGGCCCACCCTGGTGACCACCCTGAGCTACGGCGTGCAGTGCTTCTCACGCTACCCCGATCACATGAAGCAGCACGACTTCTTCAAGAGCGCCATGCCT"
    new_dict = {}
    for seq in dict1:
        if len(dict1[seq]) > 270:
            new_dict[seq] = dict1[seq]
        else:
            len_utr = len(dict1[seq])
            new_seq = dict1[seq] + gfp[:270-len_utr]
            new_dict[seq] = new_seq
    return new_dict
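#Sketch (hypothetical 250 nt fragment): a 250 nt sequence gets gfp[:20] appended
#("ATGGTGAGCAAGGGCGCCGA", the first 20 nt of GFP), giving exactly 270 nt; a sequence
#already 270 nt gets gfp[:0] == "" and passes through unchanged.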
#refseq_to_ensembl - returns a dictionary mapping MANE RefSeq IDs to MANE Ensembl IDs
def refseq_to_ensembl():
    dict1 = {}
    with open("EnsembltoRefSeq.txt", "r") as filevar:
        for lin in filevar:
            lin_split = lin.strip().split("\t")
            dict1[lin_split[1]] = lin_split[0]
    return dict1
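#Assumed input format (hypothetical IDs): each line of EnsembltoRefSeq.txt is the Ensembl ID,
#a tab, then the RefSeq ID, e.g. "ENST00000000001\tNM_000001", which yields
#dict1["NM_000001"] == "ENST00000000001".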
#write_ensembl_from_refseq - given a file with a list of RefSeq IDs (Refseq.txt) and the conversion
#dictionary, write the corresponding Ensembl IDs to a new file, Ensembl.txt.
def write_ensembl_from_refseq(dict1):
    with open("Refseq.txt", "r") as filevar1:
        list_refseq = filevar1.read().splitlines()
    with open("Ensembl.txt", "w") as filevar2:
        for refseq_id in list_refseq: #refseq_id rather than id, so the builtin isn't shadowed
            ens_id = dict1[refseq_id]
            filevar2.write(ens_id)
            filevar2.write("\n")
#filter_similar - filter out fragments that are extremely similar to each other.
#Like filter_subsequence, but instead of filtering by substring we remove fragments that match part
#of another fragment once their first 19 nucleotides are ignored.
#Input: dictionary; output: dictionary.
def filter_similar(dict1):
    values_list = list(dict1.values())
    new_dict = {}
    for key in dict1:
        add_or_not = True
        for value in values_list:
            if dict1[key] != value and dict1[key][19:] in value:
                add_or_not = False
        if add_or_not:
            new_dict[key] = dict1[key]
    print("Length of new dictionary: " + str(len(new_dict)))
    return new_dict
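#Sketch (hypothetical fragments): with A = "G"*19 + "CCCTTT" and B = "T"*19 + "CCCTTT",
#A[19:] is a substring of B and B[19:] is a substring of A, so BOTH are removed: near-duplicates
#that differ only in their first 19 nt can eliminate each other.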
#manually_add_sequences - append the manually retrieved sequences to the original FASTA file with the 3' UTRs, before everything else runs
def manually_add_sequences():
    dict1 = {}
    with open("manually_added.txt", "r") as filevar:
        lines = filevar.readlines()
        for lin in lines:
            lin_list = lin.split()
            gene_name = lin_list[0]
            ensembl_id = lin_list[1]
            utr_seq = lin_list[2]
            #put in FASTA format: >gene name|ENSEMBL ID, with the sequence on the next line
            key = ">" + gene_name + "|" + ensembl_id
            dict1[key] = utr_seq
    with open("UTRs of haploinsufficient genes MANE curated 641 out of 660.fasta", "a") as filevar2:
        for key in dict1:
            str_to_write = key + "\n" + dict1[key] + "\n"
            filevar2.write(str_to_write)
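#Assumed line format of manually_added.txt (hypothetical entry): three whitespace-separated fields,
#e.g. "GENE1 ENST00000000001 ACGTACGT", parsed as gene name, Ensembl ID, 3' UTR sequence.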
#cds_to_template - take the FASTA with the 270 nt fragments and replace each sequence with its
#reverse complement (the template strand, which is what we want to synthesize)
def cds_to_template():
    new_dict = {}
    utr_dict = SeqIO.to_dict(SeqIO.parse("270ntsequences.fasta", "fasta"))
    for key in utr_dict:
        cds = utr_dict[key]
        rev_comp = cds.reverse_complement()
        new_dict[key] = rev_comp
    return new_dict
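#Quick check (toy sequence): the reverse complement of "ATGC" is "GCAT" (complement TACG, reversed).
#Each value in new_dict is a SeqRecord, so it is converted to a string with seqdict_to_strdict
#before being written out.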
#add_ends_to_template - given a sequence to prepend and a sequence to append, add ends that include
#restriction enzyme cut sites, which will later be used to cleave the ends off the fragments.
def add_ends_to_template(dict1, begin_seq, end_seq):
    for key in dict1:
        new_seq = begin_seq + dict1[key] + end_seq
        dict1[key] = new_seq
    return dict1
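#Usage sketch (hypothetical adapter sequences; EcoRI's recognition site is GAATTC):
#add_ends_to_template(template_dict, "TTGAATTCTT", "TTGAATTCTT") would flank every fragment
#with EcoRI-cuttable ends. Note the input dictionary is modified in place.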
def run_program():
    #never run manually_add_sequences again with the same sequences unless you first delete the added sequences from the UTR FASTA file
    #manually_add_sequences()
    conversion_dict = refseq_to_ensembl()
    write_ensembl_from_refseq(conversion_dict)
    utr_dict = SeqIO.to_dict(SeqIO.parse("UTRs of haploinsufficient genes MANE curated 641 out of 660.fasta", "fasta"))
    find_genes(utr_dict)
    utr_dict = filter_unavailable(utr_dict)
    utr_dict = seqdict_to_strdict(utr_dict)
    #plot_hist(utr_dict)
    utr_dict = filter_subsequence(utr_dict) #new 11/9
    utr_dict = add_GFP_to_270(utr_dict)
    dict_270_nt, dict_number_seqs = dict_with_270nt_seq(utr_dict)
    write_seqs_before_dup_removal(dict_270_nt)
    write_seqs_per_transcript(dict_number_seqs)
    print("Number of total 270 nt fragments: " + str(len(dict_270_nt))) #32561 fragments in the dictionary when last run
    no_dupl_dict, qc_dict = remove_duplicates(dict_270_nt, "deleted_transcripts.txt")
    no_dupl_dict2 = remove_duplicates2(dict_270_nt, "deleted_transcripts2.txt")
    #assertion test: the fragments in no_dupl_dict equal those in no_dupl_dict2. The second mapping is
    #inverted, so compare the values of one against the keys of the other (the kept keys may differ).
    assert set(no_dupl_dict.values()) == set(no_dupl_dict2.keys())
    make_fragment_deleted_table(dict_number_seqs, qc_dict)
    subsequence_dict = filter_subsequence(no_dupl_dict)
    count_15(subsequence_dict)
    filter_similar_dict = filter_similar(subsequence_dict)
    #filter_similar only removed 63 sequences...
    count_15(filter_similar_dict)
    write_270_nt(filter_similar_dict, "270ntsequences.fasta")
    template_dict = cds_to_template()
    template_dict = seqdict_to_strdict(template_dict)
    write_270_nt(template_dict, "270ntTemplate.fasta")
run_program()