-
Notifications
You must be signed in to change notification settings - Fork 1
/
6-1-2_fill_missing_sequences.py
97 lines (82 loc) · 3.03 KB
/
6-1-2_fill_missing_sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# %%
import glob
import os
import numpy as np
species_list = ['hsapiens',
'ptroglodytes',
'mmulatta',
'mmusculus',
'btaurus',
'dnovemcinctus',
'sharrisii',
'oanatinus',
'ggallus',
'xtropicalis',
'drerio',
'ccarcharias',
'pmarinus']
# find all the files in the directory
files = glob.glob('./data/evo/03codalign/*.fa')
files.sort()
# create directory for filled files if it not already exists
if not os.path.exists('./data/evo/04codalign_filled'):
os.makedirs('./data/evo/04codalign_filled')
for file in files:
gene = os.path.basename(file).split('_')[0]
print(gene)
# per file read the protein sequences
txt = np.loadtxt(file, dtype=str)
if np.array_equal(txt, []):
print(f'WARNING: Empty file for {gene}\n')
continue
# find headers
# header start with >
headers = np.where(np.char.startswith(txt, '>'))[0]
if len(headers) < 8:
print(f'WARNING: Something went wrong with {gene}\n')
continue
# make sure space between headers is same
spacing = np.diff(headers)
spacing = np.unique(spacing)
if len(spacing) > 1:
print(f'WARNING: Unequal spacing between sequences for {gene}\n')
continue
# measure block length from the first to second header
sequence_block = txt[1:spacing[0]].tolist()
sequence_block = ''.join(sequence_block)
sequence_length = len(sequence_block)
# characters per line
nchars = len(txt[1])
empty_line = '-' * nchars
# characters in last line
nchars_last = len(txt[-1])
empty_last_line = '-' * nchars_last
# species are in the end of headers (last characters after _)
ordered = []
not_found = []
for species in species_list:
# find species line in the headers
line_matches = np.char.rfind(txt, species) > 0
header_idx = np.where(line_matches)[0]
if np.array_equal(header_idx, []):
empty_header = f'>XM_00000.0_XP_00000.0_{gene.lower()}_{species}'
# add empty lines as many times as spacing
sequence_block = [empty_line] * (spacing[0] - 2)
sequence_block.insert(0, empty_header)
sequence_block.append(empty_last_line)
if len(''.join(sequence_block[1:])) != sequence_length:
print(f'WARNING: Blank sequence length mismatch for {gene} {species}\n')
break
ordered.extend(sequence_block)
not_found.append(species)
else:
sequence_block = txt[header_idx[0]:header_idx[0]+spacing[0]].tolist()
ordered.extend(sequence_block)
if not_found == []:
print('All species found\n')
else:
print(f'Species not found: {", ".join(not_found)}\n')
# save the ordered sequences
with open(f'./data/evo/04codalign_filled/{gene}_codalign.fa', 'w') as f:
for line in ordered:
f.write(f'{line}\n')