forked from eric2302/ceballos_peptides
-
Notifications
You must be signed in to change notification settings - Fork 0
/
6-1-1_reorder_sequences.py
72 lines (60 loc) · 2.14 KB
/
6-1-1_reorder_sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# %%
import glob
import os
import numpy as np
species_list = ['hsapiens',
'ptroglodytes',
'mmulatta',
'mmusculus',
'btaurus',
'dnovemcinctus',
'sharrisii',
'oanatinus',
'ggallus',
'xtropicalis',
'drerio',
'ccarcharias',
'pmarinus']
# find all the files in the directory
proseq_files = glob.glob('./data/evo/01proseqs_align/*.fasta')
proseq_files.sort()
# create directory for reordered files if it not already exists
if not os.path.exists('./data/evo/02proseqs_ordered'):
os.makedirs('./data/evo/02proseqs_ordered')
for proseq in proseq_files:
gene = os.path.basename(proseq).split('_')[0]
print(gene)
# per file read the protein sequences
txt = np.loadtxt(proseq, dtype=str)
# find headers
# header start with >
headers = np.where(np.char.startswith(txt, '>'))[0]
if len(headers) < 8:
print(f'WARNING: Something went wrong with {gene}\n')
continue
# make sure space between headers is same
spacing = np.diff(headers)
spacing = np.unique(spacing)
if len(spacing) > 1:
print(f'WARNING: Unequal spacing between sequences for {gene}\n')
continue
# species are in the end of headers (last characters after _)
ordered = []
not_found = []
for species in species_list:
# find species line in the headers
line_matches = np.char.rfind(txt, species) > 0
header_idx = np.where(line_matches)[0]
if np.array_equal(header_idx, []):
not_found.append(species)
continue
sequence_block = txt[header_idx[0]:header_idx[0]+spacing[0]]
ordered.extend(sequence_block.tolist())
if not_found == []:
print('All species found\n')
else:
print(f'The following species were not found for {gene}: {", ".join(not_found)}\n')
# save the ordered sequences into new file
with open(f'./data/evo/02proseqs_ordered/{gene}_proseq_ordered.fasta', 'w') as f:
for line in ordered:
f.write(f'{line}\n')