-
Notifications
You must be signed in to change notification settings - Fork 1
/
PDB_flipper.py
371 lines (319 loc) · 12.6 KB
/
PDB_flipper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
import logging
from Bio.PDB import NeighborSearch
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.MMCIFParser import FastMMCIFParser
import util_functions as uf
C_RANGE = 3.8
LONG_SHORT_THRESHOLD = 7
MISS_THRESHOLD = 6
# Flipper object to represent a residue
class FlipperResidue:
# constructor sets important default parameters (has_prev and has_next must be False)
def __init__(self):
self.pdb_id = None
self.model_id = None
self.pdb_index = None
self.pdb_insertion_code = None
self.chain_id = None
self.pos_in_chain = None
self.name3 = 'XXX'
self.name1 = 'X'
self.has_prev = False
self.has_next = False
self.atoms_coord = []
self.c_alpha_coord = None
self.uniprot_id = None
self.uniprot_index = None
# id of the residue as string with insertion code
def string_index(self):
if not self.pdb_insertion_code == ' ':
return str(self.pdb_index) + self.pdb_insertion_code
return str(self.pdb_index)
# !!! RESIDUE FULL IDENTIFIER, used in FLIPPER to identify uniquely a residue. it is used from independent objects as DSSP_MANAGER
def get_full_identifier(self):
return "{}_{}_{}_{}{}".format(self.pdb_id, self.model_id, self.chain_id, self.pdb_index, self.pdb_insertion_code)
# Flipper object to represent a chain of residues
# if has a LIST of residues, but also has a DICTIONARY of the residues to access them directly
class FlipperChain:
# constructor sets the label of the chain and initializes some variables
def __init__(self, label):
self.id_label = label
self.residues = []
self.string_index_map = {}
self.rna_dna_chain = False
# get a residue from his string id (get position in list from map, then retrieve it)
def get_by_id(self, string_id):
return self.residues[self.string_index_map[string_id]]
# get the position in the chain of a residue, given its string id
def get_pos_by_id(self, string_id):
return self.string_index_map[string_id]
# check if a chain contains gaps between two residues.
def have_gaps(self, res1, res2):
for i in range(res1.pos_in_chain, res2.pos_in_chain):
if not self.residues[i].has_next:
return True
return False
# Flipper object to represent a Structure
# it is a map of labels ==> FlipperChains
class FlipperStructure:
# constructor
def __init__(self, pdb_id, mod_id):
self.pdb_id = pdb_id
self.model_id = mod_id
self.chains = {}
# get all the chains as a list
def get_chains(self):
return self.chains.values()
# Network of neighbors in a structure
# it is made as a dictionary, to be used with given methods
class NeighborsNet:
# constructor
def __init__(self):
self.nn = {}
# add default entry for a residue (empty)
def add_default(self, res):
self.nn.setdefault(res.get_full_identifier(), ([],[],[]))
# add an inter-chain neighbor for res1, that is res2
def add_inter(self, res1, res2):
self.nn[res1.get_full_identifier()][0].append(res2)
# add a short range intra-chain neighbor for res1, that is res2
def add_short(self, res1, res2):
self.nn[res1.get_full_identifier()][1].append(res2)
# add a long range intra-chain neighbor for res1, that is res2
def add_long(self, res1, res2):
self.nn[res1.get_full_identifier()][2].append(res2)
# return all neighbors of a residue
def get_neighbors(self, residue):
return self.nn.get(residue.get_full_identifier())
# return iter chain neighbor for given residue
def get_inter(self, residue):
neighbors = self.nn.get(residue.get_full_identifier())
if neighbors:
return neighbors[0]
return None
# return intra chain neighbor for given residue
def get_intra(self, residue):
neighbors = self.nn.get(residue.get_full_identifier())
if neighbors:
return neighbors[1]+neighbors[2]
return None
# return intra chain short range neighbor for given residue
def get_short(self, residue):
neighbors = self.nn.get(residue.get_full_identifier())
if neighbors:
return neighbors[1]
return None
# return intra chain long range neighbor for given residue
def get_long(self, residue):
neighbors = self.nn.get(residue.get_full_identifier())
if neighbors:
return neighbors[2]
return None
# count inter chain neighbors for given residue
def get_inter_count(self, residue):
inter = self.get_inter(residue)
if inter != None:
return len(inter)
return None
# count intra chain neighbors for given residue
def get_intra_count(self, residue):
intra = self.get_intra(residue)
if intra != None:
return len(intra)
return None
# count short intra chain neighbors for given residue
def get_short_count(self, residue):
sh = self.get_short(residue)
if sh != None:
return len(sh)
return None
# count long intra chain neighbors for given residue
def get_long_count(self, residue):
lo = self.get_long(residue)
if lo != None:
return len(lo)
return None
# count neighbors for given residue
def get_inter_and_long_count(self, residue):
neighbors = self.nn.get(residue.get_full_identifier())
if neighbors != None:
return len(neighbors[0]), len(neighbors[2])
return None, None
# get unique chains from residue neighbors, as pdb and uniprot identifiers
def get_chains(self, residue):
chains = []
uni_chains=[]
for res in self.get_inter(residue):
if not (res.chain_id in chains):
chains.append(res.chain_id)
if not (res.uniprot_id in uni_chains):
uni_chains.append(res.uniprot_id)
return chains, uni_chains
# Structure builder (FROM Bio.PDB objects)
class StructureBuilder:
# constructor, sets the path of the pdb file and configuration parameters
def __init__(self, file_path, config):
self.config = config
pdb_id = uf.parse_pdb_id_from_file(file_path)
self.bio_struct = None
if file_path[-4:] == '.pdb' or file_path[-4:] == '.ent':
self.bio_struct = PDBParser(QUIET=True).get_structure(pdb_id, file_path)
elif file_path[-4:] == '.cif':
self.bio_struct = FastMMCIFParser(auth_chains=False, auth_residues=True, QUIET=True).get_structure(pdb_id, file_path)
# self.bio_struct = MMCIFParser(QUIET=True).get_structure(pdb_id, file_path)
if self.bio_struct == None:
logging.error("{} Flipper unable to parse structure file: {}".format(pdb_id, file_path))
#sys.exit("ERROR! unable to parse structure: pdb_id: {} file: {}".format(pdb_id, file_path))
# add a residue to the structure
def add_residue(self, fl_chain, res):
# create an empty residue
fl_res = FlipperResidue()
### extract res info ###
fl_res.pdb_id = res_pdb_id(res)
fl_res.model_id = res_model_id(res)
fl_res.pdb_index = res_pdb_index(res)
fl_res.pdb_insertion_code = res_insertion_code(res)
fl_res.chain_id = fl_chain.id_label
fl_res.name3 = res.get_resname()
# if the name in one letter does not exists, let default 'X'
n = uf.aa_3to1.get(fl_res.name3)
if n:
fl_res.name1 = n
# if the chain we are inserting in is not a DNA or RNA chain
if not fl_chain.rna_dna_chain:
# get the atoms coordinates of the residue, and the alpha carbon also separately
for atom in res:
fl_res.atoms_coord.append(atom.get_coord())
if atom.id == 'CA':
fl_res.c_alpha_coord = atom.get_coord()
if fl_res.c_alpha_coord is not None:
# residues are inserted in order, so give it position equal to the length of the list
fl_res.pos_in_chain = len(fl_chain.residues)
# calculate distance between aplah carbons of this new residue and the previous one, to insert gap flags eventually
if fl_chain.residues and uf.distance_3D(fl_chain.residues[-1].c_alpha_coord, fl_res.c_alpha_coord) < self.config["open_gap_threshold"]:
fl_chain.residues[-1].has_next = True
fl_res.has_prev = True
# if the chain we are inserting in is a DNA or RNA chain
else:
# set the uniprot identifier as a string "DNA-RNA"
fl_res.uniprot_id = "DNA-RNA"
# add residue inside the chain object
if fl_res.c_alpha_coord is not None or fl_chain.rna_dna_chain:
fl_chain.string_index_map[fl_res.string_index()] = len(fl_chain.residues)
fl_chain.residues.append(fl_res)
# build the structure, giving it pdb_id as identigier
def build_structure(self, pdb_id, model_id=0):
# create Bio.PDB structure
fl_struct = FlipperStructure(pdb_id, self.bio_struct[model_id].id)
# for each chain from this structure (first model)
for chain in self.bio_struct[model_id]:
# create a FLipper chain with same id
fl_chain = FlipperChain(chain.id)
# if this chain is a DNA-RNA chain, set flag
if is_DNA(chain):
fl_chain.rna_dna_chain = True
# for each residue in the chain
for residue in chain:
# if it is not hetero, add it (so DNA-RNA residues too)
if is_good_res(residue):
self.add_residue(fl_chain, residue)
# add chain to the structure
fl_struct.chains[chain.id] = fl_chain
# return the structure
return fl_struct
# create the neighbors network for given struture
def make_neighbors(self, fl_struct):
# create an empty NeighborsNet
nn = NeighborsNet()
# use NeighborSearch from Bio.PDB to compute distances
ns = NeighborSearch(list(self.bio_struct.get_atoms()))
# for each chain in structure that is not a dna-rna one
for fl_chain in fl_struct.get_chains():
if not fl_chain.rna_dna_chain:
# for each residue in this chain
for fl_res in fl_chain.residues:
# add a default entry
nn.add_default(fl_res)
# keep track of already inserted nieghbors (the search is mate for each atom in the residue)
already_have = []
# for each atom (coordinates) in the residue
for atom_coord in fl_res.atoms_coord:
# for each residue in range
for res in ns.search(atom_coord, self.config["neighbors_range"], level='R'):
# check if it is good atom and the same model, cause sometimes NS computes all models
if is_good_res(res) and fl_res.model_id == res_model_id(res) and not res.get_full_id() in already_have:
# try to get FlipperResidueAssociated
pos_2 = fl_struct.chains[res_chain_id(res)].string_index_map.get(res_string_index(res))
# print(fl_res.get_full_identifier(), res.get_full_id(), fl_struct.chains[res_chain_id(res)].string_index_map.get(res_string_index(res)))
if not pos_2 == None:
fl_res_2 = fl_struct.chains[res_chain_id(res)].residues[pos_2]
# if the chain is the same
if fl_res.chain_id == fl_res_2.chain_id:
already_have.append(res.get_full_id())
if fl_res.pos_in_chain == fl_res_2.pos_in_chain:
continue
# if distance (as residue number) is less than threshold, then it is a short range neighbor
if abs(fl_res.pos_in_chain - fl_res_2.pos_in_chain) < self.config["long_short_threshold"] and not fl_chain.have_gaps(fl_res, fl_res_2):
nn.add_short(fl_res, fl_res_2)
# else it is a long rage neighbor
else:
nn.add_long(fl_res, fl_res_2)
# if it is not in the same chain it is an inter chain neighbor
else:
nn.add_inter(fl_res, fl_res_2)
already_have.append(res.get_full_id())
return nn
# check if a residue is not heteroatom
def is_good_res(res):
if res.id[0] == ' ':
return True
for atom in res:
if atom.id == 'CA':
return True
return False
# extract the index of the residue (relative to pdb file)
def res_pdb_index(res):
return res.id[1]
# extract the insertion of the residue
def res_insertion_code(res):
return res.id[2]
# extract the id of the pdb from a residue
def res_pdb_id(res):
return res.get_full_id()[0]
# extract the id of the model from a residue
def res_model_id(res):
return res.get_full_id()[1]
# extract the id of the chain from a residue
def res_chain_id(res):
return res.get_full_id()[2]
# res string index (id + insertion code) from a Bio.Residue object
def res_string_index(res):
if not res_insertion_code(res) == ' ':
return str(res_pdb_index(res)) + res_insertion_code(res)
return str(res_pdb_index(res))
# get one letter name of an amino acid, Bio.Residue object
def res_name_1_letter(res):
if res.get_resname() in uf.aa_3to1.keys():
return uf.aa_3to1[res.get_resname()]
else:
return 'X'
# check if given chain is a DNA/RNA chain
def is_DNA(chain):
# for each residue (cicle it is the only way to access residue info from a chain)
for res in chain:
# get the first good residue
if is_good_res(res):
return is_dna_res(res)
# no good residue has been found, discard the chain anyway
return True
# check if given chain is a DNA/RNA residue
# current strategy is that a DNA/RNA residue is not an hetero atom and does not have an alpha carbon
def is_dna_res(res):
if is_good_res(res):
for atom in res:
if atom.id == 'CA':
return False
return True
return False
# check if the residue name is one of nucleic acids names (If another different encoding is found add it in this list)
#return res.get_resname() in [' DA','DU',' DT',' DG',' DC','A','U','T','G','C',' A',' U',' T',' G',' C']