-
Notifications
You must be signed in to change notification settings - Fork 7
/
inchi.py
484 lines (419 loc) · 20.3 KB
/
inchi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
# -*- coding: utf-8 -*-
import re
# dictionary with mass references used in the inchi-isotopic layer
atom_mass_reference = {
"H":1, "He":4, "Li":7, "Be":9, "B":11, "C":12, "N":14, "O":16, "F":19, "Ne":20, "Na":23, "Mg":24, "Al":27, "Si":28,
"P":31, "S":32, "Cl":35, "Ar":40, "K":39, "Ca":40, "Sc":45, "Ti":48, "V":51, "Cr":52, "Mn":55, "Fe":56, "Co":59,
"Ni":59, "Cu":64, "Zn":65, "Ga":70, "Ge":73, "As":75, "Se":79, "Br":80, "Kr":84, "Rb":85, "Sr":88, "Y":89, "Zr":91,
"Nb":93, "Mo":96, "Ru":101, "Rh":103, "Pd":106, "Ag":108, "Cd":112, "In":115, "Sn":119, "Sb":122, "Te":128, "I":127,
"Xe":131, "Cs":133, "Ba":137, "La":139, "Ce":140, "Pr":141, "Nd":144, "Sm":150, "Eu":152, "Gd":157, "Tb":159, "Dy":163,
"Ho":165, "Er":167, "Tm":169, "Yb":173, "Lu":175, "Hf":178, "Ta":181, "W":184, "Re":186, "Os":190, "Ir":192, "Pt":195,
"Au":197, "Hg":201, "Tl":204, "Pb":207, "Bi":209, "Th":232, "U":238, }
class InChI:
"""
This class creates an InChI instance.
It parses the information for an InChI and generates some useful informations out of the InChI,
such as the stoichiometric formula, the massnumber, total charge
inchi = InChI (The IUPAC International Chemical Identifier)
example: inchi_example = '1S/CHN/c1-2/h1H/i2+1' which is the inchi of HC(15N)
"""
dict_layers = {'c':'connections',
'h':'H_atoms',
'q':'charge',
'p':'protons',
'b':'stereo_dbond',
't':'stereo_sp3',
'm':'stereo_sp3_inverted',
's':'stereo_type',
'r':'reconnectMetals',
'rc':'reconnect_connections',
'rh':'reconnect_H_atoms',
'rq':'reconnect_charge',
'rp':'reconnect_protons',
'rb':'reconnect_stereo_dbond',
'rt':'reconnect_stereo_sp3',
'rm':'reconnect_stereo_sp3_inverted',
'rs':'reconnect_stereo_type',
'ii':'isotopic_atoms',
'ih':'isotopic_exchangable_H',
'ib':'isotopic_stereo_bond',
'it':'isotopic_stereo_sp3',
'im':'isotopic_stereo_sp3_inverted',
'is':'isotopic_stereo_type',
'ff':'fixedH_formula',
'fh':'fixedH_H_fixed',
'fq':'fixedH_charge',
'ft':'fixedH_stereo_sp3',
'fm':'fixedH_stereo_sp3_inverted',
'fs':'fixedH_stereo_type',
'fi':'fixedH_isotopic_atoms',
}
def __init__(self, inchi):
self.inchi = inchi
self.parse_inchi(inchi = inchi)
def __clearall__(self):
self.massnumber = None
self.stoichiometric_formula = None
self.charge = None
self.hydrogens = None
self.dict_species = None
self.atoms = None
def __generate_dict_atomids__(self, formula):
"""
Creates a dictionary with all atoms of the specie except all hydrogens (H, D, T)
The key is an integer which can be used to identify the bonds/structure
formula: stoichiometric formula of the skeleton. In most cases it is the stoichiometric
formula of the specie but in some cases protons have to be added or substracted
(e.g. H3O+)
"""
# will contain a dictionary with ids of each atom except for hydrogens
dict_atoms = {}
# will contain the number of hydrogen atoms
h_atoms = 0
# find start position of new atom
indices = [m.start() for m in re.finditer("[A-Z]", formula)]
# append the length of the formula to the list to simplify splitting into different atoms
indices.append(len(formula))
# id - counter
id = 1
# split formula into different atoms
for i in xrange(len(indices)-1):
atoms = formula[indices[i]:indices[i+1]]
# loop through all identical atoms
found = re.search("[0-9]",atoms)
if found is None:
numberofatoms = 1
atomsymbol = atoms
else:
numberofatoms = int(atoms[found.start():])
atomsymbol = atoms[:found.start()]
# hydrogens are not included into dictionary and only the number
# of hydrogens will be returned
if atomsymbol == 'H':
h_atoms = numberofatoms
continue
for i in xrange(numberofatoms):
dict_atoms[id] = {'symbol':atomsymbol, 'isotope_mass_diff':0}
id += 1
return dict_atoms, h_atoms
def __parse_isotopic_layer__(self):
"""
Parses the isotopic layer of the InChI and updates the dictionary of atoms and hydrogens accordingly
"""
# initialize all isotopes with standard masses
for atomid in self.atoms:
atom = self.atoms[atomid]
mass_reference = atom_mass_reference[atom['symbol']]
atom['isotope'] = mass_reference
if not self.dict_species.has_key('isotopic_atoms'):
return
# parse isotopic layer and change isotopic mass number accordingly
layers = self.dict_species['isotopic_atoms'].split(',')
for layer in layers:
if layer.find('D')>-1:
# layer should have format aDn
# a is the identification number of the atom to which they are attached
# n is the number of these atoms attached to the (a)th atom, n=1 is ommited
layer, num_atoms = layer.split('D')
if num_atoms == '':
num_atoms = 1
else:
num_atoms = int(num_atoms)
found = re.search("[\+\-]", layer)
if found:
atom_id = int(layer[:found.start()])
else:
atom_id = int(layer)
layer = ''
for hid in self.hydrogens:
if num_atoms > 0 and self.hydrogens[hid]['connected_to'] == [atom_id]:
self.hydrogens[hid]['massnumber'] = 2
self.hydrogens[hid]['isotope'] = 'D'
num_atoms -= 1
elif layer.find('T')>-1:
# layer should have format aDn
# a is the identification number of the atom to which they are attached
# n is the number of these atoms attached to the (a)th atom, n=1 is ommited
layer, num_atoms = layer.split('T')
if num_atoms == '':
num_atoms = 1
else:
num_atoms = int(num_atoms)
found = re.search("[\+\-]", layer)
if found:
atom_id = int(layer[:found.start()])
else:
atom_id = int(layer)
layer = ''
for hid in self.hydrogens:
if num_atoms > 0 and self.hydrogens[hid]['connected_to'] == [atom_id]:
self.hydrogens[hid]['massnumber'] = 3
self.hydrogens[hid]['isotope'] = 'T'
num_atoms -= 1
if layer == '':
continue
elif layer.find('+')>-1:
atomid, isotope = layer.split('+')
isotope = int(isotope)
elif layer.find('-')>-1:
atomid, isotope = layer.split('-')
isotope = -int(isotope)
else:
print "Could not parse isotope info %s" % layer
continue
atom = self.atoms[int(atomid)]
mass_reference = atom_mass_reference[atom['symbol']]
atom['isotope_mass_diff'] = isotope
atom['isotope'] = mass_reference + isotope
#self.atoms[atomid]['isotope'] = isotope
# process /h isotopic sublayer (isotopic information on mobile H atoms)
if self.dict_species.has_key('isotopic_exchangable_H'):
exchangableH = self.dict_species['isotopic_exchangable_H']
found = re.search("[H,D,T][0-9]*",exchangableH)
while found:
isotope = exchangableH[found.start():found.end()]
exchangableH = exchangableH[found.end():]
found = re.search("[H,D,T][0-9]*",exchangableH)
# if the number of replacements is just one, then 1 has been ommited
if len(isotope)>1:
count_isotopes = int(isotope[1:])
else:
count_isotopes = 1
isotope = isotope[0]
if isotope == 'D':
mass = 2
elif isotope == 'T':
mass = 3
else:
# if it is just hydrogen then nothing has to be done
continue
# replace first all the mobile hydrogens
for hydrogen in self.hydrogens:
# if all hydrogens have been replaced then stop replacements
if count_isotopes == 0:
break
# check if it has been replaced already
if self.hydrogens[hydrogen]['massnumber'] != 1:
continue
if len(self.hydrogens[hydrogen]['connected_to'])>1:
self.hydrogens[hydrogen]['isotope'] = isotope
self.hydrogens[hydrogen]['massnumber'] = mass
count_isotopes -= 1
# replace then normal hydrogens in the order of occurances
for hydrogen in self.hydrogens:
# if all hydrogens have been replaced then stop replacements
if count_isotopes == 0:
break
# check if it has been replaced already
if self.hydrogens[hydrogen]['massnumber'] != 1:
continue
self.hydrogens[hydrogen]['isotope'] = isotope
self.hydrogens[hydrogen]['massnumber'] = mass
count_isotopes -= 1
def __parse_charge__(self):
"""
Parses the main charge layer.
"""
self.dict_species['totalcharge'] = 0
if not self.dict_species.has_key('charge'):
return
for q in self.dict_species['charge'].split(";"):
if q != "":
self.dict_species['totalcharge'] += int(q)
def __parse_protons__(self):
"""
Determines the number of additional protons from the inchi
"""
self.dict_species['num_protons'] = 0
if not self.dict_species.has_key('protons'):
return
for q in self.dict_species['protons'].split(";"):
if q != "":
self.dict_species['num_protons'] += int(q)
def __parse_hydrogen_layer__(self):
"""
The hyrdogne layer contains the information how many hydrogens are connected to each atom
example: 1H2 means 2 hydrogens are connected to atom 1
1,3H2 means 2 hydrogens are connected to atom1 and 2 to atom 3
1-3H2 means 2 hydrogens are connected to atom 1, 2 to atom 2 and 2 to atom 3
(H2, 1,3) means 2 hydrogens are mobile and migrate between atom 1 and 3 (2 hydrogens in total)
"""
# first hydrogen info which is enclosed in brackets have to be identified
# otherwise a split by ',' will not work
if not self.dict_species.has_key('H_atoms'):
self.dict_species['H_atoms'] = ""
hlayer = self.dict_species['H_atoms']
hydrogenlist = {}
h_id = 1
while hlayer.find('(')!=-1:
left = hlayer.find('(')
right = hlayer.find(')')
# add info to a list
mobileH = hlayer[left+1:right].split(',')
atomids = [int(id) for id in mobileH[1:]]
if mobileH[0]=='H':
count_H = 1
else:
count_H = int(mobileH[0][1:])
for i in xrange(count_H):
hydrogenlist[h_id] = {'isotope':'H', 'massnumber': 1,'connected_to':atomids}
h_id += 1
# remove info from the string
hlayer = hlayer.replace(hlayer[left:right+1],'')
layers = [i for i in hlayer.split(',') if i!='']
last_layer = ''
for layer in layers:
# there might be a kommata before the first occurance of 'H'
# which belongs to a list of atoms to which H are connected
layer = last_layer+','+layer
if layer.find('H') == -1:
# ignore this splitting and merge the result with next loop
last_layer = layer
continue
else:
last_layer = ''
# layer contains aHn (a is id of the connected atom, n is the number of hydrogens connected
atomids, count_H = layer.split('H')
id_range = atomids.split('-')
if len(id_range) == 2:
id_range = range(int(atomids.split('-')[0]), int(atomids.split('-')[1])+1)
else:
id_range = [int(id) for id in id_range[0].split(',') if id!='']
if count_H == '':
count_H = 1
else:
count_H = int(count_H)
for atomid in id_range:
for i in xrange(count_H):
hydrogenlist[h_id] = {'isotope':'H', 'massnumber': 1,'connected_to':[atomid]}
h_id += 1
# if less H atoms are in the list of hydrogens than expected from the formula specified in the InChI
# then the remaining H atoms have to be added to the skeleton (atomids)
if len(hydrogenlist) < self.dict_species['num_H_atoms']:
for i in xrange(self.dict_species['num_H_atoms'] - len(hydrogenlist)):
self.atoms[len(self.atoms)+1+i] = {'isotope': 1, 'symbol': 'H', 'isotope_mass_diff' : 0}
# add protons (to the end of the list)
if self.dict_species['num_protons']>0:
for i in xrange(self.dict_species['num_protons']):
hydrogenlist[h_id] = {'isotope':'H', 'massnumber': 1, 'charge': '1', 'connected_to':[]}
h_id += 1
# remove protons (from the end of the list)
if self.dict_species['num_protons']<0:
for i in xrange(abs(self.dict_species['num_protons'])):
h_id -= 1
hydrogen = hydrogenlist.pop(h_id)
# put a charge to the first atom to which it was connected
self.atoms[hydrogen['connected_to'][0]]['charge'] = -1
self.hydrogens = hydrogenlist
def __get_charge__(self):
self.__parse_charge__()
self.__parse_protons__()
# each proton adds a charge of +1 to the total charge
self.charge = self.dict_species['totalcharge'] + self.dict_species['num_protons']
def __generate_stoichiometric_formula__(self):
"""
Creates the stoichiometric formula from the main and the charge layer
"""
# create the string which contains the charge info in the stoichiometric formula e.g. '+', '-2', '+3'
charge_str = ''
if self.charge > 0:
charge_str = "+"
elif self.charge < 0:
charge_str = "-"
if abs(self.charge) > 1:
charge_str += "%d" % abs(self.charge)
# create dictionary with number of each atom
dict_atom_count = {}
for atomid in self.atoms:
if not dict_atom_count.has_key(self.atoms[atomid]['symbol']):
dict_atom_count[self.atoms[atomid]['symbol']] = 1
else:
dict_atom_count[self.atoms[atomid]['symbol']] += 1
# calculate number of hydrogens (include protons) and add them to the dict
count_hydrogens = self.dict_species['num_H_atoms'] + self.dict_species['num_protons']
if count_hydrogens > 0:
dict_atom_count['H'] = count_hydrogens
# create the string which contains the atom info in Hill's notation (alphabetic order)
self.stoichiometric_formula = ""
for atom in sorted(dict_atom_count):
self.stoichiometric_formula += atom
if dict_atom_count[atom]>1:
self.stoichiometric_formula += "%d" % dict_atom_count[atom]
# combine atom- and charge-info
self.stoichiometric_formula += "%s" % charge_str
def __get_massnumber__(self):
"""
Determines the mass number from the atom dict and the hydrogen dict
"""
self.massnumber = 0
for id in self.atoms:
self.massnumber += self.atoms[id]['isotope']
for id in self.hydrogens:
self.massnumber += self.hydrogens[id]['massnumber']
def __add_hydrogens2atoms__(self):
"""
Add the hydrogens to the dictionary of atoms. Hydrogens have been removed from the list
in the InChI
"""
atom_length = len(self.atoms)
for id in self.hydrogens:
if self.hydrogens[id].has_key('charge'):
self.atoms[int(id)+atom_length] = {'isotope': self.hydrogens[id]['massnumber'],
'symbol': 'H',
'charge': self.hydrogens[id]['charge'],
}
else:
self.atoms[int(id)+atom_length] = {'isotope': self.hydrogens[id]['massnumber'],
'symbol': 'H',
}
def parse_inchi(self, inchi):
"""
Parses the InChI.
"""
# make sure old values will be overwritten
self.__clearall__()
self.inchi = inchi
# will contain a dictionary of return values
self.dict_species = {}
# generate list of layers
layers = inchi.split("/")
# fist layer defines if inchi is standard
if layers[0] == "1S":
self.dict_species['standard_inchi'] = True
else:
self.dict_species['standard_inchi'] = False
init_layer = 1 # start processing with this layer
# second layer specifies stoichiometric formula in Hill's notation
if layers[1][0].upper() == layers[1][0]:
self.dict_species['formula'] = layers[1]
init_layer += 1
else:
self.dict_species['formula'] = ""
self.dict_species['atomids'], self.dict_species['num_H_atoms'] = self.__generate_dict_atomids__(self.dict_species['formula'])
self.atoms = self.dict_species['atomids']
# loop through all the layers
layer_str = ''
for layer in layers[init_layer:]:
# connection layer
# there are two different '/h'-layers, one before and one after the '/i' isotopic
# layer. Both have different meanings!!! As far as I understand the second one
# contains additonal information for fixed hydrogens (or D,T) (isotopic info for hydrogens)
# isotopic layer begins with sublayer i
if layer[0] == 'i':
layer_str = 'i'
# fixed_H layer begins with sublayer f (only present if non-standard inchi)
if layer[0] == 'f':
layer_str = 'f'
self.dict_species[self.dict_layers["%s%s" % (layer_str, layer[0])]] = layer[1:]
# parse layer with charge and proton information
self.__get_charge__()
# parse main layer with hydrogen information
self.__parse_hydrogen_layer__()
# parse layer with istotopic information
self.__parse_isotopic_layer__()
# create the stoichiometric formula
self.__generate_stoichiometric_formula__()
# get the massnumber
self.__get_massnumber__()
# create a dictionary with all atoms including hydrogens
self.__add_hydrogens2atoms__()