utils.py
import yaml
import re
import selfies as sf


class CharVocab:
    def __init__(self, vocab_path):
        self.name = "char"

        # load the pre-computed vocabulary
        with open(vocab_path, 'r') as f:
            self.vocab = yaml.full_load(f)

        # a dictionary to map integers back to SMILES
        # tokens for sampling
        self.int2tocken = {}
        for token, num in self.vocab.items():
            self.int2tocken[num] = token

        # a set-like view of tokens for O(1) lookup
        self.tokens = self.vocab.keys()

    def tokenize_smiles(self, smiles):
        """
        Takes a SMILES string and returns a list of integer tokens.
        Atoms with 2 characters are treated as one token. The
        logic references this code piece:
        https://github.com/topazape/LSTM_Chem/blob/master/lstm_chem/utils/smiles_tokenizer2.py
        """
        n = len(smiles)
        tokenized = ['<sos>']
        i = 0

        # process all characters except the last one
        while i < n - 1:
            # process tokens of length 2 first
            c2 = smiles[i:i + 2]
            if c2 in self.tokens:
                tokenized.append(c2)
                i += 2
                continue

            # then tokens of length 1
            c1 = smiles[i]
            if c1 in self.tokens:
                tokenized.append(c1)
                i += 1
                continue

            raise ValueError(
                "Unrecognized character in SMILES: {}, {}".format(c1, c2))

        # process the last character if one is left over
        if i == n:
            pass
        elif i == n - 1 and smiles[i] in self.tokens:
            tokenized.append(smiles[i])
        else:
            raise ValueError(
                "Unrecognized character in SMILES: {}".format(smiles[i]))

        tokenized.append('<eos>')

        # map tokens to their integer ids
        tokenized = [self.vocab[token] for token in tokenized]
        return tokenized

    def combine_list(self, smiles):
        return "".join(smiles)


class RegExVocab:
    def __init__(self, vocab_path):
        self.name = "regex"

        # load the pre-computed vocabulary
        with open(vocab_path, 'r') as f:
            self.vocab = yaml.full_load(f)

        # a dictionary to map integers back to SMILES
        # tokens for sampling
        self.int2tocken = {}
        for token, num in self.vocab.items():
            if token == "R":
                self.int2tocken[num] = "Br"
            elif token == "L":
                self.int2tocken[num] = "Cl"
            else:
                self.int2tocken[num] = token

    def tokenize_smiles(self, smiles):
        """Takes a SMILES string and returns a list of integer tokens.
        This will swap 'Cl' and 'Br' to 'L' and 'R' and treat
        '[xx]' as one token."""
        regex = r'(\[[^\[\]]{1,6}\])'
        smiles = self.replace_halogen(smiles)
        char_list = re.split(regex, smiles)

        tokenized = ['<sos>']
        for char in char_list:
            if char.startswith('['):
                tokenized.append(char)
            else:
                tokenized.extend(char)
        tokenized.append('<eos>')

        # map tokens to their integer ids
        tokenized = [self.vocab[token] for token in tokenized]
        return tokenized

    def replace_halogen(self, string):
        """Replace 'Br' and 'Cl' with the single letters 'R' and 'L'."""
        br = re.compile('Br')
        cl = re.compile('Cl')
        string = br.sub('R', string)
        string = cl.sub('L', string)
        return string

    def combine_list(self, smiles):
        return "".join(smiles)


class SELFIESVocab:
    def __init__(self, vocab_path):
        self.name = "selfies"

        # load the pre-computed vocabulary
        with open(vocab_path, 'r') as f:
            self.vocab = yaml.full_load(f)

        # a dictionary to map integers back to SELFIES tokens for sampling
        self.int2tocken = {value: key for key, value in self.vocab.items()}

    def tokenize_smiles(self, smiles):
        """Convert the SMILES to SELFIES, then return integer tokens."""
        ints = [self.vocab['<sos>']]

        encoded_selfies = sf.encoder(smiles)
        selfies_list = list(sf.split_selfies(encoded_selfies))
        for token in selfies_list:
            ints.append(self.vocab[token])

        ints.append(self.vocab['<eos>'])
        return ints

    def combine_list(self, selfies):
        return "".join(selfies)