-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_smiles_list.py
83 lines (60 loc) · 2 KB
/
build_smiles_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import scipy.spatial
from MolCapArena.data.processing import MolecularDataset, index_predetermined_split
import pickle
import pandas as pd
import os
from collections import defaultdict
from tqdm import tqdm
# Unique SMILES entries gathered from every benchmark dataset; filled by the
# loading loop further down and later turned into a list for the output CSV.
all_smiles = set()
class BenchmarkDataset:
    """Describe one benchmark dataset and load its data and splits from disk.

    Args:
        name: File stem shared by the dataset CSV (``<name>.csv``) and its
            splits pickle (``<name>_splits.pkl``).
        tasks: Label column names handed to the CSV loader.
        task_type: Free-form tag for the kind of task; this class only
            stores it.
        splits_dir: Directory holding the CSV and pickle files. Defaults to
            ``"splits/"`` (relative to the working directory), which is the
            location that used to be hard-coded.
    """

    def __init__(self, name, tasks, task_type, splits_dir="splits/"):
        self.name = name
        self.tasks = tasks
        self.task_type = task_type
        self.splits_dir = splits_dir

    def __repr__(self):
        return (
            f"{type(self).__name__}(name={self.name!r}, tasks={self.tasks!r}, "
            f"task_type={self.task_type!r}, splits_dir={self.splits_dir!r})"
        )

    def load_dataset(self):
        """Return the dataset read from ``<splits_dir>/<name>.csv``.

        The ``smiles`` column supplies the molecules and ``self.tasks`` the
        label columns; the result is whatever
        ``MolecularDataset.load_csv_dataset`` returns.
        """
        csv_path = os.path.join(self.splits_dir, self.name + ".csv")
        return MolecularDataset.load_csv_dataset(
            csv_path,
            smiles_column_name="smiles",
            y_column_names=self.tasks,
        )

    def load_splits(self):
        """Return the object unpickled from ``<splits_dir>/<name>_splits.pkl``.

        Raises:
            FileNotFoundError: If the splits file does not exist.
        """
        pkl_path = os.path.join(self.splits_dir, self.name + "_splits.pkl")
        # NOTE: unpickling can execute arbitrary code; only point this at
        # split files that come from a trusted source.
        with open(pkl_path, "rb") as handle:
            return pickle.load(handle)
# Benchmark datasets whose SMILES are merged into the combined list. Each entry
# gives the file stem that BenchmarkDataset loads and the label column(s) to
# read from it.
datasets = [
    BenchmarkDataset(
        name="BBBP_clean", tasks=("p_np",), task_type="binary_classification"
    ),
    BenchmarkDataset(
        name="bace_clean", tasks=("Class",), task_type="binary_classification"
    ),
    BenchmarkDataset(
        name="clintox_clean", tasks=("CT_TOX",), task_type="binary_classification"
    ),
    BenchmarkDataset(
        name="esol_clean",
        tasks=("measured log solubility in mols per litre",),
        task_type="regression",
    ),
    BenchmarkDataset(name="freesolv_clean", tasks=("y",), task_type="regression"),
    BenchmarkDataset(name="lipo_clean", tasks=("exp",), task_type="regression"),
]

# Maps each dataset's file stem to the task label written to the output CSV.
fix_task = {
    "BBBP_clean": "BBBP",
    "clintox_clean": "ClinTox",
    "bace_clean": "BACE",
    "esol_clean": "ESOL",
    "freesolv_clean": "FreeSolv",
    "lipo_clean": "Lipo",
}

# SMILES -> labels of every dataset containing it, in `datasets` order.
smiles_to_tasks = defaultdict(list)
for benchmark in datasets:
    label = fix_task[benchmark.name]
    dataset = benchmark.load_dataset()
    all_smiles.update(dataset.smiles)
    for smi in dataset.smiles:
        labels = smiles_to_tasks[smi]
        # A SMILES that occurs several times within one dataset must not get
        # that dataset's label repeated (e.g. "BBBP:BBBP").
        if label not in labels:
            labels.append(label)

# Sort so the row order of the CSV is reproducible; plain set iteration order
# changes between runs because string hashing is randomized per process.
all_smiles = sorted(all_smiles)
tasks = [":".join(smiles_to_tasks[smi]) for smi in all_smiles]

# One row per unique SMILES, with a colon-separated list of its source tasks.
df = pd.DataFrame({"SMILES": all_smiles, "tasks": tasks})
df.to_csv("all_smiles.csv", index=False)