diff --git a/teachopencadd/talktorials/T039_molecular_transformers/code/data.py b/teachopencadd/talktorials/T039_molecular_transformers/code/data.py new file mode 100644 index 00000000..50da031f --- /dev/null +++ b/teachopencadd/talktorials/T039_molecular_transformers/code/data.py @@ -0,0 +1,97 @@ +import numpy as np +from torch.nn.utils.rnn import pad_sequence + +import random +import tokenizer +import math +import torch + + +def generate_random_data(n): + SOS_token = np.array([2]) + EOS_token = np.array([3]) + length = 8 + + data = [] + + # 1,1,1,1,1,1 -> 1,1,1,1,1 + for i in range(n // 3): + X = np.concatenate((SOS_token, np.ones(length), EOS_token)) + y = 1 # np.concatenate((SOS_token, np.ones(length), EOS_token)) + data.append([X, y]) + + # 0,0,0,0 -> 0,0,0,0 + for i in range(n // 3): + X = np.concatenate((SOS_token, np.zeros(length), EOS_token)) + y = 0 # np.concatenate((SOS_token, np.zeros(length), EOS_token)) + data.append([X, y]) + + # 1,0,1,0 -> 1,0,1,0,1 + for i in range(n // 3): + X = np.zeros(length) + start = random.randint(0, 1) + + X[start::2] = 1 + + # y = np.zeros(length) + #if X[-1] == 0: + # y[::2] = 1 + #else: + # y[1::2] = 1 + + X = np.concatenate((SOS_token, X, EOS_token)) + y = 1 # np.concatenate((SOS_token, y, EOS_token)) + + data.append([X, y]) + + np.random.shuffle(data) + + return data + + +def batchify_data(data, batch_size=64, padding=True, padding_token=30): + batches = [] + for idx in range(0, len(data), batch_size): + # We make sure we dont get the last bit if its not batch_size size + if idx + batch_size < len(data): + # Here you would need to get the max length of the batch, + # and normalize the length with the PAD token. + if padding: + max_batch_length = 0 + + # Get longest sentence in batch + for seq in data[idx : idx + batch_size]: + if len(seq[0]) > max_batch_length: + max_batch_length = len(seq[0]) + # Append X padding tokens until it reaches the max length + for seq_idx in range(batch_size): + remaining_length = max_batch_length - len(data[idx + seq_idx][0]) + data[idx + seq_idx][0] = np.concatenate([data[idx + seq_idx][0], np.array([padding_token] * remaining_length, dtype=np.int64)], axis=0) + batches.append(np.array(data[idx : idx + batch_size])) + + print(f"{len(batches)} batches of size {batch_size}") + # batches = add_padding(batches) + return batches + + +def generate_dataset(smiles, y, token, vocab): + # data = np.array(zip(smiles, y)) + # build a vocab using the training data + + data = [] + smiles = [tokenizer.smiles_to_ohe(smi, token, vocab) for smi in smiles] + for smi, tar in zip(smiles, y): + if not math.isnan(tar): + smi = np.array(smi) + # smi = np.concatenate((SOS_token, smi, EOS_token)) + data.append([smi, tar]) + + np.random.shuffle(data) + return data + +def add_padding(data): + data = pad_sequence(sequences=torch.tensor(data), + batch_first=True, + padding_value=0, + ) + return data \ No newline at end of file diff --git a/teachopencadd/talktorials/T039_molecular_transformers/code/encoder.py b/teachopencadd/talktorials/T039_molecular_transformers/code/encoder.py new file mode 100644 index 00000000..fa324aef --- /dev/null +++ b/teachopencadd/talktorials/T039_molecular_transformers/code/encoder.py @@ -0,0 +1,42 @@ +import torch +import torch.nn as nn +import math + + +class PositionalEncoding(nn.Module): + def __init__(self, dim_model, dropout_p, max_len): + super().__init__() + # Modified version from: https://pytorch.org/tutorials/beginner/transformer_tutorial.html + # max_len determines how far the position can have an effect 
on a token (window) + + # Info + self.dropout = nn.Dropout(dropout_p) + + # Encoding - From formula + pos_encoding = torch.zeros(max_len, 1, dim_model) + # positions_list = torch.arange(0, max_len, dtype=torch.float).view(-1, 1) # 0, 1, 2, 3, 4, 5 + # positions_list = torch.arange(max_len).unsqueeze(1) + # division_term = torch.exp(torch.arange(0, dim_model, 2) * (-math.log(10000.0) / dim_model)) + + # division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0)) / dim_model) # 1000^(2i/dim_model) + factor = -math.log(10000.0) / dim_model # outs loop + for pos in range(0, max_len): # position of word in seq + for i in range(0, dim_model, 2): # pos of embed of word + div_term = math.exp(i * factor) + pos_encoding[pos, 0, i] = math.sin(pos * div_term) + pos_encoding[pos, 0, i+1] = math.cos(pos * div_term) + # PE(pos, 2i) = sin(pos/1000^(2i/dim_model)) + # pos_encoding[:, 0, 0::2] = torch.sin(positions_list * division_term) + + # PE(pos, 2i + 1) = cos(pos/1000^(2i/dim_model)) + # pos_encoding[:, 0, 1::2] = torch.cos(positions_list * division_term) + + # Saving buffer (same as parameter without gradients needed) + # pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1) + self.register_buffer("pos_encoding", pos_encoding) + + def forward(self, token_embedding: torch.tensor) -> torch.tensor: + # Residual connection + pos encoding + pos_enc = self.pos_encoding[:token_embedding.size(0), :] + token_embedding = token_embedding + pos_enc + return self.dropout(token_embedding) \ No newline at end of file diff --git a/teachopencadd/talktorials/T039_molecular_transformers/code/inference.py b/teachopencadd/talktorials/T039_molecular_transformers/code/inference.py new file mode 100644 index 00000000..24e9c014 --- /dev/null +++ b/teachopencadd/talktorials/T039_molecular_transformers/code/inference.py @@ -0,0 +1,47 @@ +import torch +import numpy as np + + +def predict(model, input_sequence): + """ + Method from "A detailed guide to Pytorch's nn.Transformer() module.", by + Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1 + """ + model.eval() + + # Get source mask + # tgt_mask = model.get_tgt_mask(y_input.size(1)).to(device) + + pred = model(input_sequence) + # pred = pred.argmax(axis=1) + + return pred + + +def test_loop(model, loss_fn, dataloader, device): + + total_loss = 0 + # total_acc = 0 + # total = 0 + prediction = np.empty((0)) + ground_truth = np.empty((0)) + model.eval() + + for batch in dataloader: + with torch.no_grad(): + X, y = batch[:, 0], batch[:, 1] + X = np.array([arr.astype(np.int64) for arr in X]) + X, y = torch.tensor(X).to(device), torch.tensor(y.astype(np.float32)).to(device) + + pred = model(X) + loss = loss_fn(pred, y.float().unsqueeze(1)) + + # correct = pred.argmax(axis=1) == y + total_loss += loss.detach().item() + # total_acc += correct.sum().item() + # total += correct.size(0) + prediction = np.concatenate((prediction, pred.cpu().detach().numpy()[:, 0])) + ground_truth = np.concatenate((ground_truth, y.cpu().detach().numpy())) + + return total_loss / len(dataloader), prediction, ground_truth# , total_acc / total + diff --git a/teachopencadd/talktorials/T039_molecular_transformers/code/main.py b/teachopencadd/talktorials/T039_molecular_transformers/code/main.py new file mode 100644 index 00000000..3f32d2a0 --- /dev/null +++ b/teachopencadd/talktorials/T039_molecular_transformers/code/main.py @@ -0,0 +1,70 @@ +import torch +import torch.nn as nn + +import transformer +import data +import 
inference +import training +import tokenizer +import plot + +from pathlib import Path +import pandas as pd +import os + + +HERE = Path(__file__).parent.resolve() +DATA = HERE / "data" + +# load the dataset +df = pd.read_csv(os.path.join(DATA, "qm9.csv.gz"), compression="gzip") +df = df.sample(frac=1).reset_index(drop=True) + +smiles = df["smiles"].tolist() +y = df["mu"] + + + +# smiles = [tokenizer.smiles_to_ohe(smi, token, vocab) for smi in smiles] +sample_size = len(y) # 50000 +train_index = int(sample_size * 0.8) +test_index = train_index + int(sample_size * 0.1) + +# normalize data +y_mean = y[:train_index].mean() +y_std = y[:train_index].std() +y = (y - y_mean) / y_std + + +max_vocab_size = 30 +token = tokenizer.SmilesTokenizer() +vocab = tokenizer.build_vocab(smiles[:sample_size], token, max_vocab_size) +vocab_size = len(vocab) + +train_data = data.generate_dataset(smiles[:train_index], y[:train_index], token, vocab) +val_data = data.generate_dataset(smiles[train_index:test_index], y[train_index:test_index], token, vocab) +test_data = data.generate_dataset(smiles[test_index:sample_size], y[test_index:sample_size], token, vocab) + +train_dataloader = data.batchify_data(train_data) +val_dataloader = data.batchify_data(val_data) +test_dataloader = data.batchify_data(test_data) + + +# device = "cuda" if torch.cuda.is_available() else "cpu" +device = "mps" if torch.backends.mps.is_available and torch.backends.mps.is_built() else "cpu" +print(device) + +model = transformer.Transformer( + num_tokens=vocab_size, dim_model=100, num_heads=4, num_encoder_layers=3, dropout_p=0.2 +).to(device) +opt = torch.optim.SGD(model.parameters(), lr=0.01) +# loss_fn = nn.CrossEntropyLoss() +loss_fn = nn.MSELoss() + +train_loss_list, val_loss_list = training.fit(model, opt, loss_fn, train_dataloader, val_dataloader, 50, device) + +plot.plot_loss(train_loss_list, val_loss_list) + +test_loss, predictions, ground_truth = inference.test_loop(model, loss_fn, test_dataloader, device) +print(f"Test loss: {test_loss:.4f}") +plot.plot_targets(predictions, ground_truth) diff --git a/teachopencadd/talktorials/T039_molecular_transformers/code/plot.py b/teachopencadd/talktorials/T039_molecular_transformers/code/plot.py new file mode 100644 index 00000000..5c6315db --- /dev/null +++ b/teachopencadd/talktorials/T039_molecular_transformers/code/plot.py @@ -0,0 +1,44 @@ +import matplotlib.pylab as plt +from matplotlib.ticker import MaxNLocator + + +def plot_loss(train_loss, val_loss): + """Plot the loss for each epoch + + Args: + epochs (int): number of epochs + train_loss (array): training losses for each epoch + val_loss (array): validation losses for each epoch + """ + plt.plot(train_loss, label="Training loss") + plt.plot(val_loss, label="Validation loss") + plt.legend() + plt.ylabel("loss") + plt.xlabel("epoch") + plt.title("Model Loss") + plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) + # plt.show() + plt.savefig("plots/loss_4.png") + + + + +def plot_targets(pred, ground_truth): + """Plot true vs predicted value in a scatter plot + + Args: + pred (array): predicted values + ground_truth (array): ground truth values + """ + f, ax = plt.subplots(figsize=(6, 6)) + ax.scatter(pred, ground_truth, s=0.5) + plt.xlim(-2, 7) + plt.ylim(-2, 7) + ax.axline((1, 1), slope=1) + plt.xlabel("Predicted Value") + plt.ylabel("Ground truth") + plt.title("Ground truth vs prediction") + # plt.show() + plt.savefig("plots/scatter_4.png") + + diff --git a/teachopencadd/talktorials/T039_molecular_transformers/code/tokenizer.py 
b/teachopencadd/talktorials/T039_molecular_transformers/code/tokenizer.py new file mode 100644 index 00000000..227045e0 --- /dev/null +++ b/teachopencadd/talktorials/T039_molecular_transformers/code/tokenizer.py @@ -0,0 +1,81 @@ +import re +from collections import Counter + + +class SmilesTokenizer(object): + """ + A simple regex-based tokenizer adapted from the deepchem smiles_tokenizer package. + SMILES regex pattern for the tokenization is designed by Schwaller et. al., ACS Cent. Sci 5 (2019) + """ + + def __init__(self): + self.regex_pattern = ( + r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\." + r"|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])" + ) + self.regex = re.compile(self.regex_pattern) + + def tokenize(self, smiles): + """ + Tokenizes SMILES string. + + Parameters + ---------- + smiles : str + Input SMILES string. + + Returns + ------- + List[str] + A list of tokens. + """ + tokens = [token for token in self.regex.findall(smiles)] + return tokens + +def build_vocab(smiles_list, tokenizer, max_vocab_size): + """ + Builds a vocabulary of N=max_vocab_size most common tokens from list of SMILES strings. + + Parameters + ---------- + smiles_list : List[str] + List of SMILES strings. + tokenizer : SmilesTokenizer + max_vocab_size : int + Maximum size of vocabulary. + + Returns + ------- + Dict[str, int] + A dictionary that defines mapping of a token to its index in the vocabulary. + """ + tokenized_smiles = [tokenizer.tokenize(s) for s in smiles_list] + token_counter = Counter(c for s in tokenized_smiles for c in s) + tokens = [token for token, _ in token_counter.most_common(max_vocab_size)] + vocab = {token: idx for idx, token in enumerate(tokens)} + return vocab + + +def smiles_to_ohe(smiles, tokenizer, vocab): + """ + Transforms SMILES string to one-hot encoding representation. + + Parameters + ---------- + smiles : str + Input SMILES string. + tokenizer : SmilesTokenizer + vocab : Dict[str, int] + A dictionary that defines mapping of a token to its index in the vocabulary. 
+ + Returns + ------- + Tensor + A pytorch Tensor with shape (n_tokens, vocab_size), where n_tokens is the + length of tokenized input string, vocab_size is the number of tokens in + the vocabulary + """ + unknown_token_id = len(vocab) - 1 + token_ids = [vocab.get(token, unknown_token_id) for token in tokenizer.tokenize(smiles)] + # ohe = torch.eye(len(vocab))[token_ids] + return token_ids # ohe \ No newline at end of file diff --git a/teachopencadd/talktorials/T039_molecular_transformers/code/training.py b/teachopencadd/talktorials/T039_molecular_transformers/code/training.py new file mode 100644 index 00000000..59a3ffd4 --- /dev/null +++ b/teachopencadd/talktorials/T039_molecular_transformers/code/training.py @@ -0,0 +1,117 @@ +import torch +import numpy as np + + +def train_loop(model, opt, loss_fn, dataloader, device): + """ + Method from "A detailed guide to Pytorch's nn.Transformer() module.", by + Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1 + """ + + model.train() + total_loss = 0 + # total_acc = 0 + # total = 0 + + for batch in dataloader: + X, y = batch[:, 0], batch[:, 1] + # X = np.vstack(X).astype(np.int64) + X = np.array([arr.astype(np.int64) for arr in X]) + X, y = torch.tensor(X).to(device), torch.tensor(y.astype(np.float32)).to(device) + + # Now we shift the tgt by one so with the we predict the token at pos 1 + # y_input = y[:,:-1] + # y_expected = y[:,1:] + + # Get mask to mask out the next words + # sequence_length = y_input.size(1) + # tgt_mask = model.get_tgt_mask(sequence_length).to(device) + + # Standard training except we pass in y_input and tgt_mask + pred = model(X) # y_input, tgt_mask) + + # Permute pred to have batch size first again + # pred = pred.permute(1, 2, 0) + loss = loss_fn(pred, y.float().unsqueeze(1)) # y_expected) + + # correct = pred.argmax(axis=1) == y + + opt.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) + opt.step() + + total_loss += loss.detach().item() + # total_acc += correct.sum().item() + # total += correct.size(0) + + return total_loss / len(dataloader)# , total_acc / total + + +def validation_loop(model, loss_fn, dataloader, device): + """ + Method from "A detailed guide to Pytorch's nn.Transformer() module.", by + Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1 + """ + + model.eval() + total_loss = 0 + # total_acc = 0 + # total = 0 + + with torch.no_grad(): + for batch in dataloader: + X, y = batch[:, 0], batch[:, 1] + X = np.vstack(X).astype(np.int64) + X, y = torch.tensor(X).to(device), torch.tensor(y.astype(np.float32)).to(device) + # Now we shift the tgt by one so with the we predict the token at pos 1 + # y_input = y[:,:-1] + # y_expected = y[:,1:] + + # Get mask to mask out the next words + # sequence_length = y_input.size(1) + # tgt_mask = model.get_tgt_mask(sequence_length).to(device) + + # Standard training except we pass in y_input and src_mask + pred = model(X) #y_input, tgt_mask) + + # Permute pred to have batch size first again + # pred = pred.permute(1, 2, 0) + loss = loss_fn(pred, y.unsqueeze(1)) #y_expected) + # correct = pred.argmax(axis=1) == y + total_loss += loss.detach().item() + # total_acc += correct.sum().item() + # total += correct.size(0) + + return total_loss / len(dataloader)# , total_acc / total + + +def fit(model, opt, loss_fn, train_dataloader, val_dataloader, epochs, device): + """ + Method from "A detailed guide to Pytorch's 
nn.Transformer() module.", by + Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1 + """ + + # Used for plotting later on + train_loss_list, validation_loss_list = [], [] + + print("Training and validating model") + for epoch in range(epochs): + print("-"*25, f"Epoch {epoch + 1}","-"*25) + + train_loss = train_loop(model, opt, loss_fn, train_dataloader, device) + train_loss_list += [train_loss] + #train_acc_list += [train_acc] + + validation_loss = validation_loop(model, loss_fn, val_dataloader, device) + validation_loss_list += [validation_loss] + #validation_acc_list += [validation_acc] + + print(f"Training loss: {train_loss:.4f}") + #print(f"Training accuracy: {train_acc:.4f}") + print(f"Validation loss: {validation_loss:.4f}") + #print(f"Validation accuracy: {validation_acc:.4f}") + print() + + return train_loss_list, validation_loss_list# , train_acc_list, validation_acc_list + diff --git a/teachopencadd/talktorials/T039_molecular_transformers/code/transformer.py b/teachopencadd/talktorials/T039_molecular_transformers/code/transformer.py new file mode 100644 index 00000000..ff898464 --- /dev/null +++ b/teachopencadd/talktorials/T039_molecular_transformers/code/transformer.py @@ -0,0 +1,100 @@ +import torch +import torch.nn as nn +import encoder + +import math +import numpy + + +class Transformer(nn.Module): + """ + Model from "A detailed guide to Pytorch's nn.Transformer() module.", by + Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1 + """ + # Constructor + def __init__( + self, + num_tokens, + dim_model, + num_heads, + num_encoder_layers, + dropout_p, + ): + super().__init__() + + # INFO + self.model_type = "Transformer" + self.dim_model = dim_model + + # LAYERS + self.positional_encoder = encoder.PositionalEncoding( + dim_model=dim_model, dropout_p=dropout_p, max_len=5000 + ) + self.embedding = nn.Embedding(num_tokens, dim_model) + #self.transformer = nn.Transformer( + # d_model=dim_model, + # nhead=num_heads, + # num_encoder_layers=num_encoder_layers, + # num_decoder_layers=num_decoder_layers, + # dropout=dropout_p, + #) + + encoder_layer = nn.TransformerEncoderLayer( + d_model=dim_model, + nhead=num_heads, + #dim_feedforward=num_encoder_layers, + dropout=dropout_p, + ) + self.transformer_encoder = nn.TransformerEncoder( + encoder_layer, + num_layers=num_encoder_layers, + ) + # binary classification + self.out = nn.Linear(dim_model, 1) + # torch.nn.init.xavier_uniform(self.out.weight) + # self.out.bias.data.fill_(numpy.log(0.25)) + + + def forward(self, src): #, tgt_mask=None, src_pad_mask=None, tgt_pad_mask=None): + # Src size must be (batch_size, src sequence length) + # Tgt size must be (batch_size, tgt sequence length) + + # Embedding + positional encoding - Out size = (batch_size, sequence length, dim_model) + src = self.embedding(src) * math.sqrt(self.dim_model) + # tgt = self.embedding(tgt) * math.sqrt(self.dim_model) + src = self.positional_encoder(src) + # tgt = self.positional_encoder(tgt) + + # We could use the parameter batch_first=True, but our KDL version doesn't support it yet, so we permute + # to obtain size (sequence length, batch_size, dim_model), + # src = src.permute(1,0,2) + # tgt = tgt.permute(1,0,2) + + # Transformer blocks - Out size = (sequence length, batch_size, num_tokens) + transformer_out = self.transformer_encoder(src) #tgt_mask=tgt_mask, src_key_padding_mask=src_pad_mask, tgt_key_padding_mask=tgt_pad_mask) + out = 
transformer_out.mean(dim=1)
+
+        out = self.out(out)
+
+        return out
+
+    def get_tgt_mask(self, size) -> torch.tensor:
+        # Generates a square matrix where each row allows one word more to be seen
+        mask = torch.tril(torch.ones(size, size) == 1) # Lower triangular matrix
+        mask = mask.float()
+        mask = mask.masked_fill(mask == 0, float('-inf')) # Convert zeros to -inf
+        mask = mask.masked_fill(mask == 1, float(0.0)) # Convert ones to 0
+
+        # EX for size=5:
+        # [[0., -inf, -inf, -inf, -inf],
+        # [0., 0., -inf, -inf, -inf],
+        # [0., 0., 0., -inf, -inf],
+        # [0., 0., 0., 0., -inf],
+        # [0., 0., 0., 0., 0.]]
+
+        return mask
+
+    def create_pad_mask(self, matrix: torch.tensor, pad_token: int) -> torch.tensor:
+        # If matrix = [1,2,3,0,0,0] where pad_token=0, the result mask is
+        # [False, False, False, True, True, True]
+        return (matrix == pad_token)
\ No newline at end of file
diff --git a/teachopencadd/talktorials/T039_molecular_transformers/data/qm9.csv.gz b/teachopencadd/talktorials/T039_molecular_transformers/data/qm9.csv.gz
new file mode 100644
index 00000000..97a7ce02
Binary files /dev/null and b/teachopencadd/talktorials/T039_molecular_transformers/data/qm9.csv.gz differ
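A minimal usage sketch for tokenizer.py, assuming the code/ directory is on the Python path; the example SMILES are made up and not taken from qm9.csv.gz. Note that, despite its name, smiles_to_ohe returns a list of integer token ids, since the one-hot line is commented out.

# Sketch: exercising the tokenizer pieces in isolation.
from tokenizer import SmilesTokenizer, build_vocab, smiles_to_ohe

example_smiles = ["CCO", "c1ccccc1", "CC(=O)N"]   # hypothetical inputs

token = SmilesTokenizer()
print(token.tokenize("CC(=O)N"))                  # ['C', 'C', '(', '=', 'O', ')', 'N']

vocab = build_vocab(example_smiles, token, max_vocab_size=30)
print(vocab)                                      # token -> index, order depends on token counts

ids = smiles_to_ohe("CCO", token, vocab)          # list of integer token ids
print(ids)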
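PositionalEncoding in encoder.py fills the sin/cos table with a nested Python loop; the commented-out lines in the same file point to the usual vectorized form. A sketch of that equivalent, assuming dim_model is even (main.py uses dim_model=100):

import math
import torch

def build_pos_encoding(dim_model: int, max_len: int) -> torch.Tensor:
    # Same values as the nested loop in PositionalEncoding.__init__, computed in one shot.
    position = torch.arange(max_len).unsqueeze(1)                     # (max_len, 1)
    division_term = torch.exp(
        torch.arange(0, dim_model, 2) * (-math.log(10000.0) / dim_model)
    )                                                                 # (dim_model / 2,)
    pos_encoding = torch.zeros(max_len, 1, dim_model)
    pos_encoding[:, 0, 0::2] = torch.sin(position * division_term)    # PE(pos, 2i)
    pos_encoding[:, 0, 1::2] = torch.cos(position * division_term)    # PE(pos, 2i + 1)
    return pos_encoding

# Quick check against a hand-computed entry: PE(1, 0) = sin(1) ≈ 0.8415
print(build_pos_encoding(dim_model=100, max_len=5000)[1, 0, 0])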
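data.add_padding is currently unused, and pad_sequence expects a list of 1-D tensors rather than a single tensor built from a ragged list. A small sketch of the intended call under that assumption, reusing the padding_token=30 convention from batchify_data:

import torch
from torch.nn.utils.rnn import pad_sequence

# Hypothetical ragged token-id sequences, e.g. the X parts produced by generate_dataset.
sequences = [torch.tensor([2, 5, 7, 3]), torch.tensor([2, 5, 3])]

padded = pad_sequence(sequences, batch_first=True, padding_value=30)  # 30 = padding_token
print(padded)
# tensor([[ 2,  5,  7,  3],
#         [ 2,  5,  3, 30]])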
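nn.TransformerEncoderLayer defaults to sequence-first (seq_len, batch, dim_model) inputs, while Transformer.forward feeds it batch-first embeddings (the permute that would reconcile the two is commented out). One way to make the layout agree with the mean(dim=1) pooling is the batch_first flag available in recent PyTorch releases; a sketch with the hyperparameters from main.py:

import torch.nn as nn

dim_model, num_heads, num_encoder_layers, dropout_p = 100, 4, 3, 0.2  # values from main.py

encoder_layer = nn.TransformerEncoderLayer(
    d_model=dim_model,
    nhead=num_heads,
    dropout=dropout_p,
    batch_first=True,   # inputs and outputs are (batch, seq_len, dim_model)
)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

With batch-first tensors, the positional table in encoder.py would then be sliced along the sequence axis, e.g. token_embedding + self.pos_encoding[: token_embedding.size(1)].transpose(0, 1), so that positions broadcast over the batch rather than over the sequence.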
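transformer.py defines create_pad_mask, but forward neither passes a key-padding mask to the encoder nor excludes padding tokens from the mean pooling, so padded positions contribute to the molecule embedding. A hedged sketch of padding-aware pooling, assuming batch-first tensors and the padding_token=30 from data.batchify_data; masked_mean_pool and src_ids are hypothetical names:

import torch

PAD_ID = 30  # padding_token default in data.batchify_data

def masked_mean_pool(encoder_out: torch.Tensor, src_ids: torch.Tensor) -> torch.Tensor:
    # encoder_out: (batch, seq_len, dim_model), src_ids: (batch, seq_len) token ids
    keep = (src_ids != PAD_ID).unsqueeze(-1).float()   # 1.0 for real tokens, 0.0 for padding
    summed = (encoder_out * keep).sum(dim=1)
    counts = keep.sum(dim=1).clamp(min=1.0)            # avoid division by zero
    return summed / counts

# Inside Transformer.forward this could be combined with the existing helper:
#   pad_mask = self.create_pad_mask(src_ids, pad_token=PAD_ID)   # True where padded
#   transformer_out = self.transformer_encoder(src, src_key_padding_mask=pad_mask)
#   out = masked_mean_pool(transformer_out, src_ids)

Since nn.Embedding(num_tokens, dim_model) only accepts ids 0..num_tokens-1, a padding id equal to the vocabulary size would also need one extra embedding row (or a padding_idx reserved inside the vocabulary).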
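In main.py the device check reads torch.backends.mps.is_available without calling it, so the condition sees the bound function (always truthy) rather than its return value. A sketch of a fallback chain that calls both checks, assuming a PyTorch build recent enough to expose torch.backends.mps; the CUDA branch mirrors the commented-out line in main.py:

import torch

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = "mps"
else:
    device = "cpu"
print(device)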
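main.py standardizes the dipole moment mu with the training-set mean and standard deviation, so the reported losses and the plot_targets scatter are in normalized units. If values on the original scale (Debye for QM9) are wanted, the same statistics can invert the transform; a two-line sketch assuming y_mean and y_std are still in scope alongside the arrays returned by inference.test_loop:

# Undo the (y - y_mean) / y_std standardization applied in main.py.
predictions_debye = predictions * y_std + y_mean
ground_truth_debye = ground_truth * y_std + y_mean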