# embeddings-generation/02_transe_kge_training.py

#script to calculate TransE embeddings for the entities and relations of the linkedpaperswithcode dataset
import torch
from torch_geometric.data import Dataset, download_url, Data
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric import seed_everything
from torch_geometric.nn import TransE
import torch
import torch.optim as optim
import time
import pandas as pd
import numpy as np

#path to the output evaluation file
with open(".../transe-kge-summary.txt", "w",) as z:
    start_time = time.ctime()
    z.write(f'Calculate embeddings with TransE started at: {start_time}\n')
    z.flush()
    print('Calculate embeddings with TransE started at: ', start_time)

    # Column-wise storage of the triples: the first, second and third integer
    # of every input line (used below as head, relation type and tail).
    first_numbers = []
    second_numbers = []
    third_numbers = []

    #input dataset
    with open('.../triples.txt', 'r') as file:
        for line in file:
            numbers = line.split()
            if not numbers:
                # Skip blank lines (e.g. a trailing empty line) instead of
                # failing with an IndexError.
                continue
            first_numbers.append(int(numbers[0]))
            second_numbers.append(int(numbers[1]))
            third_numbers.append(int(numbers[2]))

    if not first_numbers:
        raise ValueError('The triples file does not contain any triples.')

    # Convert the lists into PyTorch tensors
    first_tensor = torch.tensor(first_numbers)
    second_tensor = torch.tensor(second_numbers)
    third_tensor = torch.tensor(third_numbers)

    # Stack heads and tails into a single 2 x num_triples tensor
    combined_tensor = torch.stack((first_tensor, third_tensor))

    data = Data(edge_index=combined_tensor,
                edge_type=second_tensor,
                num_nodes=combined_tensor.max().item() + 1,
                )

    # Relation count is derived from ALL triples so that a relation occurring
    # only in the validation/test split still gets an embedding row.
    num_relations = int(second_tensor.max()) + 1

    seed_everything(1)

    # Split the triples into disjoint 80/10/10 train/val/test subsets.
    # NOTE: RandomLinkSplit is deliberately not used here. It keeps the
    # message-passing edges in `edge_index` of the val/test outputs (training
    # edges resp. training + validation edges) and puts the held-out edges in
    # `edge_label_index`; evaluating on `edge_index` would therefore measure
    # performance on training triples.
    num_triples = combined_tensor.size(1)
    perm = torch.randperm(num_triples)
    num_val = int(0.1 * num_triples)
    num_test = int(0.1 * num_triples)
    num_train = num_triples - num_val - num_test

    def _triple_subset(index):
        # Build a Data object holding only the triples selected by `index`.
        return Data(
            edge_index=combined_tensor[:, index],
            edge_type=second_tensor[index],
            num_nodes=data.num_nodes,
        )

    train_data = _triple_subset(perm[:num_train])
    val_data = _triple_subset(perm[num_train:num_train + num_val])
    test_data = _triple_subset(perm[num_train + num_val:])

    # Prefer the first GPU, but fall back to the CPU when CUDA is unavailable.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    z.write(f'Using device: {device}\n')
    z.flush()
    print('Using device:', device)

    model = TransE(
        num_nodes=data.num_nodes,
        num_relations=num_relations,
        hidden_channels=256,
    ).to(device)

    loader = model.loader(
        head_index=train_data.edge_index[0].to(device),
        rel_type=train_data.edge_type.to(device),
        tail_index=train_data.edge_index[1].to(device),
        batch_size=2000,
        shuffle=True,
    )

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    def train():
        """Run one pass over ``loader`` and return the mean training loss.

        Every batch loss is weighted by the number of head indices in that
        batch, so the result is an average per example rather than per batch.
        """
        model.train()
        loss_sum = 0
        examples_seen = 0
        for heads, rels, tails in loader:
            optimizer.zero_grad()
            batch_loss = model.loss(heads, rels, tails)
            batch_loss.backward()
            optimizer.step()

            batch_size = heads.numel()
            loss_sum += batch_loss.item() * batch_size
            examples_seen += batch_size
        return loss_sum / examples_seen


    @torch.no_grad()
    def test(val_data, k):
        """Evaluate the model on the triples stored in ``val_data``.

        Args:
            val_data: Object exposing ``edge_index`` (row 0 is used as heads,
                row 1 as tails) and ``edge_type`` (relation ids).
            k: Cut-off forwarded to ``model.test`` as ``k``.

        Returns:
            Whatever ``model.test`` returns for these triples.
        """
        model.eval()
        triples = {
            'head_index': val_data.edge_index[0].to(device),
            'rel_type': val_data.edge_type.to(device),
            'tail_index': val_data.edge_index[1].to(device),
        }
        return model.test(batch_size=20000, k=k, **triples)


    # Train for 900 epochs and log the mean loss of every epoch.
    for epoch in range(1, 901):
        loss = train()
        z.write(f'Epoch: {epoch:03d}, Loss: {loss:.4f}\n')
        z.flush()

        # Uncomment if you want to evaluate the training every 300 epochs
        # based on the validation set (e.g., for early stopping):
        # if epoch % 300 == 0:
        #     rank, *_, hits = test(val_data, k=10)
        #     z.write(f'Epoch: {epoch:03d}, Val Mean Rank: {rank:.2f}, Val Hits@10: {hits:.4f}\n')
        #     z.flush()

    # Final evaluation on the test split, once per cut-off value.
    k_values = [1, 3, 10]
    results = {k: test(test_data, k) for k in k_values}

    for k in k_values:
        # Depending on the torch_geometric version, the evaluation returns
        # either (mean_rank, hits_at_k) or (mean_rank, mrr, hits_at_k).
        # Taking the first and last element works for both layouts.
        rank, *_, hits_at_k = results[k]
        z.write(f'Test Mean Rank: {rank:.2f}, Test Hits@{k}: {hits_at_k:.4f}\n')
        z.flush()

    end_time = time.ctime()
    z.write(f'Calculate embeddings with TransE end at: {end_time}\n')
    z.flush()
    print('Calculate embeddings with TransE end at: ', end_time)

print('Done training')


# Export the trained TransE embeddings (entities and relations) as CSV files.
# Each weight matrix is detached from autograd and moved to the CPU before it
# is converted to a NumPy array.
entity_weights = model.node_emb.weight
entity_embeddings = entity_weights.detach().cpu().numpy()

# path to the output entity embeddings file
np.savetxt(
    '.../entity_embeddings_transe.csv',
    entity_embeddings,
    delimiter=',',
)

relation_weights = model.rel_emb.weight
relation_embeddings = relation_weights.detach().cpu().numpy()

# path to the output relation embeddings file
np.savetxt(
    '.../relation_embeddings_transe.csv',
    relation_embeddings,
    delimiter=',',
)


print('Done')