PetriNet2Vec.py
import numpy as np
import pm4py
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


class PetriNet2Vec():
    r"""An implementation of the "PetriNet2Vec" algorithm for converting Petri Nets (PN models) from pm4py, stored in .pnml files, into embedding vectors. These embeddings capture structural information about the Petri Nets, representing each Petri Net as a vector in the embedding space.

    Args:
    - embedding_dim (int): The size of the embedding dimension.
    - negative (int, optional): As in the word2vec approach, the number of negative samples to use during training.
    - seed (int, optional): Random seed. In Python 3, reproducibility between launches also requires setting the PYTHONHASHSEED environment variable.
    - black_transitions (bool, optional): If set to True, all black-box (unlabeled) transitions are represented by the token 'None'.
    - workers (int, optional): The number of parallel jobs to launch during training.

    Methods:
    - fit(petriNets: list, epochs: int):
        Fit a PetriNet2Vec model on a list of Petri Nets in pm4py format.
    - fine_tune(petriNets: list, epochs: int):
        Continue training a fitted PetriNet2Vec model on a new list of Petri Nets.
    - get_net_embeddings() -> np.array:
        Retrieve the Petri Net embeddings generated by the fitted PetriNet2Vec model.
    - get_task_embeddings() -> np.array:
        Retrieve the task embeddings generated by the fitted PetriNet2Vec model.
    - similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
        Compute the cosine similarity between two embedding vectors.
    - infer_vector(net: pm4py.objects.petri_net.obj.PetriNet, epochs: int = None) -> np.array:
        Infer an embedding vector for a new Petri Net using the pre-trained PetriNet2Vec model.
    - save_model(name: str):
        Save the trained PetriNet2Vec model to a file.
    - load_model_from(name: str):
        Load a pre-trained PetriNet2Vec model from a file.

    Dependencies:
    - numpy (np): Numerical computing library for efficient array operations.
    - pm4py: Process Mining for Python, used for working with Petri Nets and process-related data.
    - gensim.models.doc2vec: Doc2Vec model from the Gensim library, used for training and generating embeddings.

    Example:
        >>> net_1, _, _ = pm4py.read_pnml("example1.pnml")
        >>> net_2, _, _ = pm4py.read_pnml("example2.pnml")
        >>> petri_net2vec = PetriNet2Vec(embedding_dim=100)
        >>> petri_net2vec.fit([net_1, net_2], epochs=10)
        >>> embeddings = petri_net2vec.get_net_embeddings()
        >>> print(embeddings)
        [[0.1, 0.2, ..., 0.99], [0.3, 0.4, ..., 0.95]]

    Author:
        Dr. Juan G. Colonna <[email protected]>
    """

    def __init__(self,
                 embedding_dim:int=2,
                 negative:int=5,
                 workers:int=4,
                 black_transitions:bool=True,
                 seed:int=None):
        self.embedding_dim = embedding_dim  # The size of the embedding dimension.
        self.workers = workers              # The number of parallel jobs during training.
        self.negative = negative            # The number of negative samples used during training.
        self.seed = seed                    # The seed for reproducibility.
        self._fitted = False                # Tracks whether the model has been fitted at least once.
        self._task_keys = {}                # Dictionary of task keys, useful for querying task embeddings.
        self._net_keys = {}                 # Dictionary of Petri Net keys, useful for querying net embeddings.
        self.black_transitions = black_transitions  # Whether to use the token "None" for all black-box transitions.
        self._embeddings = []               # Internal storage for generated embeddings.
        self.model = Doc2Vec(vector_size = self.embedding_dim,
                             window = 2,    # The number of tasks allowed in the context. A small window keeps only nearby tasks in the context of task_i, aiming to capture the relation between the current task and the following task.
                             min_count = 1, # The minimum frequency a task must have to be included in the token dictionary.
                             negative = self.negative,
                             workers = self.workers,
                             seed = self.seed,
                             dm = 1)        # Defines the training algorithm. dm=1 ('distributed memory', PV-DM) is recommended. Be aware that dm=0 does not learn task embeddings!
        self.model.init_weights()  # Initialize random weights.

    def __petriNets2docs(self, nets:list) -> list:
        r"""Private method: Parse a list of Petri Nets (in pm4py format) into a list of TaggedDocuments representing pairs of tasks (transitions).

        Args:
            nets (list of pm4py.objects.petri_net.obj.PetriNet): A list of Petri Nets in pm4py format.

        Example:
            Processes a list of Petri Nets, extracting pairs of consecutive transitions and creating a list of documents, where each document is formed by several pairs of tokens and 'tags' is the document id. Each token pair represents the transition from task_i to task_j. For example:
            [
             TaggedDocument(words=['t13', 't20'], tags=['0']),
             TaggedDocument(words=['None', 't20'], tags=['0']),
             TaggedDocument(words=['t6', 'None'], tags=['1']),
             TaggedDocument(words=['t6', 't7'], tags=['1']),
             TaggedDocument(words=['t9', 't10'], tags=['2']),
             TaggedDocument(words=['t11', 't12'], tags=['2']),
             TaggedDocument(words=['None', 't18'], tags=['2']),
             ...
            ]
        """
        for i, net in enumerate(nets):
            Transitions = [arc for arc in net.arcs if type(arc.source) == pm4py.objects.petri_net.obj.PetriNet.Transition]
            Places = [arc for arc in net.arcs if type(arc.source) == pm4py.objects.petri_net.obj.PetriNet.Place]
            for transition in Transitions:
                for place in Places:
                    if place.source == transition.target:
                        if self.black_transitions:
                            left = 'None' if transition.source.label is None else transition.source.label
                            right = 'None' if place.target.label is None else place.target.label
                        else:
                            left = transition.source.name.split('_')[0] if transition.source.label is None else transition.source.label
                            right = place.target.name.split('_')[0] if place.target.label is None else place.target.label
                        if left != right:
                            self._documents.append(TaggedDocument(words=[left, right], tags=[str(i)]))

    def __build_vocabulary(self, petriNets:list):
        r"""Private method: Convert the Petri Nets into documents and build the Doc2Vec vocabulary.

        Args:
            petriNets (list): A list of Petri Nets in pm4py format.
        """
        self._documents = []  # Internal storage for processed Petri Nets.
        # Convert Petri Nets to documents
        self.__petriNets2docs(petriNets)
        # Build the vocabulary for the model
        self.model.build_vocab(corpus_iterable = self._documents, update=False)

    def fit(self, petriNets:list, epochs:int):
        r"""Fit a PetriNet2Vec model on a list of Petri Nets in pm4py format.

        Args:
            petriNets (list): A list of Petri Nets in pm4py format.
            epochs (int): The number of training iterations to perform.

        This method fits a PetriNet2Vec model by processing a list of Petri Nets and generating embeddings for each Petri Net. The embeddings capture structural information based on the transitions between the Petri Net tasks.
        If the model has already been fitted (`self._fitted` is True), this method continues training for additional epochs to update the model embeddings.

        Example:
            >>> net_1, _, _ = pm4py.read_pnml("example1.pnml")
            >>> net_2, _, _ = pm4py.read_pnml("example2.pnml")
            >>> petri_nets = [net_1, net_2]
            >>> petri_net2vec = PetriNet2Vec(embedding_dim=100)
            >>> petri_net2vec.fit(petri_nets, epochs=100)
        """
        if not self._fitted:
            self.__build_vocabulary(petriNets)
            self._task_keys = self.model.wv.key_to_index
            self._net_keys = self.model.dv.key_to_index
        self.model.train(corpus_iterable = self._documents,
                         epochs = epochs,
                         total_examples = self.model.corpus_count,
                         compute_loss=True)
        self.model.update_weights()
        # Update embeddings and set the model as fitted
        self._embeddings = [self.model.dv[key] for key in self.model.dv.index_to_key]
        self._fitted = True

    def get_net_embeddings(self, petriNet_key:str=None) -> np.array:
        r"""Retrieve the embeddings of Petri Nets generated by the fitted PetriNet2Vec model.

        Args:
            petriNet_key (str, optional): A string with the key of the Petri Net. The list of keys is available via the attribute "model._net_keys".

        Returns:
            np.array: A numpy array containing the embeddings of Petri Nets.

        This method retrieves the Petri Net embeddings generated by the fitted PetriNet2Vec model. The embeddings capture structural information about the Petri Nets, representing each Petri Net as a row vector in the embedding space.

        Example:
            >>> net_1, _, _ = pm4py.read_pnml("example1.pnml")
            >>> net_2, _, _ = pm4py.read_pnml("example2.pnml")
            >>> petri_net2vec = PetriNet2Vec(embedding_dim=100)
            >>> petri_net2vec.fit([net_1, net_2], epochs=10)
            >>> embeddings = petri_net2vec.get_net_embeddings()
            >>> print(embeddings)
            [[0.1, 0.2, ..., 0.99], [0.3, 0.4, ..., 0.95]]
        """
        if petriNet_key is None:
            return self.model.dv.vectors
        else:
            return self.model.dv[petriNet_key]

    def get_task_embeddings(self, task_key:str=None) -> np.array:
        r"""Retrieve the embeddings of tasks generated by the fitted PetriNet2Vec model.

        Args:
            task_key (str, optional): A string with the key of the task. The list of keys is available via the attribute "model._task_keys".

        Returns:
            np.array: A numpy array containing the embeddings of tasks.

        This method retrieves the task embeddings generated by the fitted PetriNet2Vec model. The embeddings capture semantic information about the tasks, representing each task as a row vector in the embedding space.

        Example:
            >>> net_1, _, _ = pm4py.read_pnml("example1.pnml")
            >>> net_2, _, _ = pm4py.read_pnml("example2.pnml")
            >>> petri_net2vec = PetriNet2Vec(embedding_dim=100)
            >>> petri_net2vec.fit([net_1, net_2], epochs=10)
            >>> embeddings = petri_net2vec.get_task_embeddings()
            >>> print(embeddings)
            [[0.1, 0.2, ..., 0.99], [0.3, 0.4, ..., 0.95]]
        """
        if task_key is None:
            return self.model.wv.vectors
        else:
            return self.model.wv[task_key]

    def similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        r"""Compute the cosine similarity between two embedding vectors.

        Args:
            vec1 (numpy array): The first embedding vector.
            vec2 (numpy array): The second embedding vector.

        Returns:
            float: The cosine similarity score between the two embeddings.

        This method calculates the cosine similarity between two embedding vectors. Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space. It is computed as the dot product of the normalized vectors.

        Example:
            >>> petri_net2vec = PetriNet2Vec(embedding_dim=100)
            >>> petri_net2vec.fit([net1, net2], epochs=100)
            >>> embeddings = petri_net2vec.get_net_embeddings()
            >>> similarity_score = petri_net2vec.similarity(embeddings[0], embeddings[1])
            >>> print(similarity_score)
            0.85

        Raises:
            AssertionError: If either of the input embedding vectors has a zero norm.
        """
        norm_vec1 = np.linalg.norm(vec1)
        assert norm_vec1 > 0, "Problem: First embedding vector with zero norm"
        norm_vec2 = np.linalg.norm(vec2)
        assert norm_vec2 > 0, "Problem: Second embedding vector with zero norm"
        return np.dot(vec1/norm_vec1, vec2/norm_vec2)

    def infer_vector(self, net:pm4py.objects.petri_net.obj.PetriNet, epochs:int=None) -> np.array:
        r"""Infer an embedding vector for a new Petri Net using the pre-trained PetriNet2Vec model.

        Args:
            net (pm4py.objects.petri_net.obj.PetriNet): The Petri Net for which the embedding is to be inferred.
            epochs (int, optional): The number of iterations to perform. Higher values increase training time but might enhance the quality and consistency of inferred vectors across runs. If unspecified, the epochs value from model initialization is reused.

        Returns:
            np.array: The inferred embedding vector for the new Petri Net.

        This method uses the pre-trained PetriNet2Vec model to infer an embedding vector for a new Petri Net. The inference process relies on the structural information captured during training and generates a vector representation for the provided Petri Net.

        Example:
            >>> petri_net2vec = PetriNet2Vec(embedding_dim=100)
            >>> petri_net2vec.fit([net1, net2], epochs=100)
            >>> net3, _, _ = pm4py.read_pnml("example3.pnml")
            >>> new_embedding = petri_net2vec.infer_vector(net3)
            >>> print(new_embedding)
            [0.1, 0.2, ..., 0.99]
        """
        doc = []
        Transitions = [arc for arc in net.arcs if type(arc.source) == pm4py.objects.petri_net.obj.PetriNet.Transition]
        Places = [arc for arc in net.arcs if type(arc.source) == pm4py.objects.petri_net.obj.PetriNet.Place]
        for transition in Transitions:
            for place in Places:
                if place.source == transition.target:
                    if self.black_transitions:
                        left = 'None' if transition.source.label is None else transition.source.label
                        right = 'None' if place.target.label is None else place.target.label
                    else:
                        left = transition.source.name.split('_')[0] if transition.source.label is None else transition.source.label
                        right = place.target.name.split('_')[0] if place.target.label is None else place.target.label
                    if left != right:
                        doc.append(left)
                        doc.append(right)
        return self.model.infer_vector(doc, alpha=None, min_alpha=None, epochs=epochs)

    def save_model(self, name:str):
        r"""Save the trained PetriNet2Vec model to a file.

        Args:
            name (str): The name of the file in which to save the model.

        Example:
            >>> model = PetriNet2Vec(embedding_dim=2, seed=42)
            >>> model.fit(petri_nets, epochs=100)
            >>> model.save_model("pnml2vec.model")
        """
        self.model.save(name)

    def load_model_from(self, name:str):
        r"""Load a pre-trained PetriNet2Vec model from a file.

        Args:
            name (str): The name of the file containing the pre-trained model.

        Example:
            >>> model = PetriNet2Vec()
            >>> model.load_model_from("pnml2vec.model")
        """
        self.model = Doc2Vec.load(name)
        self.embedding_dim = self.model.vector_size
        self.negative = self.model.negative
        self.workers = self.model.workers
        self.seed = self.model.seed
        self._fitted = False  # Important: set it to False here!
        self._task_keys = self.model.wv.key_to_index
        self._net_keys = self.model.dv.key_to_index

    def fine_tune(self, petriNets:list, epochs:int):
        r"""Fine-tune a PetriNet2Vec model on a new list of Petri Nets in pm4py format. The model must be fitted before calling this method.

        Args:
            petriNets (list): A new list of Petri Nets in pm4py format.
            epochs (int): The number of training iterations.

        This method fine-tunes a pre-trained PetriNet2Vec model by processing a new list of Petri Nets and generating embeddings for each Petri Net in the list.

        Example:
            >>> petri_net2vec.fit(petri_nets, epochs=100)
            >>> net_3, _, _ = pm4py.read_pnml("example3.pnml")
            >>> petri_net2vec.fine_tune([net_3], epochs=100)
            >>> petri_net2vec.get_net_embeddings()
        """
        assert self._fitted, 'Pre-train the model first by calling fit().'
        # Convert the new Petri Nets to new documents, continuing the tag numbering from the existing corpus
        new_documents = []
        new_examples = int(self._documents[-1].tags[0])
        for i, net in enumerate(petriNets):
            new_examples += 1
            Transitions = [arc for arc in net.arcs if type(arc.source) == pm4py.objects.petri_net.obj.PetriNet.Transition]
            Places = [arc for arc in net.arcs if type(arc.source) == pm4py.objects.petri_net.obj.PetriNet.Place]
            for transition in Transitions:
                for place in Places:
                    if place.source == transition.target:
                        if self.black_transitions:
                            left = 'None' if transition.source.label is None else transition.source.label
                            right = 'None' if place.target.label is None else place.target.label
                        else:
                            left = transition.source.name.split('_')[0] if transition.source.label is None else transition.source.label
                            right = place.target.name.split('_')[0] if place.target.label is None else place.target.label
                        if left != right:
                            new_documents.append(TaggedDocument(words=[left, right], tags=[str(new_examples)]))
        # Update the vocabulary with the new documents and continue training on the full corpus
        self.model.build_vocab(corpus_iterable = new_documents, update=True)
        self._documents += new_documents
        self.model.train(corpus_iterable = self._documents,
                         epochs = epochs,
                         total_examples = len(self._documents),
                         compute_loss=True)
        self.model.update_weights()
        self._task_keys = self.model.wv.key_to_index
        self._net_keys = self.model.dv.key_to_index
        # Update embeddings for all Petri Nets seen so far
        self._embeddings = [self.model.dv[key] for key in self.model.dv.index_to_key]
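

# The block below is a minimal usage sketch, not part of the library itself. It assumes two
# placeholder files, "example1.pnml" and "example2.pnml" (the same hypothetical names used in the
# docstring examples above), exist in the working directory; replace them with your own .pnml paths.
if __name__ == "__main__":
    # Load two Petri Nets exported in PNML format.
    net_1, _, _ = pm4py.read_pnml("example1.pnml")
    net_2, _, _ = pm4py.read_pnml("example2.pnml")

    # Train a small PetriNet2Vec model on both nets.
    model = PetriNet2Vec(embedding_dim=8, seed=42)
    model.fit([net_1, net_2], epochs=100)

    # Inspect the learned Petri Net and task embeddings.
    net_embeddings = model.get_net_embeddings()
    task_embeddings = model.get_task_embeddings()
    print("net embeddings shape:", np.asarray(net_embeddings).shape)
    print("task embeddings shape:", np.asarray(task_embeddings).shape)

    # Cosine similarity between the two trained net embeddings.
    print("similarity(net_1, net_2):", model.similarity(net_embeddings[0], net_embeddings[1]))

    # Persist the model and reload it later to infer vectors for (possibly unseen) nets.
    model.save_model("pnml2vec.model")
    reloaded = PetriNet2Vec(embedding_dim=8)
    reloaded.load_model_from("pnml2vec.model")
    print("inferred vector for net_1:", reloaded.infer_vector(net_1, epochs=50))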