PetriNet2Vec.py
import numpy as np
import pm4py
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


class PetriNet2Vec():
    r"""An implementation of the "PetriNet2Vec" algorithm for converting Petri Nets (PN models) from pm4py, stored in .pnml files, into embedding vectors. These embeddings capture structural information about the Petri Nets, representing each Petri Net as a vector in the embedding space.

    Args:
    - embedding_dim (int): The size of the embedding dimension.
    - negative (int, optional): As in the word2vec approach, the number of negative samples to use during training.
    - seed (int, optional): Random seed. In Python 3, reproducibility between launches also requires setting the PYTHONHASHSEED environment variable.
    - black_transitions (bool, optional): If set to True, all black-box (unlabeled) transitions are represented by the token 'None'.
    - workers (int, optional): The number of parallel jobs to launch during training.

    Methods:
    - fit(petriNets: list, epochs: int):
        Fit a PetriNet2Vec model on a list of Petri Nets in pm4py format.
    - fine_tune(petriNets: list, epochs: int):
        Continue training a fitted PetriNet2Vec model on a new list of Petri Nets.
    - get_net_embeddings() -> np.array:
        Retrieve the Petri Net embeddings generated by the fitted PetriNet2Vec model.
    - get_task_embeddings() -> np.array:
        Retrieve the task embeddings generated by the fitted PetriNet2Vec model.
    - similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
        Compute the cosine similarity between two embedding vectors.
    - infer_vector(net: pm4py.objects.petri_net.obj.PetriNet, epochs: int = None) -> np.array:
        Infer an embedding vector for a new Petri Net using the pre-trained PetriNet2Vec model.
    - save_model(name: str):
        Save the trained PetriNet2Vec model to a file.
    - load_model_from(name: str):
        Load a pre-trained PetriNet2Vec model from a file.

    Dependencies:
    - numpy (np): Numerical computing library for efficient array operations.
    - pm4py: Process Mining for Python, used for working with Petri Nets and process-related data.
    - gensim.models.doc2vec: Doc2Vec model from the Gensim library, used for training and generating embeddings.

    Example:
        >>> net_1, _, _ = pm4py.read_pnml("example1.pnml")
        >>> net_2, _, _ = pm4py.read_pnml("example2.pnml")
        >>> petri_net2vec = PetriNet2Vec(embedding_dim=100)
        >>> petri_net2vec.fit([net_1, net_2], epochs=10)
        >>> embeddings = petri_net2vec.get_net_embeddings()
        >>> print(embeddings)
        [[0.1, 0.2, ..., 0.99], [0.3, 0.4, ..., 0.95]]

    Author:
        Dr. Juan G. Colonna <[email protected]>
    """

    def __init__(self,
                 embedding_dim:int=2,
                 negative:int=5,
                 workers:int=4,
                 black_transitions:bool=True,
                 seed:int=None):
        self.embedding_dim = embedding_dim  # The size of the embedding dimension.
        self.workers = workers              # The number of parallel jobs during training.
        self.negative = negative            # The number of negative samples used during training.
        self.seed = seed                    # The seed for reproducibility.
        self._fitted = False                # Tracks whether the model has been fitted at least once.
        self._task_keys = {}                # Dictionary of task keys, useful for querying task embeddings.
        self._net_keys = {}                 # Dictionary of Petri Net keys, useful for querying net embeddings.
        self.black_transitions = black_transitions  # Whether to use the token "None" for all black-box transitions.
        self._embeddings = []               # Internal storage for generated embeddings.
        self.model = Doc2Vec(vector_size = self.embedding_dim,
                             window = 2,    # The number of tasks allowed in the context. A small window keeps only nearby tasks in the context of task_i, aiming to capture the relation between the current task and the following task.
                             min_count = 1, # The minimum frequency a task must have to be included in the token dictionary.
                             negative = self.negative,
                             workers = self.workers,
                             seed = self.seed,
                             dm = 1)        # Defines the training algorithm. dm=1 ('distributed memory', PV-DM) is recommended. Be aware that dm=0 does not learn task embeddings!
        self.model.init_weights()  # Initialize random weights.

    def __petriNets2docs(self, nets:list) -> list:
        r"""Private method: Parse a list of Petri Nets (in pm4py format) into a list of TaggedDocuments representing pairs of tasks (transitions).

        Args:
            nets (list of pm4py.objects.petri_net.obj.PetriNet): A list of Petri Nets in pm4py format.

        Example:
            Processes a list of Petri Nets, extracting pairs of consecutive transitions and creating a list of documents, where each document is formed by several pairs of tokens and 'tags' is the document id. Each token pair represents the transition from task_i to task_j. For example:
            [
             TaggedDocument(words=['t13', 't20'], tags=['0']),
             TaggedDocument(words=['None', 't20'], tags=['0']),
             TaggedDocument(words=['t6', 'None'], tags=['1']),
             TaggedDocument(words=['t6', 't7'], tags=['1']),
             TaggedDocument(words=['t9', 't10'], tags=['2']),
             TaggedDocument(words=['t11', 't12'], tags=['2']),
             TaggedDocument(words=['None', 't18'], tags=['2']),
             ...
            ]
        """
        for i, net in enumerate(nets):
            Transitions = [arc for arc in net.arcs if type(arc.source) == pm4py.objects.petri_net.obj.PetriNet.Transition]
            Places = [arc for arc in net.arcs if type(arc.source) == pm4py.objects.petri_net.obj.PetriNet.Place]
            for transition in Transitions:
                for place in Places:
                    if place.source == transition.target:
                        if self.black_transitions:
                            left = 'None' if transition.source.label is None else transition.source.label
                            right = 'None' if place.target.label is None else place.target.label
                        else:
                            left = transition.source.name.split('_')[0] if transition.source.label is None else transition.source.label
                            right = place.target.name.split('_')[0] if place.target.label is None else place.target.label
                        if left != right:
                            self._documents.append(TaggedDocument(words=[left, right], tags=[str(i)]))

    def __build_vocabulary(self, petriNets:list):
        r"""Private method: Convert the Petri Nets into documents and build the Doc2Vec vocabulary.

        Args:
            petriNets (list): A list of Petri Nets in pm4py format.
        """
        self._documents = []  # Internal storage for processed Petri Nets.
        # Convert Petri Nets to documents
        self.__petriNets2docs(petriNets)
        # Build the vocabulary for the model
        self.model.build_vocab(corpus_iterable = self._documents, update=False)

    def fit(self, petriNets:list, epochs:int):
        r"""Fit a PetriNet2Vec model on a list of Petri Nets in pm4py format.

        Args:
            petriNets (list): A list of Petri Nets in pm4py format.
            epochs (int): The number of training iterations to perform.

        This method fits a PetriNet2Vec model by processing a list of Petri Nets and generating embeddings for each Petri Net. The embeddings capture structural information based on the transitions between the Petri Net tasks.
        If the model has already been fitted (`self._fitted` is True), this method continues training for additional epochs to update the model embeddings.

        Example:
            >>> net_1, _, _ = pm4py.read_pnml("example1.pnml")
            >>> net_2, _, _ = pm4py.read_pnml("example2.pnml")
            >>> petri_nets = [net_1, net_2]
            >>> petri_net2vec = PetriNet2Vec(embedding_dim=100)
            >>> petri_net2vec.fit(petri_nets, epochs=100)
        """
        if not self._fitted:
            self.__build_vocabulary(petriNets)
            self._task_keys = self.model.wv.key_to_index
            self._net_keys = self.model.dv.key_to_index
        self.model.train(corpus_iterable = self._documents,
                         epochs = epochs,
                         total_examples = self.model.corpus_count,
                         compute_loss=True)
        self.model.update_weights()
        # Update embeddings and set the model as fitted
        self._embeddings = [self.model.dv[key] for key in self.model.dv.index_to_key]
        self._fitted = True

    def get_net_embeddings(self, petriNet_key:str=None) -> np.array:
        r"""Retrieve the embeddings of Petri Nets generated by the fitted PetriNet2Vec model.

        Args:
            petriNet_key (str, optional): A string with the key of the Petri Net. The list of keys is available via the attribute "model._net_keys".

        Returns:
            np.array: A numpy array containing the embeddings of Petri Nets.

        This method retrieves the Petri Net embeddings generated by the fitted PetriNet2Vec model. The embeddings capture structural information about the Petri Nets, representing each Petri Net as a row vector in the embedding space.

        Example:
            >>> net_1, _, _ = pm4py.read_pnml("example1.pnml")
            >>> net_2, _, _ = pm4py.read_pnml("example2.pnml")
            >>> petri_net2vec = PetriNet2Vec(embedding_dim=100)
            >>> petri_net2vec.fit([net_1, net_2], epochs=10)
            >>> embeddings = petri_net2vec.get_net_embeddings()
            >>> print(embeddings)
            [[0.1, 0.2, ..., 0.99], [0.3, 0.4, ..., 0.95]]
        """
        if petriNet_key is None:
            return self.model.dv.vectors
        else:
            return self.model.dv[petriNet_key]

    def get_task_embeddings(self, task_key:str=None) -> np.array:
        r"""Retrieve the embeddings of tasks generated by the fitted PetriNet2Vec model.

        Args:
            task_key (str, optional): A string with the key of the task. The list of keys is available via the attribute "model._task_keys".

        Returns:
            np.array: A numpy array containing the embeddings of tasks.

        This method retrieves the task embeddings generated by the fitted PetriNet2Vec model. The embeddings capture semantic information about the tasks, representing each task as a row vector in the embedding space.

        Example:
            >>> net_1, _, _ = pm4py.read_pnml("example1.pnml")
            >>> net_2, _, _ = pm4py.read_pnml("example2.pnml")
            >>> petri_net2vec = PetriNet2Vec(embedding_dim=100)
            >>> petri_net2vec.fit([net_1, net_2], epochs=10)
            >>> embeddings = petri_net2vec.get_task_embeddings()
            >>> print(embeddings)
            [[0.1, 0.2, ..., 0.99], [0.3, 0.4, ..., 0.95]]
        """
        if task_key is None:
            return self.model.wv.vectors
        else:
            return self.model.wv[task_key]

    def similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        r"""Compute the cosine similarity between two embedding vectors.

        Args:
            vec1 (numpy array): The first embedding vector.
            vec2 (numpy array): The second embedding vector.

        Returns:
            float: The cosine similarity score between the two embeddings.

        This method calculates the cosine similarity between two embedding vectors. Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space. It is computed as the dot product of the normalized vectors.

        Example:
            >>> petri_net2vec = PetriNet2Vec(embedding_dim=100)
            >>> petri_net2vec.fit([net1, net2], epochs=100)
            >>> embeddings = petri_net2vec.get_net_embeddings()
            >>> similarity_score = petri_net2vec.similarity(embeddings[0], embeddings[1])
            >>> print(similarity_score)
            0.85

        Raises:
            AssertionError: If either of the input embedding vectors has a zero norm.
        """
        norm_vec1 = np.linalg.norm(vec1)
        assert norm_vec1 > 0, "Problem: First embedding vector with zero norm"
        norm_vec2 = np.linalg.norm(vec2)
        assert norm_vec2 > 0, "Problem: Second embedding vector with zero norm"
        return np.dot(vec1/norm_vec1, vec2/norm_vec2)

    def infer_vector(self, net:pm4py.objects.petri_net.obj.PetriNet, epochs:int=None) -> np.array:
        r"""Infer an embedding vector for a new Petri Net using the pre-trained PetriNet2Vec model.

        Args:
            net (pm4py.objects.petri_net.obj.PetriNet): The Petri Net for which the embedding is to be inferred.
            epochs (int, optional): The number of iterations to perform. Higher values increase training time but might enhance the quality and consistency of inferred vectors across runs. If unspecified, the epochs value from model initialization is reused.

        Returns:
            np.array: The inferred embedding vector for the new Petri Net.

        This method uses the pre-trained PetriNet2Vec model to infer an embedding vector for a new Petri Net. The inference process relies on the structural information captured during training and generates a vector representation for the provided Petri Net.

        Example:
            >>> petri_net2vec = PetriNet2Vec(embedding_dim=100)
            >>> petri_net2vec.fit([net1, net2], epochs=100)
            >>> net3, _, _ = pm4py.read_pnml("example3.pnml")
            >>> new_embedding = petri_net2vec.infer_vector(net3)
            >>> print(new_embedding)
            [0.1, 0.2, ..., 0.99]
        """
        doc = []
        Transitions = [arc for arc in net.arcs if type(arc.source) == pm4py.objects.petri_net.obj.PetriNet.Transition]
        Places = [arc for arc in net.arcs if type(arc.source) == pm4py.objects.petri_net.obj.PetriNet.Place]
        for transition in Transitions:
            for place in Places:
                if place.source == transition.target:
                    if self.black_transitions:
                        left = 'None' if transition.source.label is None else transition.source.label
                        right = 'None' if place.target.label is None else place.target.label
                    else:
                        left = transition.source.name.split('_')[0] if transition.source.label is None else transition.source.label
                        right = place.target.name.split('_')[0] if place.target.label is None else place.target.label
                    if left != right:
                        doc.append(left)
                        doc.append(right)
        return self.model.infer_vector(doc, alpha=None, min_alpha=None, epochs=epochs)

    def save_model(self, name:str):
        r"""Save the trained PetriNet2Vec model to a file.

        Args:
            name (str): The name of the file in which to save the model.

        Example:
            >>> model = PetriNet2Vec(embedding_dim=2, seed=42)
            >>> model.fit(petri_nets, epochs=100)
            >>> model.save_model("pnml2vec.model")
        """
        self.model.save(name)

    def load_model_from(self, name:str):
        r"""Load a pre-trained PetriNet2Vec model from a file.

        Args:
            name (str): The name of the file containing the pre-trained model.

        Example:
            >>> model = PetriNet2Vec()
            >>> model.load_model_from("pnml2vec.model")
        """
        self.model = Doc2Vec.load(name)
        self.embedding_dim = self.model.vector_size
        self.negative = self.model.negative
        self.workers = self.model.workers
        self.seed = self.model.seed
        self._fitted = False  # Important: set it to False here!
        self._task_keys = self.model.wv.key_to_index
        self._net_keys = self.model.dv.key_to_index

    def fine_tune(self, petriNets:list, epochs:int):
        r"""Fine-tune a PetriNet2Vec model on a new list of Petri Nets in pm4py format. The model must be fitted before calling this method.

        Args:
            petriNets (list): A new list of Petri Nets in pm4py format.
            epochs (int): The number of training iterations.

        This method fine-tunes a pre-trained PetriNet2Vec model by processing a new list of Petri Nets and generating embeddings for each Petri Net in the list.

        Example:
            >>> petri_net2vec.fit(petri_nets, epochs=100)
            >>> net_3, _, _ = pm4py.read_pnml("example3.pnml")
            >>> petri_net2vec.fine_tune([net_3], epochs=100)
            >>> petri_net2vec.get_net_embeddings()
        """
        assert self._fitted, 'Pre-train the model first by calling fit().'
        # Convert the new Petri Nets to new documents, continuing the tag numbering from the existing corpus
        new_documents = []
        new_examples = int(self._documents[-1].tags[0])
        for i, net in enumerate(petriNets):
            new_examples += 1
            Transitions = [arc for arc in net.arcs if type(arc.source) == pm4py.objects.petri_net.obj.PetriNet.Transition]
            Places = [arc for arc in net.arcs if type(arc.source) == pm4py.objects.petri_net.obj.PetriNet.Place]
            for transition in Transitions:
                for place in Places:
                    if place.source == transition.target:
                        if self.black_transitions:
                            left = 'None' if transition.source.label is None else transition.source.label
                            right = 'None' if place.target.label is None else place.target.label
                        else:
                            left = transition.source.name.split('_')[0] if transition.source.label is None else transition.source.label
                            right = place.target.name.split('_')[0] if place.target.label is None else place.target.label
                        if left != right:
                            new_documents.append(TaggedDocument(words=[left, right], tags=[str(new_examples)]))
        # Update the vocabulary with the new documents and continue training on the full corpus
        self.model.build_vocab(corpus_iterable = new_documents, update=True)
        self._documents += new_documents
        self.model.train(corpus_iterable = self._documents,
                         epochs = epochs,
                         total_examples = len(self._documents),
                         compute_loss=True)
        self.model.update_weights()
        self._task_keys = self.model.wv.key_to_index
        self._net_keys = self.model.dv.key_to_index
        # Update embeddings for all Petri Nets seen so far
        self._embeddings = [self.model.dv[key] for key in self.model.dv.index_to_key]
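

# The block below is a minimal usage sketch, not part of the library itself. It assumes two
# placeholder files, "example1.pnml" and "example2.pnml" (the same hypothetical names used in the
# docstring examples above), exist in the working directory; replace them with your own .pnml paths.
if __name__ == "__main__":
    # Load two Petri Nets exported in PNML format.
    net_1, _, _ = pm4py.read_pnml("example1.pnml")
    net_2, _, _ = pm4py.read_pnml("example2.pnml")

    # Train a small PetriNet2Vec model on both nets.
    model = PetriNet2Vec(embedding_dim=8, seed=42)
    model.fit([net_1, net_2], epochs=100)

    # Inspect the learned Petri Net and task embeddings.
    net_embeddings = model.get_net_embeddings()
    task_embeddings = model.get_task_embeddings()
    print("net embeddings shape:", np.asarray(net_embeddings).shape)
    print("task embeddings shape:", np.asarray(task_embeddings).shape)

    # Cosine similarity between the two trained net embeddings.
    print("similarity(net_1, net_2):", model.similarity(net_embeddings[0], net_embeddings[1]))

    # Persist the model and reload it later to infer vectors for (possibly unseen) nets.
    model.save_model("pnml2vec.model")
    reloaded = PetriNet2Vec(embedding_dim=8)
    reloaded.load_model_from("pnml2vec.model")
    print("inferred vector for net_1:", reloaded.infer_vector(net_1, epochs=50))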