eval_baseline.py
"""
The BASELINE 1 consists of
- external intent predictor
- dialog component: handcrafted, similarity matching for NLU
- faq component: similarity matching
Other possible baseline:
- BASELINE 2
- external intent predictor
- dialog component: RL (separate)
- faq component: RL (separate)
Comparison to our system:
- RL
- also predicts the intent
Metrics for comparison:
- Intent tracker accuracy / F1
- Success
- Overall
- Free
- Guided
- Goal Asked
- Overall
- Free
- Guided
Necessary switches:
- faq/dialog ratio
- train / test data
- use synonyms
- noise level
"""
from dataclasses import dataclass
from statistics import mean
import os
from tqdm import tqdm
from chatbot.adviser.app.encoding.similiarity import AnswerSimilarityEncoding
from chatbot.adviser.app.faqPolicy import FAQPolicy, GuidedPolicy, IntentTracker, Intent
from chatbot.adviser.app.rl.dialogenv import DialogEnvironment, EnvironmentMode
from chatbot.adviser.app.rl.dialogtree import DialogTree
import chatbot.adviser.app.rl.dataset as Data
from chatbot.adviser.app.rl.utils import EMBEDDINGS, AutoSkipMode, EnvInfo, safe_division
import random
import numpy as np
import torch
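# If JOINT_DATA is True, train and test both load the merged 'traintest_graph.json'
# dialog graph (see __main__). The evaluation in __main__ is repeated once per seed
# in RUN_SEEDS, and the averaged metrics are printed per run.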
JOINT_DATA = False
RUN_SEEDS = [12345678, 89619, 7201944, 398842, 57063456]
@dataclass
class SimulatorConfig:
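# Simulation / environment settings. The three noise fields map to the matching
# EnvironmentMode (see the noise selection in Evaluator.setUp); dialog_faq_ratio
# presumably controls the fraction of FAQ vs. guided dialogs sampled by the env.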
mode: EnvironmentMode
action_masking: bool
use_answer_synonyms: bool
max_steps: int
user_patience: int
dialog_faq_ratio: float
dialogs: int
stop_action: bool
train_noise: float
eval_noise: float
test_noise: float
auto_skip: AutoSkipMode
@dataclass
class FAQSettings:
top_k: int
@dataclass
class Experiment:
cudnn_deterministic: bool
class Evaluator:
def setUp(self) -> None:
self.device = "cuda:0" if len(os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()) > 0 else "cpu"
self.exp_name_prefix = "TOP1_JOINTDATA_test_10noise_synonyms_intentpredictor_similarity"
self.args = {
"configuration": SimulatorConfig(
mode = EnvironmentMode.TEST, # For eval: EnvironmentMode.TRAIN
action_masking = True,
use_answer_synonyms = True,
max_steps = 50,
user_patience = 3,
dialogs = 500,
dialog_faq_ratio = 0.5,
stop_action=False,
train_noise=0.0,
eval_noise=0.0,
test_noise=0.1,
auto_skip=AutoSkipMode.SIMILARITY
),
"faq_settings": FAQSettings(
top_k = 1
),
"experiment": Experiment(
cudnn_deterministic = False,
)
}
torch.backends.cudnn.deterministic = self.args["experiment"].cudnn_deterministic
config: SimulatorConfig = self.args['configuration']
print("MODE", config.mode.name)
# load text embedding
text_embedding_name = "distiluse-base-multilingual-cased-v2"
EMBEDDINGS[text_embedding_name]['args'].pop('cache_db_index')
self.text_enc = EMBEDDINGS[text_embedding_name]['class'](device=self.device, **EMBEDDINGS[text_embedding_name]['args'])
self.exp_name = f"BASELINE_{self.exp_name_prefix}"
os.makedirs(f"/fs/scratch/users/vaethdk/adviser_reisekosten/newruns/{self.exp_name}")
dialog_logfile = f"/fs/scratch/users/vaethdk/adviser_reisekosten/newruns/{self.exp_name}/dialogs.txt"
# TODO save config file to this directory
self.tree = DialogTree(version=0 if config.mode in [EnvironmentMode.TRAIN, EnvironmentMode.EVAL] else 1)
# load models
self.sentence_embeddings = AnswerSimilarityEncoding(model_name="distiluse-base-multilingual-cased-v2", dialog_tree=self.tree, device=self.device, caching=False)
self.similarity_model = self.sentence_embeddings.similarity_model
self.intent_tracker = IntentTracker(device=self.device, ckpt_dir='./.models/intentpredictor')
if config.mode == EnvironmentMode.TRAIN:
noise = config.train_noise
elif config.mode == EnvironmentMode.EVAL:
noise = config.eval_noise
else:
noise = config.test_noise
# load env
self.eval_env = DialogEnvironment(dialog_tree=self.tree, adapter=None, stop_action=config.stop_action,
use_answer_synonyms=config.use_answer_synonyms, mode=config.mode,
train_noise=config.train_noise, eval_noise=config.eval_noise, test_noise=config.test_noise,
max_steps=config.max_steps, user_patience=config.user_patience,
auto_skip=AutoSkipMode.NONE, dialog_faq_ratio=config.dialog_faq_ratio,
log_to_file=dialog_logfile, return_obs=False, normalize_rewards=True,
stop_when_reaching_goal=True, similarity_model = self.sentence_embeddings)
# load policies
self.guided_policy = GuidedPolicy(similarity_model=self.sentence_embeddings, stop_action=config.stop_action, auto_skip=config.auto_skip, noise=noise)
self.free_policy = FAQPolicy(dialog_tree=self.tree, similarity_model=self.similarity_model, top_k=self.args['faq_settings'].top_k, noise=noise)
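# GuidedPolicy navigates the dialog tree turn by turn (see _play_guided_episode),
# while FAQPolicy answers with a single top-k similarity retrieval over the
# initial user utterance (see _play_free_episode).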
@torch.no_grad()
def _play_free_episode(self):
results = self.free_policy.top_k(query=self.eval_env.initial_user_utterance)
# TODO: missing_variable? could be 1 if we draw an FAQ with a template
mode_key = 'faq' if self.eval_env.is_faq_mode else 'dialog'
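# A free episode consists of a single retrieval step, hence episode_length = 1 and
# faq_dialog_ratio = 1.0 below; the episode counts as successful if any of the
# top-k results matches the goal node.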
for result in results:
# check if any result matches the goal -> if so, success!
if result.goal_node_key == self.eval_env.goal_node.key:
return {
f"goal_asked_{mode_key}": 1.0,
f"success_{mode_key}": 1.0,
"episode_length": 1,
"faq_dialog_ratio": 1.0,
"ask_variable_irrelevant_ratio": 0.0,
"ask_question_irrelevant_ratio": 0.0,
"success": 1.0,
"goal_asked": 1.0
}
# no result matches the goal -> not successful
return {
f"goal_asked_{mode_key}": 0.0,
f"success_{mode_key}": 0.0,
"episode_length": 1,
"faq_dialog_ratio": 1.0,
"ask_variable_irrelevant_ratio": 0.0,
"ask_question_irrelevant_ratio": 0.0,
"success": 0.0,
"goal_asked": 0.0
}
@torch.no_grad()
def _play_guided_episode(self):
self.guided_policy.reset()
done = False
info = None
while not done:
action = self.guided_policy.get_action(self.eval_env.current_node, self.eval_env.current_user_utterance, self.eval_env.last_action_idx)
_, reward, done, info = self.eval_env.step(action)
mode_key = 'faq' if self.eval_env.is_faq_mode else 'dialog'
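# Per-episode metrics are derived from the environment's action/node counters;
# safe_division guards against zero denominators (e.g. when a node type never
# occurred in this episode).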
return {
"episode_length": float(info[EnvInfo.EPISODE_LENGTH]),
"success": float(info[EnvInfo.REACHED_GOAL_ONCE]),
"goal_asked": float(info[EnvInfo.ASKED_GOAL]),
f"success_{mode_key}": float(info[EnvInfo.REACHED_GOAL_ONCE]),
f"goal_asked_{mode_key}": float(info[EnvInfo.ASKED_GOAL]),
"episode_skip_length_ratio": self.eval_env.skipped_nodes / info[EnvInfo.EPISODE_LENGTH],
"skipped_question_ratio": safe_division(self.eval_env.actioncount_skip_question, self.eval_env.nodecount_question),
"skipped_variable_ratio": safe_division(self.eval_env.actioncount_skip_variable, self.eval_env.nodecount_variable),
"skipped_info_ratio": safe_division(self.eval_env.actioncount_skip_info, self.eval_env.nodecount_info),
"skipped_invalid_ratio": safe_division(self.eval_env.actioncount_skip_invalid, self.eval_env.actioncount_skip),
"faq_dialog_ratio": 0.0,
"ask_variable_irrelevant_ratio": safe_division(self.eval_env.actioncount_ask_variable_irrelevant, self.eval_env.actioncount_ask_variable),
"ask_question_irrelevant_ratio": safe_division(self.eval_env.actioncount_ask_question_irrelevant, self.eval_env.actioncount_ask_question),
"episode_missing_variable_ratio": self.eval_env.actioncount_missingvariable,
}
@torch.no_grad()
def eval(self, env: DialogEnvironment, eval_dialogs: int, seed) -> float:
"""
Returns:
goal_asked score (float)
"""
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
self.similarity_model.eval()
eval_metrics = {
# "episode_return": [],
"episode_length": [],
"success": [],
"goal_asked": [],
"success_faq": [],
"success_dialog": [],
"goal_asked_faq": [],
"goal_asked_dialog": [],
"episode_skip_length_ratio": [],
"skipped_question_ratio": [],
"skipped_variable_ratio": [],
"skipped_info_ratio": [],
"skipped_invalid_ratio": [],
# "stop_prematurely_ratio": [],
"faq_dialog_ratio": [],
# "episode_stop_ratio": [],
"ask_variable_irrelevant_ratio": [],
"ask_question_irrelevant_ratio": [],
"episode_missing_variable_ratio": [],
# "episode_history_wordcount": [],
# "max_history_wordcount": [0],
}
intentprediction_tp = 0
intentprediction_tn = 0
intentprediction_fp = 0
intentprediction_fn = 0
for _ in tqdm(range(eval_dialogs), desc="eval dialogs"):
# reset
self.eval_env.reset()
info = None
# intent prediction
# TODO do this per turn like real intent tracker in RL algorithm or just at the beginning?
intent = self.intent_tracker.get_intent(self.eval_env.current_node, gen_user_utterance=self.eval_env.initial_user_utterance)
self.eval_env.episode_log.append(f"Intent prediction: {intent.name}")
if intent == Intent.FREE:
# do free
info = self._play_free_episode()
else:
# do guided
info = self._play_guided_episode()
assert info
# evaluate intent tracker
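# The free / FAQ intent is treated as the positive class:
# FREE predicted on a guided episode -> false positive, GUIDED on guided -> true negative,
# FREE on an FAQ episode -> true positive, GUIDED on FAQ -> false negative.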
if not self.eval_env.is_faq_mode and intent == Intent.FREE:
intentprediction_fp += 1
elif not self.eval_env.is_faq_mode and intent == Intent.GUIDED:
intentprediction_tn += 1
elif self.eval_env.is_faq_mode and intent == Intent.FREE:
intentprediction_tp += 1
elif self.eval_env.is_faq_mode and intent == Intent.GUIDED:
intentprediction_fn += 1
# update global evaluation metrics with current dialog
for metric in info:
eval_metrics[metric].append(info[metric])
self.eval_env.logger.info("\n".join(self.eval_env.episode_log))
# log metrics (averaged)
log_dict = {}
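# Intent-prediction metrics from the counts above:
# F1 = TP / (TP + 0.5 * (FP + FN)) is the count-based form of the harmonic mean of
# precision and recall; accuracy divides by eval_dialogs since every episode
# increments exactly one of the four counters.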
eval_metrics["intentprediction_f1"] = [safe_division(intentprediction_tp, intentprediction_tp + 0.5 * (intentprediction_fp + intentprediction_fn))]
eval_metrics["intentprediction_recall"] = [safe_division(intentprediction_tp, intentprediction_tp + intentprediction_fn)]
eval_metrics["intentprediction_precision"] = [safe_division(intentprediction_tp, intentprediction_tp + intentprediction_fp)]
eval_metrics["intentprediction_accuracy"] = [safe_division(intentprediction_tp + intentprediction_tn, eval_dialogs)]
for metric in eval_metrics:
numerical_entries = [num for num in eval_metrics[metric] if num is not None]
if len(numerical_entries) == 0:
numerical_entries = [0.0]
log_dict[f"{metric}"] = mean(numerical_entries)
print(log_dict)
return log_dict["goal_asked"]
if __name__ == "__main__":
os.environ["TOKENIZERS_PARALLELISM"] = "true"
if JOINT_DATA:
Data.objects[0] = Data.Dataset.fromJSON('traintest_graph.json', version=0)
Data.objects[1] = Data.Dataset.fromJSON('traintest_graph.json', version=1)
else:
Data.objects[0] = Data.Dataset.fromJSON('train_graph.json', version=0)
Data.objects[1] = Data.Dataset.fromJSON('test_graph.json', version=1)
evaluator = Evaluator()
evaluator.setUp()
# call eval() method
# NOTE: run separately for the eval / test setting (change config)
for run, seed in enumerate(RUN_SEEDS):
print(f"---- RUN {run} with seed {seed} ----")
evaluator.eval(evaluator.eval_env, evaluator.args['configuration'].dialogs, seed)