import env
import os
import typing
import sys
sys.path.append("/home/turx/EvalBase")
import evalbase # the evaluation framework dependency
import dar_type # data type definitions for DocAsRef
import dataset_config # configurations for datasets used in DocAsRef benchmarking
# Metrics to evaluate #
def enable_metrics(
        metric_dict: dar_type.MetricDict,
        metric_names: typing.List[str]) -> dar_type.MetricDict:
    metrics_enabled = {
        metric_name: metric_fn
        for metric_name, metric_fn in metric_dict.items()
        if metric_name in metric_names}
    return metrics_enabled
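# Illustrative sketch (hypothetical callables rouge_fn / bleu_fn, not part of
# this repo): enable_metrics({"rouge": rouge_fn, "bleu": bleu_fn},
# ["rouge", "bleurt"]) returns {"rouge": rouge_fn}; requested names that are
# missing from metric_dict are silently ignored.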
## Metrics from Approach 0 ##
import classic.metric
names_of_enabled_classic_metrics = [
    "rouge",
    "bleurt",  # requires datasets-2.10.0 per https://github.com/huggingface/evaluate/issues/449
    "moverscore-1gram",
    "moverscore-2gram",
    "bertscore-bert-base",
    "bertscore-roberta-base",
    "bertscore-deberta-base",
    "bertscore-bart-base",
    "bertscore-deberta-base-mnli",
    "bertscore-deberta-v3-base",
    "bertscore-deberta-v3-base-mnli-fever-anli",
    "bertscore-deberta-large",
    "bertscore-roberta-large",
    "bertscore-bart-large",
    "bertscore-deberta-large-mnli",
    "bertscore-roberta-large-mnli",
    "bertscore-bart-large-mnli",
]
classic_metrics_enabled = enable_metrics(
    classic.metric.metrics,
    names_of_enabled_classic_metrics
)
## Experiments for Approaches 1.1 + 1.2 ##
## BERTScore sentence-level without weighting ##
import bertscore_sentence.metric
names_of_enabled_bertscore_sentence_metrics = [
    "bertscore-sentence-cos-mpnet",
    "bertscore-sentence-cos-roberta-large",
    "bertscore-sentence-cos-deberta-large",
    "bertscore-sentence-mnli-roberta-large-mnli-not_neutral",
    "bertscore-sentence-mnli-roberta-large-mnli-entail_only",
    "bertscore-sentence-mnli-roberta-large-mnli-entail_contradict",
    "bertscore-sentence-mnli-bart-large-mnli-not_neutral",
    "bertscore-sentence-mnli-bart-large-mnli-entail_only",
    "bertscore-sentence-mnli-bart-large-mnli-entail_contradict",
    "bertscore-sentence-mnli-deberta-large-mnli-not_neutral",
    "bertscore-sentence-mnli-deberta-large-mnli-entail_only",
    "bertscore-sentence-mnli-deberta-large-mnli-entail_contradict",
]
# names_of_enabled_bertscore_sentence_metrics = \
# [ "bertscore-sentence-cos-{}".format(model_name)
# for model_name in ["mpnet", "roberta-large", "deberta-large"]
# ] + \
# [ "bertscore-sentence-mnli-{}-{}".format(model_name, mnli_expr)
# for model_name in ["roberta-large-mnli", "bart-large-mnli", "deberta-large-mnli"]
# for mnli_expr in ["not_neutral", "entail_only", "entail_contradict"]
# ]
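# (the commented-out comprehension above generates exactly the same 12 names
# as the explicit list, so either form can be used)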
bertscore_sentence_metrics_enabled = enable_metrics(
    bertscore_sentence.metric.metrics,
    names_of_enabled_bertscore_sentence_metrics
)
## Experiments for Approach 1.4 ##
## sentence weighting in the PageRank style ##
import pagerank.metric
### The base metrics to be weighted using PageRank
base_metrics = {
    # **bertscore_sentence.metric.metrics,
    **bertscore_sentence_metrics_enabled,
}
### Specify the weight schemes as a sublist of ["entropy", "sum"]
weight_schemes = ["entropy", "sum"]
pagerank_metrics_enabled = pagerank.metric.create_metrics(
    base_metrics, weight_schemes)
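# A sketch of the assumed behavior (not verified against pagerank.metric):
# create_metrics presumably builds one PageRank-weighted variant per
# (base metric, weight scheme) pair, i.e.
#   len(pagerank_metrics_enabled) == len(base_metrics) * len(weight_schemes)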
## Experiments for Approach 1.5 ##
## BERTScore sentence with leadword pseudo-references ##
import top.metric
## Select base metrics to be wrapped with top-k or top-p heuristics
base_metrics = {
    **bertscore_sentence_metrics_enabled,
    **pagerank_metrics_enabled,
    # **bertscore_sentence.metric.metrics,
    # **pagerank.metric.metrics,
}
## Set the k and p ranges
k_range = [3]
p_range = [0.3]
top_metrics_enabled = top.metric.create_metrics(
    base_metrics, k_range, p_range)
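# A sketch of the assumed behavior (not verified against top.metric): each
# base metric is presumably wrapped once per k in k_range (leading k sentences
# of the document as pseudo-reference) and once per p in p_range (leading p
# fraction), i.e.
#   len(top_metrics_enabled) == len(base_metrics) * (len(k_range) + len(p_range))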
## Experiments for Approach 1.6 ##
## BERTScore sentence with references generated by decent summarizers ##
import anyref.metric
base_metrics = {
    **bertscore_sentence_metrics_enabled,
    **pagerank_metrics_enabled
}
summarizer_names = [
    "bart",
    # "pegasus-xsum",
    # "pegasus-newsroom",
    # "pegasus-cnndm",
    "pegasus-large"
]
anyref_metrics_enabled = anyref.metric.create_metrics(
    base_metrics, summarizer_names)
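# Assumed behavior (not verified against anyref.metric): each listed
# summarizer (here BART and PEGASUS-large) generates a pseudo-reference from
# the source document, and every base metric scores the candidate summary
# against it, yielding len(base_metrics) * len(summarizer_names) metrics.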
# Experiments on some baselines
# (these baseline names are listed here but not enabled or merged into
# all_metrics_enabled below)
names_of_enabled_baseline_metrics = [
    "sacrebleu",
    "meteor",
    "bart",
    "smd"
]
# Put all metrics together
all_metrics_enabled = {
    **classic_metrics_enabled,
    **bertscore_sentence_metrics_enabled,
    **pagerank_metrics_enabled,
    **top_metrics_enabled,
    **anyref_metrics_enabled
}
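# Optional sanity check before launching long runs (kept commented out so the
# script's behavior is unchanged):
# print(f"{len(all_metrics_enabled)} metrics enabled: {sorted(all_metrics_enabled)}")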
# Running experiments on different datasets #
## Experiment configuration shared across all datasets
common_exp_config = {
    "nlg_metrics": all_metrics_enabled,
    "corr_metrics": ["spearmanr", "pearsonr", "kendalltau"],
    "approaches": ["trad", "new"],
    "eval_levels": ["summary", "system"],
    "result_path_root": "./results/",
    "debug": False,
}
experiment_fn_and_configs = [
    (evalbase.summeval.main, dataset_config.summeval_config),
    (evalbase.newsroom.main, dataset_config.newsroom_config),
    (evalbase.realsumm.main, dataset_config.realsumm_abs_config),
    (evalbase.realsumm.main, dataset_config.realsumm_ext_config),
    # (evalbase.tac2010.main, dataset_config.tac2010_config),
    # (evalbase.qags.main, dataset_config.qags_config),
    # (evalbase.frank.main, dataset_config.frank_config),
    # (evalbase.fastcc.main, dataset_config.fastcc_config),
]
for (exp_fn, exp_config) in experiment_fn_and_configs:
    exp_fn(exp_config | common_exp_config)
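# Note: dict union (PEP 584) requires Python 3.9+; on duplicate keys the
# right-hand operand wins, so common_exp_config overrides any identically
# named keys in the dataset-specific config.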