
Commit

script: format model output, load human score, compute and analyze correlation

fix: #1, #2
TURX committed Jun 7, 2022
1 parent bc68f57 commit b486b9f
Showing 6 changed files with 293 additions and 27 deletions.
46 changes: 36 additions & 10 deletions analysis.py
@@ -1,19 +1,26 @@
import json
import pickle
from os import path

import numpy as np


def read_result(path_result: str) -> dict:
def read_json(path_result: str) -> dict:
with open(path_result, 'r') as infile:
return json.load(infile)


def read_results() -> (dict, dict):
def read_pkl(path_result: str) -> dict:
with open(path_result, 'rb') as infile:
return pickle.load(infile)


def read_results() -> (dict, dict, dict):
path_results = 'results'
newsroom_results = read_result(path.join(path_results, 'newsroom.json'))
realsumm_results = read_result(path.join(path_results, 'realsumm.json'))
return newsroom_results, realsumm_results
newsroom_results = read_json(path.join(path_results, 'model/newsroom.json'))
realsumm_results = read_json(path.join(path_results, 'model/realsumm.json'))
corr_results = read_pkl(path.join(path_results, 'model/corr.pkl'))
return newsroom_results, realsumm_results, corr_results


def extract_results(metric_name: str, newsroom_results: dict, realsumm_results: dict) -> (dict, dict, dict, dict):
@@ -31,10 +38,10 @@ def rouge_analysis(newsroom_results: dict, realsumm_results: dict) -> None:
metrics = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
for metric in metrics:
metric_score = dict()
metric_score['newsroom_trad'] = newsroom_trad[metric][1][2]
metric_score['newsroom_new'] = newsroom_new[metric][1][2]
metric_score['realsumm_trad'] = realsumm_trad[metric][1][2]
metric_score['realsumm_new'] = realsumm_new[metric][1][2]
metric_score['newsroom_trad'] = np.median(newsroom_trad[metric], axis=1)[2]
metric_score['newsroom_new'] = np.median(newsroom_new[metric], axis=1)[2]
metric_score['realsumm_trad'] = np.median(realsumm_trad[metric], axis=1)[2]
metric_score['realsumm_new'] = np.median(realsumm_new[metric], axis=1)[2]
rouge_scores[metric] = metric_score
with open('results/analysis/rouge.json', 'w') as outfile:
json.dump(rouge_scores, outfile, indent=4)
@@ -76,9 +83,28 @@ def bleurt_analysis(newsroom_results: dict, realsumm_results: dict) -> None:
json.dump(bleurt_score, outfile, indent=4)


def corr_analysis(corr_results: dict) -> None:
results = dict()
datasets = ['newsroom', 'realsumm']
metrics = ['rouge', 'bertscore', 'bleurt']
approaches = ['trad', 'new']
for dataset in datasets:
results[dataset] = dict()
for metric in metrics:
results[dataset][metric] = dict()
for approach in approaches:
results[dataset][metric][approach] = dict()
for corr_type in corr_results[dataset][metric][approach].keys():
values = list(corr_results[dataset][metric][approach][corr_type].values())
results[dataset][metric][approach][corr_type] = np.mean(values)
with open('results/analysis/corr.json', 'w') as outfile:
json.dump(results, outfile, indent=4)


if __name__ == '__main__':
newsroom_results, realsumm_results = read_results()
newsroom_results, realsumm_results, corr_results = read_results()
rouge_analysis(newsroom_results, realsumm_results)
bertscore_analysis(newsroom_results, realsumm_results)
bleu_analysis(newsroom_results, realsumm_results)
bleurt_analysis(newsroom_results, realsumm_results)
corr_analysis(corr_results)
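
For reference, the averaging step in corr_analysis can be sketched in isolation. The nested input layout mirrors what corr.py (added below) pickles to results/model/corr.pkl; the coefficient values here are made-up placeholders, not numbers produced by this commit.

import numpy as np

corr_results = {
    'newsroom': {
        'rouge': {
            'trad': {
                'pearsonr': {('Coherence', 'f1'): 0.5, ('Fluency', 'f1'): 0.25},
            },
        },
    },
}

summary = {}
for dataset, by_metric in corr_results.items():
    summary[dataset] = {}
    for metric, by_approach in by_metric.items():
        summary[dataset][metric] = {}
        for approach, by_corr_type in by_approach.items():
            # collapse every (human metric, model metric) pair into one mean coefficient
            summary[dataset][metric][approach] = {
                corr_type: float(np.mean(list(pairs.values())))
                for corr_type, pairs in by_corr_type.items()
            }

print(summary)  # {'newsroom': {'rouge': {'trad': {'pearsonr': 0.375}}}}
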
148 changes: 148 additions & 0 deletions corr.py
@@ -0,0 +1,148 @@
import json
import pickle

import numpy as np

import dataloader.newsroom as newsroom
import dataloader.realsumm as realsumm

import scipy.stats

model_scores = dict()
corr = dict()
approaches = ['trad', 'new']


def read_system_scores() -> dict:
with open('results/model/scores.json', 'r') as infile:
return json.load(infile)


def newsroom_read(metrics: list) -> dict:
"""
Return data structure:
{
docID: {
system1: {
"Coherence": float,
"Fluency": float,
"Informativeness": float,
"Relevance": float,
"precision": float,
"recall": float,
"f1": float
}
system2: { ... }
...
system7: {... }
}
}
"""
system_scores = dict()
for approach in approaches:
system_scores[approach] = dict()
_, _, _, human_scores = newsroom.read('dataloader')
for i in range(len(human_scores)):
for approach in approaches:
system_scores[approach][i] = dict()
human_keys = human_scores[i].keys()
for metric in metrics:
if metric != 'bleu':
system_scores[approach][i][metric] = dict()
for key in human_keys:
system_scores[approach][i][metric][key] = human_scores[i][key]
system_keys = model_scores['newsroom'][metric][approach].keys()
for key in system_keys:
system_scores[approach][i][metric][key] = model_scores['newsroom'][metric][approach][key][i]
return system_scores


def system_judge(scores, metrics_human, metrics_system, correlation_types) -> dict:
# ref: suenes.human.newsroom.test_eval
all_system_names = list(scores[list(scores.keys())[0]].keys())

def get_correlation_two_metrics(scores, metric_human, metric_system, correlation_type):
mean_score_vector_newsroom = []
mean_score_vector_other = []
for system in all_system_names:
vector_human = [] # scores from a human metric
vector_system = [] # scores from a non-human metric
for docID in scores.keys():
score_local = scores[docID][system]
score_newsroom = score_local[metric_human] # one float
score_other = score_local[metric_system] # one float
vector_human.append(score_newsroom)
vector_system.append(score_other)

mean_score_vector_newsroom.append(np.mean(vector_human))
mean_score_vector_other.append(np.mean(vector_system))
        # system-level: correlate the per-system mean scores, not a single system's per-doc scores
        return getattr(scipy.stats, correlation_type)(mean_score_vector_newsroom, mean_score_vector_other)[0]

# now begins the system-level judge
correlations = {}
for correlation_type in correlation_types:
correlations[correlation_type] = {}
for metric_human in metrics_human: # one metric from human
for metric_system in metrics_system: # one metric to evaluate against human
correlations[correlation_type] \
[(metric_human, metric_system)] = \
get_correlation_two_metrics(scores, metric_human, metric_system, correlation_type)

return correlations


def realsumm_read(metrics: list) -> dict:
_, _, _, dataset_scores = realsumm.read('suenes/human/realsumm/scores_dicts/',
'suenes/human/realsumm/analysis/test.tsv')
system_scores = dict()
for approach in approaches:
system_scores[approach] = dict()
for i in range(len(dataset_scores)):
system_scores[approach][i] = dict()
for metric in metrics:
system_scores[approach][i][metric] = dict()
system_scores[approach][i][metric]['litepyramid_recall'] = dataset_scores[i][
'litepyramid_recall'] # human score
system_keys = model_scores['realsumm'][metric]['trad'].keys()
for key in system_keys:
system_scores[approach][i][metric][key] = model_scores['realsumm'][metric][approach][key][i]
return system_scores


def calculate(dataset: str) -> None:
corr[dataset] = dict()
available_metrics_systems = {
'rouge': ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'],
'bertscore': ['bertscore'],
'bleurt': ['bleurt']
}
    for metric_systems_name, metric_systems in available_metrics_systems.items():
if dataset == 'newsroom':
system_scores = newsroom_read(metric_systems)
metrics_human = ['Coherence', 'Informativeness', 'Fluency', 'Relevance']
elif dataset == 'realsumm':
system_scores = realsumm_read(metric_systems)
metrics_human = ['litepyramid_recall']
else:
raise NotImplementedError()
if metric_systems_name == 'bleurt':
metrics_system = ['scores']
else:
metrics_system = ['precision', 'recall', 'f1']
correlation_types = ['pearsonr', 'kendalltau', 'spearmanr']
my_corr = dict()
for approach in approaches:
my_corr[approach] = system_judge(system_scores[approach], metrics_human, metrics_system,
correlation_types)
corr[dataset][metric_systems_name] = my_corr


if __name__ == '__main__':
model_scores = read_system_scores()
datasets = ['newsroom', 'realsumm']
for dataset in datasets:
calculate(dataset)
with open('results/model/corr.pkl', 'wb') as outfile:
pickle.dump(corr, outfile)
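
The core of corr.py is system_judge: for each "system" it averages a human metric and a model metric over all documents, then correlates the two per-system mean vectors with scipy.stats. A self-contained sketch with made-up scores (two documents, three systems, 'Coherence' as the human metric and 'f1' as the model metric):

import numpy as np
import scipy.stats

scores = {
    0: {'sys_a': {'Coherence': 4.0, 'f1': 0.62},
        'sys_b': {'Coherence': 2.0, 'f1': 0.35},
        'sys_c': {'Coherence': 3.0, 'f1': 0.50}},
    1: {'sys_a': {'Coherence': 5.0, 'f1': 0.70},
        'sys_b': {'Coherence': 3.0, 'f1': 0.41},
        'sys_c': {'Coherence': 4.0, 'f1': 0.55}},
}
systems = list(next(iter(scores.values())).keys())

# one mean per system, taken over all documents
human_means = [np.mean([scores[d][s]['Coherence'] for d in scores]) for s in systems]
model_means = [np.mean([scores[d][s]['f1'] for d in scores]) for s in systems]

rho, _ = scipy.stats.spearmanr(human_means, model_means)
print(rho)  # 1.0: the model metric ranks the three systems exactly as the human metric does
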
19 changes: 14 additions & 5 deletions dataloader/newsroom.py
@@ -2,17 +2,26 @@
import string
from os import path

def read(path_read: string) -> (list, list, list):

def read(path_read: string) -> (list, list, list, list):
data = list()
with open(path.join(path_read, "newsroom-human-eval.csv")) as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
data.append(row)
sys_summaries = list()
ref_summaries = list()
docs = list()
sys_summaries, ref_summaries, docs, scores = list(), list(), list(), list()
for datum in data:
sys_summaries.append(datum['SystemSummary'])
ref_summaries.append(datum['ArticleTitle'])
docs.append(datum['ArticleText'])
return sys_summaries, ref_summaries, docs
score = dict()
to_copy = ['Coherence', 'Fluency', 'Informativeness', 'Relevance']
for i in to_copy:
score[i] = int(datum[i + 'Rating'])
scores.append(score)
return sys_summaries, ref_summaries, docs, scores


if __name__ == '__main__':
data = read('.')
print(data)
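
A usage sketch for the extended Newsroom loader; it assumes newsroom-human-eval.csv sits under dataloader/, as in eval.py, and reads the per-example ratings dict added by this commit:

import numpy as np

import dataloader.newsroom as newsroom

sys_summaries, ref_summaries, docs, scores = newsroom.read('dataloader')
print(len(sys_summaries) == len(scores))          # one ratings dict per system summary
print(scores[0])                                  # {'Coherence': ..., 'Fluency': ..., 'Informativeness': ..., 'Relevance': ...}
print(np.mean([s['Relevance'] for s in scores]))  # corpus-level mean Relevance rating
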
19 changes: 14 additions & 5 deletions dataloader/realsumm.py
@@ -1,8 +1,10 @@
import copy
import string
from os import path

import suenes.human.realsumm.analysis.utils as utils


def read_summary(path_summary: string) -> (list, list):
sd_abs_path = path.join(path_summary, "abs_ours.pkl")
sd_ext_path = path.join(path_summary, "ext_ours.pkl")
@@ -12,13 +14,14 @@ def read_summary(path_summary: string) -> (list, list):
for doc_id in sd:
isd_sota_ext = sd_ext[doc_id]
sd[doc_id]['system_summaries'].update(isd_sota_ext['system_summaries'])
sys_summaries = list()
ref_summaries = list()
sys_summaries, ref_summaries, scores = list(), list(), list()
for sd_item in sd.items():
for sys_item in sd_item[1]['system_summaries'].items():
ref_summaries.append(sd_item[1]['ref_summ'])
sys_summaries.append(sys_item[1]['system_summary'])
return sys_summaries, ref_summaries
scores.append(sys_item[1]['scores'])
return sys_summaries, ref_summaries, scores


def read_docs(path_docs: string) -> list:
with open(path_docs, 'r') as infile:
@@ -29,7 +32,13 @@ def read_docs(path_docs: string) -> list:
docs.append(row)
return docs


def read(path_summary: string, path_docs: string) -> (list, list, list):
sys_summaries, ref_summaries = read_summary(path_summary)
sys_summaries, ref_summaries, scores = read_summary(path_summary)
docs = read_docs(path_docs)
return sys_summaries, ref_summaries, docs
return sys_summaries, ref_summaries, docs, scores


if __name__ == '__main__':
data = read('../suenes/human/realsumm/scores_dicts/', '../suenes/human/realsumm/analysis/test.tsv')
print(data)
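
A matching usage sketch for the extended RealSumm loader; the paths mirror eval.py and corr.py, and 'litepyramid_recall' is the human score key that corr.py reads from these dicts:

import dataloader.realsumm as realsumm

sys_summaries, ref_summaries, docs, scores = realsumm.read(
    'suenes/human/realsumm/scores_dicts/',
    'suenes/human/realsumm/analysis/test.tsv')
assert len(sys_summaries) == len(scores)          # one scores dict per (doc, system) pair
print(scores[0]['litepyramid_recall'])            # human LitePyramid recall for the first pair
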
19 changes: 12 additions & 7 deletions eval.py
@@ -15,21 +15,26 @@ def model_eval(sys_summaries: list, ref_summaries: list, docs: list) -> dict:
if model_name != 'bleurt':
model = evaluate.load(model_name)
else:
model = evaluate.load('bleurt', config_name='bleurt-large-512', module_type='metric')
model = evaluate.load('bleurt', config_name='BLEURT-20', module_type='metric')

model_result = dict()

# calculate traditional (reference, system summary) pairs
print('Eval trad')
if model_name == 'bertscore':
model_result['trad'] = model.compute(predictions=sys_summaries, references=ref_summaries, lang='en')
elif model_name == 'rouge':
model_result['trad'] = model.compute(predictions=sys_summaries, references=ref_summaries,
use_aggregator=False)
else:
model_result['trad'] = model.compute(predictions=sys_summaries, references=ref_summaries)

# calculate new (document, system summary) pairs
print('Eval new')
if model_name == 'bertscore':
model_result['new'] = model.compute(predictions=sys_summaries, references=docs, lang='en')
elif model_name == 'rouge':
model_result['new'] = model.compute(predictions=sys_summaries, references=docs, use_aggregator=False)
else:
model_result['new'] = model.compute(predictions=sys_summaries, references=docs)

@@ -39,19 +44,19 @@ def model_eval(sys_summaries: list, ref_summaries: list, docs: list) -> dict:


def realsumm_eval():
print('[Realsumm]')
sys_summaries, ref_summaries, docs = realsumm.read('suenes/human/realsumm/scores_dicts/',
'suenes/human/realsumm/analysis/test.tsv')
print('[RealSumm]')
sys_summaries, ref_summaries, docs, _ = realsumm.read('suenes/human/realsumm/scores_dicts/',
'suenes/human/realsumm/analysis/test.tsv')
results = model_eval(sys_summaries, ref_summaries, docs)
with open('results/realsumm.json', 'w') as outfile:
with open('results/model/realsumm.json', 'w') as outfile:
json.dump(results, outfile, indent=4)


def newsroom_eval():
print('[Newsroom]')
sys_summaries, ref_summaries, docs = newsroom.read('dataloader')
sys_summaries, ref_summaries, docs, _ = newsroom.read('dataloader')
results = model_eval(sys_summaries, ref_summaries, docs)
with open('results/newsroom.json', 'w') as outfile:
with open('results/model/newsroom.json', 'w') as outfile:
json.dump(results, outfile, indent=4)
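
The functional changes in eval.py are per-sample ROUGE scores (use_aggregator=False), the BLEURT-20 checkpoint, and writing results under results/model/. A minimal, standalone sketch of the per-sample ROUGE call with toy inputs (the summaries below are made up):

import evaluate

rouge = evaluate.load('rouge')
result = rouge.compute(
    predictions=['the cat sat on the mat', 'a quick brown fox'],
    references=['the cat was on the mat', 'the quick brown fox jumps'],
    use_aggregator=False)
print(result['rouge1'])  # a list with one entry per (prediction, reference) pair instead of a single aggregate
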

