From b486b9f8b0171585660dce2a7c3775c1036fd5f1 Mon Sep 17 00:00:00 2001
From: Ruixuan Tu
Date: Tue, 7 Jun 2022 00:40:46 -0500
Subject: [PATCH] script: format model output, load human scores, compute and analyze correlations

eval.py now writes the raw metric outputs under results/model/,
format.py reshapes them into results/model/scores.json, corr.py
correlates those scores with the human judgments loaded by the
dataloaders and saves results/model/corr.pkl, and analysis.py
aggregates everything under results/analysis/.

Fixes https://github.com/SigmaWe/DocAsRef/issues/1 and
https://github.com/SigmaWe/DocAsRef/issues/2.
---
 analysis.py            |  46 ++++++++++---
 corr.py                | 148 +++++++++++++++++++++++++++++++++++++++++
 dataloader/newsroom.py |  19 ++++--
 dataloader/realsumm.py |  19 ++++--
 eval.py                |  19 ++++--
 format.py              |  69 +++++++++++++++++++
 6 files changed, 293 insertions(+), 27 deletions(-)
 create mode 100644 corr.py
 create mode 100644 format.py

diff --git a/analysis.py b/analysis.py
index 90a0de5..ef41173 100644
--- a/analysis.py
+++ b/analysis.py
@@ -1,19 +1,26 @@
 import json
+import pickle
 from os import path
 
 import numpy as np
 
 
-def read_result(path_result: str) -> dict:
+def read_json(path_result: str) -> dict:
     with open(path_result, 'r') as infile:
         return json.load(infile)
 
 
-def read_results() -> (dict, dict):
+def read_pkl(path_result: str) -> dict:
+    with open(path_result, 'rb') as infile:
+        return pickle.load(infile)
+
+
+def read_results() -> (dict, dict, dict):
     path_results = 'results'
-    newsroom_results = read_result(path.join(path_results, 'newsroom.json'))
-    realsumm_results = read_result(path.join(path_results, 'realsumm.json'))
-    return newsroom_results, realsumm_results
+    newsroom_results = read_json(path.join(path_results, 'model/newsroom.json'))
+    realsumm_results = read_json(path.join(path_results, 'model/realsumm.json'))
+    corr_results = read_pkl(path.join(path_results, 'model/corr.pkl'))
+    return newsroom_results, realsumm_results, corr_results
 
 
 def extract_results(metric_name: str, newsroom_results: dict, realsumm_results: dict) -> (dict, dict, dict, dict):
@@ -31,10 +38,10 @@ def rouge_analysis(newsroom_results: dict, realsumm_results: dict) -> None:
     metrics = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
     for metric in metrics:
         metric_score = dict()
-        metric_score['newsroom_trad'] = newsroom_trad[metric][1][2]
-        metric_score['newsroom_new'] = newsroom_new[metric][1][2]
-        metric_score['realsumm_trad'] = realsumm_trad[metric][1][2]
-        metric_score['realsumm_new'] = realsumm_new[metric][1][2]
+        metric_score['newsroom_trad'] = np.median(newsroom_trad[metric], axis=0)[2]
+        metric_score['newsroom_new'] = np.median(newsroom_new[metric], axis=0)[2]
+        metric_score['realsumm_trad'] = np.median(realsumm_trad[metric], axis=0)[2]
+        metric_score['realsumm_new'] = np.median(realsumm_new[metric], axis=0)[2]
         rouge_scores[metric] = metric_score
     with open('results/analysis/rouge.json', 'w') as outfile:
         json.dump(rouge_scores, outfile, indent=4)
@@ -76,9 +83,28 @@ def bleurt_analysis(newsroom_results: dict, realsumm_results: dict) -> None:
         json.dump(bleurt_score, outfile, indent=4)
 
 
+def corr_analysis(corr_results: dict) -> None:
+    results = dict()
+    datasets = ['newsroom', 'realsumm']
+    metrics = ['rouge', 'bertscore', 'bleurt']
+    approaches = ['trad', 'new']
+    for dataset in datasets:
+        results[dataset] = dict()
+        for metric in metrics:
+            results[dataset][metric] = dict()
+            for approach in approaches:
+                results[dataset][metric][approach] = dict()
+                for corr_type in corr_results[dataset][metric][approach].keys():
+                    values = list(corr_results[dataset][metric][approach][corr_type].values())
+                    results[dataset][metric][approach][corr_type] = np.mean(values)
+    with open('results/analysis/corr.json', 'w') as outfile:
+        json.dump(results, outfile, indent=4)
+
+
 if __name__ == '__main__':
-    newsroom_results, realsumm_results = read_results()
+    newsroom_results, realsumm_results, corr_results = read_results()
     rouge_analysis(newsroom_results, realsumm_results)
     bertscore_analysis(newsroom_results, realsumm_results)
     bleu_analysis(newsroom_results, realsumm_results)
     bleurt_analysis(newsroom_results, realsumm_results)
+    corr_analysis(corr_results)
diff --git a/corr.py b/corr.py
new file mode 100644
index 0000000..85c207a
--- /dev/null
+++ b/corr.py
@@ -0,0 +1,148 @@
+import json
+import pickle
+
+import numpy as np
+
+import dataloader.newsroom as newsroom
+import dataloader.realsumm as realsumm
+
+import scipy.stats
+
+model_scores = dict()
+corr = dict()
+approaches = ['trad', 'new']
+
+
+def read_system_scores() -> dict:
+    with open('results/model/scores.json', 'r') as infile:
+        return json.load(infile)
+
+
+def newsroom_read(metrics: list) -> dict:
+    """
+    Return data structure:
+    {
+      docID: {
+        system1: {
+           "Coherence": float,
+           "Fluency": float,
+           "Informativeness": float,
+           "Relevance": float,
+           "precision": float,
+           "recall": float,
+           "f1": float
+        }
+        system2: { ... }
+        ...
+        system7: {... }
+      }
+    }
+    """
+    system_scores = dict()
+    for approach in approaches:
+        system_scores[approach] = dict()
+    _, _, _, human_scores = newsroom.read('dataloader')
+    for i in range(len(human_scores)):
+        for approach in approaches:
+            system_scores[approach][i] = dict()
+            human_keys = human_scores[i].keys()
+            for metric in metrics:
+                if metric != 'bleu':
+                    system_scores[approach][i][metric] = dict()
+                    for key in human_keys:
+                        system_scores[approach][i][metric][key] = human_scores[i][key]
+                    system_keys = model_scores['newsroom'][metric][approach].keys()
+                    for key in system_keys:
+                        system_scores[approach][i][metric][key] = model_scores['newsroom'][metric][approach][key][i]
+    return system_scores
+
+
+def system_judge(scores, metrics_human, metrics_system, correlation_types) -> dict:
+    # ref: suenes.human.newsroom.test_eval
+    all_system_names = list(scores[list(scores.keys())[0]].keys())
+
+    def get_correlation_two_metrics(scores, metric_human, metric_system, correlation_type):
+        mean_score_vector_newsroom = []
+        mean_score_vector_other = []
+        for system in all_system_names:
+            vector_human = []  # scores from a human metric
+            vector_system = []  # scores from a non-human metric
+            for docID in scores.keys():
+                score_local = scores[docID][system]
+                score_newsroom = score_local[metric_human]  # one float
+                score_other = score_local[metric_system]  # one float
+                vector_human.append(score_newsroom)
+                vector_system.append(score_other)
+
+            mean_score_vector_newsroom.append(np.mean(vector_human))
+            mean_score_vector_other.append(np.mean(vector_system))
+        return getattr(scipy.stats, correlation_type)(vector_human, vector_system)[0]
+
+    # now begins the system-level judge
+    correlations = {}
+    for correlation_type in correlation_types:
+        correlations[correlation_type] = {}
+        for metric_human in metrics_human:  # one metric from human
+            for metric_system in metrics_system:  # one metric to evaluate against human
+                correlations[correlation_type] \
+                    [(metric_human, metric_system)] = \
+                    get_correlation_two_metrics(scores, metric_human, metric_system, correlation_type)
+
+    return correlations
+
+
+def realsumm_read(metrics: list) -> dict:
+    _, _, _, dataset_scores = realsumm.read('suenes/human/realsumm/scores_dicts/',
+                                            'suenes/human/realsumm/analysis/test.tsv')
+    system_scores = dict()
+    for approach in approaches:
+        system_scores[approach] = dict()
+        for i in range(len(dataset_scores)):
+            system_scores[approach][i] = dict()
+            for metric in metrics:
+                system_scores[approach][i][metric] = dict()
+                system_scores[approach][i][metric]['litepyramid_recall'] = dataset_scores[i][
+                    'litepyramid_recall']  # human score
+                system_keys = model_scores['realsumm'][metric][approach].keys()
+                for key in system_keys:
+                    system_scores[approach][i][metric][key] = model_scores['realsumm'][metric][approach][key][i]
+    return system_scores
+
+
+def calculate(dataset: str) -> None:
+    corr[dataset] = dict()
+    available_metrics_systems = {
+        'rouge': ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'],
+        'bertscore': ['bertscore'],
+        'bleurt': ['bleurt']
+    }
+    # one metric family at a time, e.g. 'rouge' -> ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
+    for metric_systems_name in available_metrics_systems:
+        metric_systems = available_metrics_systems[metric_systems_name]
+        if dataset == 'newsroom':
+            system_scores = newsroom_read(metric_systems)
+            metrics_human = ['Coherence', 'Informativeness', 'Fluency', 'Relevance']
+        elif dataset == 'realsumm':
+            system_scores = realsumm_read(metric_systems)
+            metrics_human = ['litepyramid_recall']
+        else:
+            raise NotImplementedError()
+        if metric_systems_name == 'bleurt':
+            metrics_system = ['scores']
+        else:
+            metrics_system = ['precision', 'recall', 'f1']
+        correlation_types = ['pearsonr', 'kendalltau', 'spearmanr']
+        my_corr = dict()
+        for approach in approaches:
+            my_corr[approach] = system_judge(system_scores[approach], metrics_human, metrics_system,
+                                             correlation_types)
+        corr[dataset][metric_systems_name] = my_corr
+
+
+if __name__ == '__main__':
+    model_scores = read_system_scores()
+    datasets = ['newsroom', 'realsumm']
+    for dataset in datasets:
+        calculate(dataset)
+    with open('results/model/corr.pkl', 'wb') as outfile:
+        pickle.dump(corr, outfile)
diff --git a/dataloader/newsroom.py b/dataloader/newsroom.py
index 945022f..93191b7 100644
--- a/dataloader/newsroom.py
+++ b/dataloader/newsroom.py
@@ -2,17 +2,26 @@
 import string
 from os import path
 
-def read(path_read: string) -> (list, list, list):
+
+def read(path_read: string) -> (list, list, list, list):
     data = list()
     with open(path.join(path_read, "newsroom-human-eval.csv")) as csvfile:
         reader = csv.DictReader(csvfile)
         for row in reader:
             data.append(row)
-    sys_summaries = list()
-    ref_summaries = list()
-    docs = list()
+    sys_summaries, ref_summaries, docs, scores = list(), list(), list(), list()
     for datum in data:
         sys_summaries.append(datum['SystemSummary'])
         ref_summaries.append(datum['ArticleTitle'])
         docs.append(datum['ArticleText'])
-    return sys_summaries, ref_summaries, docs
+        score = dict()
+        to_copy = ['Coherence', 'Fluency', 'Informativeness', 'Relevance']
+        for i in to_copy:
+            score[i] = int(datum[i + 'Rating'])
+        scores.append(score)
+    return sys_summaries, ref_summaries, docs, scores
+
+
+if __name__ == '__main__':
+    data = read('.')
+    print(data)
diff --git a/dataloader/realsumm.py b/dataloader/realsumm.py
index 71cf045..778b7a9 100644
--- a/dataloader/realsumm.py
+++ b/dataloader/realsumm.py
@@ -1,8 +1,10 @@
 import copy
 import string
 from os import path
+
 import suenes.human.realsumm.analysis.utils as utils
 
+
 def read_summary(path_summary: string) -> (list, list):
     sd_abs_path = path.join(path_summary, "abs_ours.pkl")
     sd_ext_path = path.join(path_summary, "ext_ours.pkl")
@@ -12,13 +14,14 @@
     for doc_id in sd:
         isd_sota_ext = sd_ext[doc_id]
         sd[doc_id]['system_summaries'].update(isd_sota_ext['system_summaries'])
-    sys_summaries = list()
-    ref_summaries = list()
+    sys_summaries, ref_summaries, scores = list(), list(), list()
     for sd_item in sd.items():
         for sys_item in sd_item[1]['system_summaries'].items():
             ref_summaries.append(sd_item[1]['ref_summ'])
             sys_summaries.append(sys_item[1]['system_summary'])
-    return sys_summaries, ref_summaries
+            scores.append(sys_item[1]['scores'])
+    return sys_summaries, ref_summaries, scores
+
 
 def read_docs(path_docs: string) -> list:
     with open(path_docs, 'r') as infile:
@@ -29,7 +32,13 @@
             docs.append(row)
     return docs
 
+
 def read(path_summary: string, path_docs: string) -> (list, list, list):
-    sys_summaries, ref_summaries = read_summary(path_summary)
+    sys_summaries, ref_summaries, scores = read_summary(path_summary)
     docs = read_docs(path_docs)
-    return sys_summaries, ref_summaries, docs
+    return sys_summaries, ref_summaries, docs, scores
+
+
+if __name__ == '__main__':
+    data = read('../suenes/human/realsumm/scores_dicts/', '../suenes/human/realsumm/analysis/test.tsv')
+    print(data)
diff --git a/eval.py b/eval.py
index c65a684..a3bf18e 100644
--- a/eval.py
+++ b/eval.py
@@ -15,7 +15,7 @@ def model_eval(sys_summaries: list, ref_summaries: list, docs: list) -> dict:
     if model_name != 'bleurt':
         model = evaluate.load(model_name)
     else:
-        model = evaluate.load('bleurt', config_name='bleurt-large-512', module_type='metric')
+        model = evaluate.load('bleurt', config_name='BLEURT-20', module_type='metric')
 
     model_result = dict()
 
@@ -23,6 +23,9 @@
     print('Eval trad')
     if model_name == 'bertscore':
         model_result['trad'] = model.compute(predictions=sys_summaries, references=ref_summaries, lang='en')
+    elif model_name == 'rouge':
+        model_result['trad'] = model.compute(predictions=sys_summaries, references=ref_summaries,
+                                             use_aggregator=False)
     else:
         model_result['trad'] = model.compute(predictions=sys_summaries, references=ref_summaries)
 
@@ -30,6 +33,8 @@
     print('Eval new')
     if model_name == 'bertscore':
         model_result['new'] = model.compute(predictions=sys_summaries, references=docs, lang='en')
+    elif model_name == 'rouge':
+        model_result['new'] = model.compute(predictions=sys_summaries, references=docs, use_aggregator=False)
     else:
         model_result['new'] = model.compute(predictions=sys_summaries, references=docs)
 
@@ -39,19 +44,19 @@
 
 
 def realsumm_eval():
-    print('[Realsumm]')
-    sys_summaries, ref_summaries, docs = realsumm.read('suenes/human/realsumm/scores_dicts/',
-                                                       'suenes/human/realsumm/analysis/test.tsv')
+    print('[RealSumm]')
+    sys_summaries, ref_summaries, docs, _ = realsumm.read('suenes/human/realsumm/scores_dicts/',
+                                                          'suenes/human/realsumm/analysis/test.tsv')
     results = model_eval(sys_summaries, ref_summaries, docs)
-    with open('results/realsumm.json', 'w') as outfile:
+    with open('results/model/realsumm.json', 'w') as outfile:
         json.dump(results, outfile, indent=4)
 
 
 def newsroom_eval():
     print('[Newsroom]')
-    sys_summaries, ref_summaries, docs = newsroom.read('dataloader')
+    sys_summaries, ref_summaries, docs, _ = newsroom.read('dataloader')
     results = model_eval(sys_summaries, ref_summaries, docs)
-    with open('results/newsroom.json', 'w') as outfile:
+    with open('results/model/newsroom.json', 'w') as outfile:
         json.dump(results, outfile, indent=4)
 
 
diff --git a/format.py b/format.py
new file mode 100644
index 0000000..0c79153
--- /dev/null
+++ b/format.py
@@ -0,0 +1,69 @@
+import json
+from os import path
+
+results, formatted_scores = dict(), dict()
+approaches = ['trad', 'new']
+datasets = ['newsroom', 'realsumm']
+
+
+def read_result(path_result: str) -> dict:
+    with open(path_result, 'r') as infile:
+        return json.load(infile)
+
+
+def read_results() -> dict:
+    path_results = 'results/model'
+    newsroom_results = read_result(path.join(path_results, 'newsroom.json'))
+    realsumm_results = read_result(path.join(path_results, 'realsumm.json'))
+    return {
+        'newsroom': newsroom_results,
+        'realsumm': realsumm_results
+    }
+
+
+def rouge_format() -> None:
+    metrics = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
+    for dataset in datasets:
+        dataset_scores = dict()
+        for metric in metrics:
+            dataset_scores[metric] = dict()
+            for approach in approaches:
+                approach_scores = results[dataset]['rouge'][approach][metric]
+                formatted_approach_scores = {'precision': list(),
+                                             'recall': list(),
+                                             'f1': list()}
+                for approach_score in approach_scores:
+                    formatted_approach_scores['precision'].append(approach_score[0])
+                    formatted_approach_scores['recall'].append(approach_score[1])
+                    formatted_approach_scores['f1'].append(approach_score[2])
+                dataset_scores[metric][approach] = formatted_approach_scores
+        formatted_scores[dataset] = dataset_scores
+
+
+def bertscore_format() -> None:
+    for dataset in datasets:
+        formatted_scores[dataset]['bertscore'] = dict()
+        for approach in approaches:
+            approach_dict = results[dataset]['bertscore'][approach]
+            del approach_dict['hashcode']
+            formatted_scores[dataset]['bertscore'][approach] = approach_dict
+
+
+def bleurt_format() -> None:
+    for dataset in datasets:
+        formatted_scores[dataset]['bleurt'] = results[dataset]['bleurt']
+
+
+def bleu_format() -> None:
+    for dataset in datasets:
+        formatted_scores[dataset]['bleu'] = results[dataset]['bleu']
+
+
+if __name__ == '__main__':
+    results = read_results()
+    rouge_format()
+    bertscore_format()
+    bleurt_format()
+    bleu_format()
+    with open('results/model/scores.json', 'w') as outfile:
+        json.dump(formatted_scores, outfile, indent=4)