
Commit

script: format model output, load human score, compute and analyze correlation

fix: #1, #2
TURX committed Jun 7, 2022
1 parent bc68f57 commit b486b9f
Showing 6 changed files with 293 additions and 27 deletions.
46 changes: 36 additions & 10 deletions analysis.py
@@ -1,19 +1,26 @@
import json
import pickle
from os import path

import numpy as np


def read_result(path_result: str) -> dict:
def read_json(path_result: str) -> dict:
with open(path_result, 'r') as infile:
return json.load(infile)


def read_results() -> (dict, dict):
def read_pkl(path_result: str) -> dict:
with open(path_result, 'rb') as infile:
return pickle.load(infile)


def read_results() -> (dict, dict, dict):
path_results = 'results'
newsroom_results = read_result(path.join(path_results, 'newsroom.json'))
realsumm_results = read_result(path.join(path_results, 'realsumm.json'))
return newsroom_results, realsumm_results
newsroom_results = read_json(path.join(path_results, 'model/newsroom.json'))
realsumm_results = read_json(path.join(path_results, 'model/realsumm.json'))
corr_results = read_pkl(path.join(path_results, 'model/corr.pkl'))
return newsroom_results, realsumm_results, corr_results


def extract_results(metric_name: str, newsroom_results: dict, realsumm_results: dict) -> (dict, dict, dict, dict):
@@ -31,10 +38,10 @@ def rouge_analysis(newsroom_results: dict, realsumm_results: dict) -> None:
metrics = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
for metric in metrics:
metric_score = dict()
metric_score['newsroom_trad'] = newsroom_trad[metric][1][2]
metric_score['newsroom_new'] = newsroom_new[metric][1][2]
metric_score['realsumm_trad'] = realsumm_trad[metric][1][2]
metric_score['realsumm_new'] = realsumm_new[metric][1][2]
metric_score['newsroom_trad'] = np.median(newsroom_trad[metric], axis=1)[2]
metric_score['newsroom_new'] = np.median(newsroom_new[metric], axis=1)[2]
metric_score['realsumm_trad'] = np.median(realsumm_trad[metric], axis=1)[2]
metric_score['realsumm_new'] = np.median(realsumm_new[metric], axis=1)[2]
rouge_scores[metric] = metric_score
with open('results/analysis/rouge.json', 'w') as outfile:
json.dump(rouge_scores, outfile, indent=4)
@@ -76,9 +83,28 @@ def bleurt_analysis(newsroom_results: dict, realsumm_results: dict) -> None:
json.dump(bleurt_score, outfile, indent=4)


def corr_analysis(corr_results: dict) -> None:
results = dict()
datasets = ['newsroom', 'realsumm']
metrics = ['rouge', 'bertscore', 'bleurt']
approaches = ['trad', 'new']
for dataset in datasets:
results[dataset] = dict()
for metric in metrics:
results[dataset][metric] = dict()
for approach in approaches:
results[dataset][metric][approach] = dict()
for corr_type in corr_results[dataset][metric][approach].keys():
values = list(corr_results[dataset][metric][approach][corr_type].values())
results[dataset][metric][approach][corr_type] = np.mean(values)
with open('results/analysis/corr.json', 'w') as outfile:
json.dump(results, outfile, indent=4)


if __name__ == '__main__':
newsroom_results, realsumm_results = read_results()
newsroom_results, realsumm_results, corr_results = read_results()
rouge_analysis(newsroom_results, realsumm_results)
bertscore_analysis(newsroom_results, realsumm_results)
bleu_analysis(newsroom_results, realsumm_results)
bleurt_analysis(newsroom_results, realsumm_results)
corr_analysis(corr_results)
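
For reference, the averaging step in corr_analysis can be sketched in isolation. The nested input layout mirrors what corr.py (added below) pickles to results/model/corr.pkl; the coefficient values here are made-up placeholders, not numbers produced by this commit.

import numpy as np

corr_results = {
    'newsroom': {
        'rouge': {
            'trad': {
                'pearsonr': {('Coherence', 'f1'): 0.5, ('Fluency', 'f1'): 0.25},
            },
        },
    },
}

summary = {}
for dataset, by_metric in corr_results.items():
    summary[dataset] = {}
    for metric, by_approach in by_metric.items():
        summary[dataset][metric] = {}
        for approach, by_corr_type in by_approach.items():
            # collapse every (human metric, model metric) pair into one mean coefficient
            summary[dataset][metric][approach] = {
                corr_type: float(np.mean(list(pairs.values())))
                for corr_type, pairs in by_corr_type.items()
            }

print(summary)  # {'newsroom': {'rouge': {'trad': {'pearsonr': 0.375}}}}
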
148 changes: 148 additions & 0 deletions corr.py
@@ -0,0 +1,148 @@
import json
import pickle

import numpy as np

import dataloader.newsroom as newsroom
import dataloader.realsumm as realsumm

import scipy.stats

model_scores = dict()
corr = dict()
approaches = ['trad', 'new']


def read_system_scores() -> dict:
with open('results/model/scores.json', 'r') as infile:
return json.load(infile)


def newsroom_read(metrics: list) -> dict:
"""
Return data structure:
{
docID: {
system1: {
"Coherence": float,
"Fluency": float,
"Informativeness": float,
"Relevance": float,
"precision": float,
"recall": float,
"f1": float
}
system2: { ... }
...
system7: {... }
}
}
"""
system_scores = dict()
for approach in approaches:
system_scores[approach] = dict()
_, _, _, human_scores = newsroom.read('dataloader')
for i in range(len(human_scores)):
for approach in approaches:
system_scores[approach][i] = dict()
human_keys = human_scores[i].keys()
for metric in metrics:
if metric != 'bleu':
system_scores[approach][i][metric] = dict()
for key in human_keys:
system_scores[approach][i][metric][key] = human_scores[i][key]
system_keys = model_scores['newsroom'][metric][approach].keys()
for key in system_keys:
system_scores[approach][i][metric][key] = model_scores['newsroom'][metric][approach][key][i]
return system_scores


def system_judge(scores, metrics_human, metrics_system, correlation_types) -> dict:
# ref: suenes.human.newsroom.test_eval
all_system_names = list(scores[list(scores.keys())[0]].keys())

def get_correlation_two_metrics(scores, metric_human, metric_system, correlation_type):
mean_score_vector_newsroom = []
mean_score_vector_other = []
for system in all_system_names:
vector_human = [] # scores from a human metric
vector_system = [] # scores from a non-human metric
for docID in scores.keys():
score_local = scores[docID][system]
score_newsroom = score_local[metric_human] # one float
score_other = score_local[metric_system] # one float
vector_human.append(score_newsroom)
vector_system.append(score_other)

mean_score_vector_newsroom.append(np.mean(vector_human))
mean_score_vector_other.append(np.mean(vector_system))
        # system-level: correlate the per-system mean scores, not a single system's per-doc scores
        return getattr(scipy.stats, correlation_type)(mean_score_vector_newsroom, mean_score_vector_other)[0]

# now begins the system-level judge
correlations = {}
for correlation_type in correlation_types:
correlations[correlation_type] = {}
for metric_human in metrics_human: # one metric from human
for metric_system in metrics_system: # one metric to evaluate against human
correlations[correlation_type] \
[(metric_human, metric_system)] = \
get_correlation_two_metrics(scores, metric_human, metric_system, correlation_type)

return correlations


def realsumm_read(metrics: list) -> dict:
_, _, _, dataset_scores = realsumm.read('suenes/human/realsumm/scores_dicts/',
'suenes/human/realsumm/analysis/test.tsv')
system_scores = dict()
for approach in approaches:
system_scores[approach] = dict()
for i in range(len(dataset_scores)):
system_scores[approach][i] = dict()
for metric in metrics:
system_scores[approach][i][metric] = dict()
system_scores[approach][i][metric]['litepyramid_recall'] = dataset_scores[i][
'litepyramid_recall'] # human score
system_keys = model_scores['realsumm'][metric]['trad'].keys()
for key in system_keys:
system_scores[approach][i][metric][key] = model_scores['realsumm'][metric][approach][key][i]
return system_scores


def calculate(dataset: str) -> None:
corr[dataset] = dict()
available_metrics_systems = {
'rouge': ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'],
'bertscore': ['bertscore'],
'bleurt': ['bleurt']
}
    for metric_systems_name, metric_systems in available_metrics_systems.items():
if dataset == 'newsroom':
system_scores = newsroom_read(metric_systems)
metrics_human = ['Coherence', 'Informativeness', 'Fluency', 'Relevance']
elif dataset == 'realsumm':
system_scores = realsumm_read(metric_systems)
metrics_human = ['litepyramid_recall']
else:
raise NotImplementedError()
if metric_systems_name == 'bleurt':
metrics_system = ['scores']
else:
metrics_system = ['precision', 'recall', 'f1']
correlation_types = ['pearsonr', 'kendalltau', 'spearmanr']
my_corr = dict()
for approach in approaches:
my_corr[approach] = system_judge(system_scores[approach], metrics_human, metrics_system,
correlation_types)
corr[dataset][metric_systems_name] = my_corr


if __name__ == '__main__':
model_scores = read_system_scores()
datasets = ['newsroom', 'realsumm']
for dataset in datasets:
calculate(dataset)
with open('results/model/corr.pkl', 'wb') as outfile:
pickle.dump(corr, outfile)
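
The core of corr.py is system_judge: for each "system" it averages a human metric and a model metric over all documents, then correlates the two per-system mean vectors with scipy.stats. A self-contained sketch with made-up scores (two documents, three systems, 'Coherence' as the human metric and 'f1' as the model metric):

import numpy as np
import scipy.stats

scores = {
    0: {'sys_a': {'Coherence': 4.0, 'f1': 0.62},
        'sys_b': {'Coherence': 2.0, 'f1': 0.35},
        'sys_c': {'Coherence': 3.0, 'f1': 0.50}},
    1: {'sys_a': {'Coherence': 5.0, 'f1': 0.70},
        'sys_b': {'Coherence': 3.0, 'f1': 0.41},
        'sys_c': {'Coherence': 4.0, 'f1': 0.55}},
}
systems = list(next(iter(scores.values())).keys())

# one mean per system, taken over all documents
human_means = [np.mean([scores[d][s]['Coherence'] for d in scores]) for s in systems]
model_means = [np.mean([scores[d][s]['f1'] for d in scores]) for s in systems]

rho, _ = scipy.stats.spearmanr(human_means, model_means)
print(rho)  # 1.0: the model metric ranks the three systems exactly as the human metric does
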
19 changes: 14 additions & 5 deletions dataloader/newsroom.py
@@ -2,17 +2,26 @@
import string
from os import path

def read(path_read: string) -> (list, list, list):

def read(path_read: string) -> (list, list, list, list):
data = list()
with open(path.join(path_read, "newsroom-human-eval.csv")) as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
data.append(row)
sys_summaries = list()
ref_summaries = list()
docs = list()
sys_summaries, ref_summaries, docs, scores = list(), list(), list(), list()
for datum in data:
sys_summaries.append(datum['SystemSummary'])
ref_summaries.append(datum['ArticleTitle'])
docs.append(datum['ArticleText'])
return sys_summaries, ref_summaries, docs
score = dict()
to_copy = ['Coherence', 'Fluency', 'Informativeness', 'Relevance']
for i in to_copy:
score[i] = int(datum[i + 'Rating'])
scores.append(score)
return sys_summaries, ref_summaries, docs, scores


if __name__ == '__main__':
data = read('.')
print(data)
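
A usage sketch for the extended Newsroom loader; it assumes newsroom-human-eval.csv sits under dataloader/, as in eval.py, and reads the per-example ratings dict added by this commit:

import numpy as np

import dataloader.newsroom as newsroom

sys_summaries, ref_summaries, docs, scores = newsroom.read('dataloader')
print(len(sys_summaries) == len(scores))          # one ratings dict per system summary
print(scores[0])                                  # {'Coherence': ..., 'Fluency': ..., 'Informativeness': ..., 'Relevance': ...}
print(np.mean([s['Relevance'] for s in scores]))  # corpus-level mean Relevance rating
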
19 changes: 14 additions & 5 deletions dataloader/realsumm.py
@@ -1,8 +1,10 @@
import copy
import string
from os import path

import suenes.human.realsumm.analysis.utils as utils


def read_summary(path_summary: string) -> (list, list):
sd_abs_path = path.join(path_summary, "abs_ours.pkl")
sd_ext_path = path.join(path_summary, "ext_ours.pkl")
@@ -12,13 +14,14 @@ def read_summary(path_summary: string) -> (list, list):
for doc_id in sd:
isd_sota_ext = sd_ext[doc_id]
sd[doc_id]['system_summaries'].update(isd_sota_ext['system_summaries'])
sys_summaries = list()
ref_summaries = list()
sys_summaries, ref_summaries, scores = list(), list(), list()
for sd_item in sd.items():
for sys_item in sd_item[1]['system_summaries'].items():
ref_summaries.append(sd_item[1]['ref_summ'])
sys_summaries.append(sys_item[1]['system_summary'])
return sys_summaries, ref_summaries
scores.append(sys_item[1]['scores'])
return sys_summaries, ref_summaries, scores


def read_docs(path_docs: string) -> list:
with open(path_docs, 'r') as infile:
@@ -29,7 +32,13 @@ def read_docs(path_docs: string) -> list:
docs.append(row)
return docs


def read(path_summary: string, path_docs: string) -> (list, list, list):
sys_summaries, ref_summaries = read_summary(path_summary)
sys_summaries, ref_summaries, scores = read_summary(path_summary)
docs = read_docs(path_docs)
return sys_summaries, ref_summaries, docs
return sys_summaries, ref_summaries, docs, scores


if __name__ == '__main__':
data = read('../suenes/human/realsumm/scores_dicts/', '../suenes/human/realsumm/analysis/test.tsv')
print(data)
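
A matching usage sketch for the extended RealSumm loader; the paths mirror eval.py and corr.py, and 'litepyramid_recall' is the human score key that corr.py reads from these dicts:

import dataloader.realsumm as realsumm

sys_summaries, ref_summaries, docs, scores = realsumm.read(
    'suenes/human/realsumm/scores_dicts/',
    'suenes/human/realsumm/analysis/test.tsv')
assert len(sys_summaries) == len(scores)          # one scores dict per (doc, system) pair
print(scores[0]['litepyramid_recall'])            # human LitePyramid recall for the first pair
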
19 changes: 12 additions & 7 deletions eval.py
@@ -15,21 +15,26 @@ def model_eval(sys_summaries: list, ref_summaries: list, docs: list) -> dict:
if model_name != 'bleurt':
model = evaluate.load(model_name)
else:
model = evaluate.load('bleurt', config_name='bleurt-large-512', module_type='metric')
model = evaluate.load('bleurt', config_name='BLEURT-20', module_type='metric')

model_result = dict()

# calculate traditional (reference, system summary) pairs
print('Eval trad')
if model_name == 'bertscore':
model_result['trad'] = model.compute(predictions=sys_summaries, references=ref_summaries, lang='en')
elif model_name == 'rouge':
model_result['trad'] = model.compute(predictions=sys_summaries, references=ref_summaries,
use_aggregator=False)
else:
model_result['trad'] = model.compute(predictions=sys_summaries, references=ref_summaries)

# calculate new (document, system summary) pairs
print('Eval new')
if model_name == 'bertscore':
model_result['new'] = model.compute(predictions=sys_summaries, references=docs, lang='en')
elif model_name == 'rouge':
model_result['new'] = model.compute(predictions=sys_summaries, references=docs, use_aggregator=False)
else:
model_result['new'] = model.compute(predictions=sys_summaries, references=docs)

@@ -39,19 +44,19 @@ def model_eval(sys_summaries: list, ref_summaries: list, docs: list) -> dict:


def realsumm_eval():
print('[Realsumm]')
sys_summaries, ref_summaries, docs = realsumm.read('suenes/human/realsumm/scores_dicts/',
'suenes/human/realsumm/analysis/test.tsv')
print('[RealSumm]')
sys_summaries, ref_summaries, docs, _ = realsumm.read('suenes/human/realsumm/scores_dicts/',
'suenes/human/realsumm/analysis/test.tsv')
results = model_eval(sys_summaries, ref_summaries, docs)
with open('results/realsumm.json', 'w') as outfile:
with open('results/model/realsumm.json', 'w') as outfile:
json.dump(results, outfile, indent=4)


def newsroom_eval():
print('[Newsroom]')
sys_summaries, ref_summaries, docs = newsroom.read('dataloader')
sys_summaries, ref_summaries, docs, _ = newsroom.read('dataloader')
results = model_eval(sys_summaries, ref_summaries, docs)
with open('results/newsroom.json', 'w') as outfile:
with open('results/model/newsroom.json', 'w') as outfile:
json.dump(results, outfile, indent=4)
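
The functional changes in eval.py are per-sample ROUGE scores (use_aggregator=False), the BLEURT-20 checkpoint, and writing results under results/model/. A minimal, standalone sketch of the per-sample ROUGE call with toy inputs (the summaries below are made up):

import evaluate

rouge = evaluate.load('rouge')
result = rouge.compute(
    predictions=['the cat sat on the mat', 'a quick brown fox'],
    references=['the cat was on the mat', 'the quick brown fox jumps'],
    use_aggregator=False)
print(result['rouge1'])  # a list with one entry per (prediction, reference) pair instead of a single aggregate
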

