From 39d7d80fc314101692553cee7636b7f5fd6c6234 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 26 Apr 2024 23:50:03 +0000
Subject: [PATCH] fix 2nd round comments

---
 python/graphstorm/eval/eval_func.py     | 24 ++++++++
 python/graphstorm/eval/evaluator.py     | 76 ++++++++++++++++---------
 python/graphstorm/trainer/lp_trainer.py |  8 +--
 tests/unit-tests/test_evaluator.py      |  4 +-
 4 files changed, 79 insertions(+), 33 deletions(-)

diff --git a/python/graphstorm/eval/eval_func.py b/python/graphstorm/eval/eval_func.py
index a08021d51d..1ca530cb9c 100644
--- a/python/graphstorm/eval/eval_func.py
+++ b/python/graphstorm/eval/eval_func.py
@@ -141,6 +141,14 @@ def __init__(self):
         self.metric_comparator = {}
         self.metric_comparator["mrr"] = operator.le
 
+        # This is the function used to compute each metric during training
+        self.metric_function = {}
+        self.metric_function["mrr"] = compute_mrr
+
+        # This is the function used to compute each metric during validation/testing
+        self.metric_eval_function = {}
+        self.metric_eval_function["mrr"] = compute_mrr
+
     def assert_supported_metric(self, metric):
         """ check if the given metric is supported.
         """
@@ -589,3 +597,19 @@ def compute_mae(pred, labels):
 
     diff = th.abs(pred.cpu() - labels.cpu())
     return th.mean(diff).cpu().item()
+
+def compute_mrr(ranking):
+    """ Compute the link prediction MRR (mean reciprocal rank) metric
+
+    Parameters
+    ----------
+    ranking: tensor
+        Ranking of each positive edge
+
+    Returns
+    -------
+    link prediction mrr metric: tensor
+    """
+    logs = th.div(1.0, ranking)
+    metrics = th.tensor(th.div(th.sum(logs), len(logs)))
+    return metrics
diff --git a/python/graphstorm/eval/evaluator.py b/python/graphstorm/eval/evaluator.py
index 37ed5edd8c..39bf1dd339 100644
--- a/python/graphstorm/eval/evaluator.py
+++ b/python/graphstorm/eval/evaluator.py
@@ -205,7 +205,7 @@ def evaluate(self, val_rankings, test_rankings, total_iters):
         """
 
     @abc.abstractmethod
-    def compute_score(self, rankings):
+    def compute_score(self, rankings, train=True):
         """ Compute evaluation score for Prediction tasks
 
         Ranking-based link prediction evaluators should provide ranking values as input.
@@ -214,6 +214,8 @@ def compute_score(self, rankings):
         ----------
         rankings: dict of tensors
             Rankings of positive scores in format of {etype: ranking}
+        train: boolean
+            Whether the scores are computed during model training.
 
         Returns
         -------
@@ -460,15 +462,12 @@ class GSgnnClassificationEvaluator(GSgnnBaseEvaluator, GSgnnPredictionEvalInterf
         1) consecutive_increase and 2) average_increase.
     """
     def __init__(self, eval_frequency,
-                 eval_metric_list=None,
+                 eval_metric_list=["accuracy"],
                  multilabel=False,
                  use_early_stop=False,
                  early_stop_burnin_rounds=0,
                  early_stop_rounds=3,
                  early_stop_strategy=EARLY_STOP_AVERAGE_INCREASE_STRATEGY): # pylint: disable=unused-argument
-        # set up default metric to be accuracy
-        if eval_metric_list is None:
-            eval_metric_list = ["accuracy"]
         super(GSgnnClassificationEvaluator, self).__init__(eval_frequency,
                                                            eval_metric_list,
                                                            use_early_stop,
@@ -596,14 +595,11 @@ class GSgnnRegressionEvaluator(GSgnnBaseEvaluator, GSgnnPredictionEvalInterface)
         1) consecutive_increase and 2) average_increase.
""" def __init__(self, eval_frequency, - eval_metric_list=None, + eval_metric_list=["rmse"], use_early_stop=False, early_stop_burnin_rounds=0, early_stop_rounds=3, early_stop_strategy=EARLY_STOP_AVERAGE_INCREASE_STRATEGY): - # set up default metric to be "rmse" - if eval_metric_list is None: - eval_metric_list = ["rmse"] super(GSgnnRegressionEvaluator, self).__init__(eval_frequency, eval_metric_list, use_early_stop, early_stop_burnin_rounds, early_stop_rounds, early_stop_strategy) @@ -708,7 +704,7 @@ def compute_score(self, pred, labels, train=True): class GSgnnMrrLPEvaluator(GSgnnBaseEvaluator, GSgnnLPRankingEvalInterface): """ Link Prediction Evaluator using "mrr" as metric. - GS built-in evaluator for Link Prediction task. It uses "mrr" as the default eval metric, + GS built-in evaluator for Link Prediction tasks. It uses "mrr" as the default eval metric, which implements the `GSgnnLPRankingEvalInterface`. To create a customized LP evaluator that use evaluation metric other than "mrr", users might @@ -733,13 +729,11 @@ class GSgnnMrrLPEvaluator(GSgnnBaseEvaluator, GSgnnLPRankingEvalInterface): 1) consecutive_increase and 2) average_increase. """ def __init__(self, eval_frequency, - eval_metric_list=None, + eval_metric_list=["mrr"], use_early_stop=False, early_stop_burnin_rounds=0, early_stop_rounds=3, early_stop_strategy=EARLY_STOP_AVERAGE_INCREASE_STRATEGY): - if eval_metric_list is None: - eval_metric_list = ["mrr"] super(GSgnnMrrLPEvaluator, self).__init__(eval_frequency, eval_metric_list, use_early_stop, early_stop_burnin_rounds, early_stop_rounds, early_stop_strategy) @@ -797,36 +791,50 @@ def evaluate(self, val_rankings, test_rankings, total_iters): return val_score, test_score - def compute_score(self, rankings): + def compute_score(self, rankings, train=True): """ Compute evaluation score Parameters ---------- rankings: dict of tensors Rankings of positive scores in format of {etype: ranking} + train: boolean + If in model training. Returns ------- Evaluation metric values: dict """ # We calculate global mrr, etype is ignored. - # User can develop its own per etype MRR evaluator ranking = [] for _, rank in rankings.items(): ranking.append(rank) ranking = th.cat(ranking, dim=0) - metrics = gen_mrr_score(ranking) + # compute ranking value for each metric + metrics = {} + for metric in self.metric_list: + if train: + # training expects always a single number to be + # returned and has a different (potentially) evluation function + metrics[metric] = self.metrics_obj.metric_function[metric](ranking) + else: + # validation or testing may have a different + # evaluation function, in our case the evaluation code + # may return a dictionary with the metric values for each metric + metrics[metric] = self.metrics_obj.metric_eval_function[metric](ranking) # When world size == 1, we do not need the barrier if get_world_size() > 1: barrier() for _, metric_val in metrics.items(): th.distributed.all_reduce(metric_val) + return_metrics = {} for metric, metric_val in metrics.items(): return_metric = metric_val / get_world_size() return_metrics[metric] = return_metric.item() + return return_metrics @@ -853,14 +861,12 @@ class GSgnnPerEtypeMrrLPEvaluator(GSgnnBaseEvaluator, GSgnnLPRankingEvalInterfac 1) consecutive_increase and 2) average_increase. 
""" def __init__(self, eval_frequency, - eval_metric_list=None, + eval_metric_list=["mrr"], major_etype = LINK_PREDICTION_MAJOR_EVAL_ETYPE_ALL, use_early_stop=False, early_stop_burnin_rounds=0, early_stop_rounds=3, early_stop_strategy=EARLY_STOP_AVERAGE_INCREASE_STRATEGY): - if eval_metric_list is None: - eval_metric_list = ["mrr"] super(GSgnnPerEtypeMrrLPEvaluator, self).__init__(eval_frequency, eval_metric_list, use_early_stop, @@ -878,13 +884,15 @@ def __init__(self, eval_frequency, self._best_test_score[metric] = self.metrics_obj.init_best_metric(metric=metric) self._best_iter[metric] = 0 - def compute_score(self, rankings): + def compute_score(self, rankings, train=True): """ Compute evaluation score Parameters ---------- rankings: dict of tensors Rankings of positive scores in format of {etype: ranking} + train: boolean + If in model training. Returns ------- @@ -892,19 +900,31 @@ def compute_score(self, rankings): """ # We calculate global mrr, etype is ignored. # User can develop its own per etype MRR evaluator - metrics = {} - for etype, rank in rankings.items(): - metrics[etype] = gen_mrr_score(rank) + per_etype_metrics = {} + for etype, ranking in rankings.items(): + # compute ranking value for each metric + metrics = {} + for metric in self.metric_list: + if train: + # training expects always a single number to be + # returned and has a different (potentially) evluation function + metrics[metric] = self.metrics_obj.metric_function[metric](ranking) + else: + # validation or testing may have a different + # evaluation function, in our case the evaluation code + # may return a dictionary with the metric values for each metric + metrics[metric] = self.metrics_obj.metric_eval_function[metric](ranking) + per_etype_metrics[etype] = metrics # When world size == 1, we do not need the barrier if get_world_size() > 1: barrier() - for _, metric in metrics.items(): + for _, metric in per_etype_metrics.items(): for _, metric_val in metric.items(): th.distributed.all_reduce(metric_val) return_metrics = {} - for etype, metric in metrics.items(): + for etype, metric in per_etype_metrics.items(): for metric_key, metric_val in metric.items(): return_metric = metric_val / get_world_size() if metric_key not in return_metrics: @@ -946,7 +966,8 @@ def evaluate(self, val_rankings, test_rankings, total_iters): if test_rankings is not None: test_score = self.compute_score(test_rankings) else: - test_score = {"mrr": "N/A"} # Dummy + for metric in self.metric_list: + test_score = {metric: "N/A"} # Dummy if val_rankings is not None: val_score = self.compute_score(val_rankings) @@ -962,7 +983,8 @@ def evaluate(self, val_rankings, test_rankings, total_iters): self._best_test_score[metric] = major_test_score self._best_iter[metric] = total_iters else: - val_score = {"mrr": "N/A"} # Dummy + for metric in self.metric_list: + val_score = {metric: "N/A"} # Dummy return val_score, test_score diff --git a/python/graphstorm/trainer/lp_trainer.py b/python/graphstorm/trainer/lp_trainer.py index 7f1c178a64..d434c88f49 100644 --- a/python/graphstorm/trainer/lp_trainer.py +++ b/python/graphstorm/trainer/lp_trainer.py @@ -323,16 +323,16 @@ def eval(self, model, data, val_loader, test_loader, edge_mask=edge_mask_for_gnn_embeddings, task_tracker=self.task_tracker) sys_tracker.check('compute embeddings') - val_rankings = lp_mini_batch_predict(model, emb, val_loader, self.device) \ + val_scores = lp_mini_batch_predict(model, emb, val_loader, self.device) \ if val_loader is not None else None 
         sys_tracker.check('after_val_score')
         if test_loader is not None:
-            test_rankings = lp_mini_batch_predict(model, emb, test_loader, self.device)
+            test_scores = lp_mini_batch_predict(model, emb, test_loader, self.device)
         else:
-            test_rankings = None
+            test_scores = None
         sys_tracker.check('after_test_score')
         val_score, test_score = self.evaluator.evaluate(
-            val_rankings, test_rankings, total_steps)
+            val_scores, test_scores, total_steps)
         sys_tracker.check('evaluate validation/test')
 
         model.train()
diff --git a/tests/unit-tests/test_evaluator.py b/tests/unit-tests/test_evaluator.py
index 2d332a4aae..50dae032e9 100644
--- a/tests/unit-tests/test_evaluator.py
+++ b/tests/unit-tests/test_evaluator.py
@@ -172,7 +172,7 @@ def test_mrr_per_etype_lp_evaluation():
 
     th.distributed.destroy_process_group()
 
-def test_lp_evaluator():
+def test_mrr_lp_evaluator():
    # system heavily depends on th distributed
    dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
        master_ip='127.0.0.1', master_port='12346')
@@ -852,7 +852,7 @@ def test_get_val_score_rank():
 if __name__ == '__main__':
     # test evaluators
     test_mrr_per_etype_lp_evaluation()
-    test_lp_evaluator()
+    test_mrr_lp_evaluator()
     test_regression_evaluator()
     test_early_stop_avg_increase_judge()
     test_early_stop_cons_increase_judge()
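
A quick way for reviewers to sanity-check the new compute_mrr helper is sketched
below. This is illustrative only and not part of the diff above: it assumes the
patch has been applied so that compute_mrr is importable from
graphstorm.eval.eval_func, and the ranking values are made up.

    import torch as th

    from graphstorm.eval.eval_func import compute_mrr

    # Hypothetical ranks of three positive edges among their negatives:
    # they ranked 1st, 2nd and 4th.
    ranking = th.tensor([1.0, 2.0, 4.0])

    mrr = compute_mrr(ranking)
    # MRR = mean(1/1, 1/2, 1/4) = 7/12 ~= 0.583
    assert th.isclose(mrr, th.tensor(7.0 / 12.0))
    print(mrr.item())

The same tensor shape is what the evaluators pass per edge type, so the helper
can be exercised without setting up a distributed process group.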