feat(embedding_metrics): implement BERTScore and MAUVE and add unit t… #1

Merged · 4 commits · Oct 18, 2023
4 changes: 2 additions & 2 deletions .github/workflows/ci.yaml
@@ -44,8 +44,8 @@ jobs:

  - name: Run formatter
    run: |
-     poetry run black --check saga_predictor tests
+     poetry run black --check saga_llm_evaluation_ml tests

  - name: Run linter
    run: |
-     poetry run pylint saga_predictor tests
+     poetry run pylint saga_llm_evaluation_ml tests
1,851 changes: 1,788 additions & 63 deletions poetry.lock

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions pyproject.toml
@@ -9,6 +9,11 @@ python = "^3.8,<3.11"
transformers = "^4.21.1"
numpy = "^1.21.2"
spacy = "^3.1.3"
+evaluate = "^0.4.1"
+scikit-learn = "^1.3.1"
+mauve-text = "^0.3.0"
+bert-score = "^0.3.13"
+torch = ">=2.0.0, !=2.0.1, !=2.1.0"

[tool.poetry.dev-dependencies]
pylint = "^2.13"
@@ -23,6 +28,10 @@ url = "https://pypiserver.sagacify.com/"
default = false
secondary = true


+[tool.poetry.group.dev.dependencies]
+pytest = "^7.4.2"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
64 changes: 64 additions & 0 deletions saga_llm_evaluation_ml/model/helpers/embedding_metrics.py
@@ -0,0 +1,64 @@
from evaluate import load


class BERTScore:
    def __init__(self, model_type="distilbert-base-uncased"):
        """
        BERTScore matches each token in the candidate sentence to its most
        similar token in the reference sentence using contextual embeddings,
        then aggregates those token similarities into precision, recall, and
        F1 scores for each candidate sentence.

        Args:
            model_type (str, optional): Model type to use. Defaults to
                "distilbert-base-uncased".
        """
        self.model_type = model_type
        self.metric = load("bertscore")

    def compute(self, references, predictions, **kwargs):
        """
        Args:
            references (list): List of reference sentences.
            predictions (list): List of candidate sentences.

        Returns:
            dict: Dictionary with lists of precision, recall, and F1 scores,
                one entry per candidate sentence.
        """
        assert isinstance(references, list), "References must be a list."
        assert isinstance(predictions, list), "Predictions must be a list."
        assert len(references) == len(
            predictions
        ), "Number of references and predictions must be equal."

        return self.metric.compute(
            predictions=predictions,
            references=references,
            model_type=self.model_type,
            **kwargs
        )


class MAUVE:
    def __init__(self):
        """
        MAUVE measures the gap between the distribution of candidate
        (machine-generated) text and the distribution of reference (human)
        text. Scores lie between 0 and 1; the higher the MAUVE score, the
        closer the two distributions, the better.
        """
        self.metric = load("mauve")
        self.featurize_model_name = "gpt2"

    def compute(self, references, predictions, **kwargs):
        """
        Args:
            references (list): List of reference sentences.
            predictions (list): List of candidate sentences.

        Returns:
            An object whose `mauve` attribute holds the corpus-level MAUVE
            score for the given predictions and references.
        """
        return self.metric.compute(
            predictions=predictions,
            references=references,
            featurize_model_name=self.featurize_model_name,
            **kwargs
        )
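For reviewers who want to exercise the new helpers directly, here is a minimal usage sketch. It assumes the package and the dependencies added in this PR (evaluate, bert-score, mauve-text, torch) are installed; the sentences are illustrative only.

from saga_llm_evaluation_ml.model.helpers.embedding_metrics import BERTScore, MAUVE

references = ["The cat sat on the mat."]
predictions = ["The dog sat on the mat."]

# BERTScore returns per-sentence lists of precision, recall, and F1.
bertscore = BERTScore()
bert_scores = bertscore.compute(references, predictions)
print(bert_scores["f1"][0])

# MAUVE returns a single corpus-level result; the score is on the
# `mauve` attribute. MAUVE compares text distributions, so these
# single-sentence inputs are for illustration only.
mauve = MAUVE()
mauve_result = mauve.compute(references, predictions)
print(mauve_result.mauve)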
8 changes: 8 additions & 0 deletions tests/__init__.py
@@ -0,0 +1,8 @@
import os
import sys

# Make the application source importable when the tests run inside the
# container (assumed layout: /www/app is the project root).
MODULE_ROOT = os.path.abspath("/www/app/src")
sys.path.append(MODULE_ROOT)

PROJ_ROOT = os.path.abspath("/www/app")
sys.path.append(PROJ_ROOT)
Empty file removed tests/test.py
Empty file.
58 changes: 58 additions & 0 deletions tests/test_embedding_metrics.py
@@ -0,0 +1,58 @@
import unittest

from saga_llm_evaluation_ml.model.helpers.embedding_metrics import BERTScore, MAUVE


class TestBERTScore(unittest.TestCase):
    def test_compute(self):
        """Tests that BERTScore returns one precision, recall, and F1 score
        per input pair, and that identical inputs yield identical scores."""
        references = ["The cat sat on the mat.", "The dog sat on the log."]
        predictions = ["The cat sat on the mat.", "The dog sat on the log."]
        bertscore = BERTScore()
        scores = bertscore.compute(references, predictions)
        self.assertEqual(len(scores["precision"]), len(references))
        self.assertEqual(len(scores["recall"]), len(references))
        self.assertEqual(len(scores["f1"]), len(references))

        scores_2 = bertscore.compute(references, predictions)
        self.assertEqual(scores["precision"], scores_2["precision"])
        self.assertEqual(scores["recall"], scores_2["recall"])
        self.assertEqual(scores["f1"], scores_2["f1"])

    def test_compute_improved_input(self):
        """Tests that the BERTScore improves for a better prediction."""
        reference = "The cat sat on the mat."
        prediction = "The dog sat on the mat."
        better_prediction = "The cat sat on the mat."

        bertscore = BERTScore()

        scores = bertscore.compute([reference], [prediction])
        better_scores = bertscore.compute([reference], [better_prediction])

        self.assertGreater(better_scores["f1"][0], scores["f1"][0])


class TestMAUVE(unittest.TestCase):
    def test_compute(self):
        """Tests that MAUVE computes the same score when given the same inputs."""
        mauve = MAUVE()
        references = ["The cat sat on the mat.", "The dog sat on the log."]
        predictions = ["The cat sat on the mat.", "The dog sat on the log."]
        scores = mauve.compute(references, predictions)
        scores_2 = mauve.compute(references, predictions)
        self.assertEqual(scores.mauve, scores_2.mauve)

    def test_compute_improved_input(self):
        """Tests that the MAUVE score improves for a better prediction."""
        reference = "The cat sat on the mat."
        prediction = "The dog sat on the mat."
        better_prediction = "The cat sat on the mat."

        mauve = MAUVE()

        scores = mauve.compute([reference], [prediction])
        better_scores = mauve.compute([reference], [better_prediction])

        self.assertGreater(better_scores.mauve, scores.mauve)
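Assuming the dev dependencies from this PR are installed, the new suite can be run locally with: poetry run pytest tests/test_embedding_metrics.py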