From 2be80d7d23c944c90529b48c852aa7e55b7d889e Mon Sep 17 00:00:00 2001
From: Lucie
Date: Thu, 19 Oct 2023 15:16:51 +0200
Subject: [PATCH] refactor(structure): refactor file structure for pylint

---
 .../{model => }/helpers/__init__.py           |   0
 .../{model => }/helpers/embedding_metrics.py  |   9 +-
 .../{model => }/helpers/language_metrics.py   |   3 +-
 .../{model => }/helpers/llm_metrics.py        | 179 +++++++++++-------
 .../{model => }/helpers/utils.py              |   4 +-
 tests/test_embedding_metrics.py               |   2 +-
 tests/test_helpers.py                         |  12 +-
 tests/test_language_metrics.py                |   2 +-
 tests/test_llm_metrics.py                     |  55 +++---
 9 files changed, 164 insertions(+), 102 deletions(-)
 rename saga_llm_evaluation_ml/{model => }/helpers/__init__.py (100%)
 rename saga_llm_evaluation_ml/{model => }/helpers/embedding_metrics.py (90%)
 rename saga_llm_evaluation_ml/{model => }/helpers/language_metrics.py (91%)
 rename saga_llm_evaluation_ml/{model => }/helpers/llm_metrics.py (68%)
 rename saga_llm_evaluation_ml/{model => }/helpers/utils.py (92%)

diff --git a/saga_llm_evaluation_ml/model/helpers/__init__.py b/saga_llm_evaluation_ml/helpers/__init__.py
similarity index 100%
rename from saga_llm_evaluation_ml/model/helpers/__init__.py
rename to saga_llm_evaluation_ml/helpers/__init__.py
diff --git a/saga_llm_evaluation_ml/model/helpers/embedding_metrics.py b/saga_llm_evaluation_ml/helpers/embedding_metrics.py
similarity index 90%
rename from saga_llm_evaluation_ml/model/helpers/embedding_metrics.py
rename to saga_llm_evaluation_ml/helpers/embedding_metrics.py
index 6af0c38..fa06db7 100644
--- a/saga_llm_evaluation_ml/model/helpers/embedding_metrics.py
+++ b/saga_llm_evaluation_ml/helpers/embedding_metrics.py
@@ -4,7 +4,8 @@
 class BERTScore:
     def __init__(self, model_type="distilbert-base-uncased"):
         """
-        BERTScore computes a similarity score for each token in the candidate sentence with each token in the reference sentence.
+        BERTScore computes a similarity score for each token in the candidate sentence with each
+        token in the reference sentence.
         The final score is the average of the similarity scores of all tokens in the candidate sentence.
 
         Args:
@@ -20,7 +21,8 @@ def compute(self, references, predictions, **kwargs):
             predictions (list): List of candidate sentences.
 
         Returns:
-            list: List of scores for each candidate sentence. Contains a list of scores for precisions, recalls, and F1 scores.
+            list: List of scores for each candidate sentence. Contains a list of scores
+            for precisions, recalls, and F1 scores.
         """
         assert len(references) == len(
             predictions
@@ -39,7 +41,8 @@
 class MAUVE:
     def __init__(self, featurize_model_name="gpt2"):
         """
-        MAUVE score computes the difference between the candidate sentence distribution and the reference sentence distribution.
+        MAUVE score computes the difference between the candidate sentence distribution
+        and the reference sentence distribution.
         The bigger the MAUVE score, the better.
         """
         self.metric = load("mauve")
diff --git a/saga_llm_evaluation_ml/model/helpers/language_metrics.py b/saga_llm_evaluation_ml/helpers/language_metrics.py
similarity index 91%
rename from saga_llm_evaluation_ml/model/helpers/language_metrics.py
rename to saga_llm_evaluation_ml/helpers/language_metrics.py
index 490a3f2..0d887d0 100644
--- a/saga_llm_evaluation_ml/model/helpers/language_metrics.py
+++ b/saga_llm_evaluation_ml/helpers/language_metrics.py
@@ -4,7 +4,8 @@
 class BLEURTScore:
     def __init__(self, checkpoint="BLEURT-tiny"):
         """
-        BLEURT is a learnt metric that uses BERT to compute a similarity score for each token in the candidate sentence with each token in the reference sentence.
+        BLEURT is a learnt metric that uses BERT to compute a similarity score for
+        each token in the candidate sentence with each token in the reference sentence.
 
         Args:
             checkpoint (str, optional): Checkpoint to use. Defaults to BLEURT-tiny if not specified.
diff --git a/saga_llm_evaluation_ml/model/helpers/llm_metrics.py b/saga_llm_evaluation_ml/helpers/llm_metrics.py
similarity index 68%
rename from saga_llm_evaluation_ml/model/helpers/llm_metrics.py
rename to saga_llm_evaluation_ml/helpers/llm_metrics.py
index 3760bf5..d270915 100644
--- a/saga_llm_evaluation_ml/model/helpers/llm_metrics.py
+++ b/saga_llm_evaluation_ml/helpers/llm_metrics.py
@@ -4,7 +4,7 @@
 class GPTScore:
-    def __init__(self):
+    def __init__(self, model="gpt2"):
         """
         GPTScore is a metric which allows to evaluate generative models on a variety of tasks.
         GPTScore(h|d, a, S) = sum_{t=1}^m w_t * log p(h_t | h_{