refactor(structure): refactor file structure for pylint
LucieNvz committed Oct 19, 2023
1 parent 4f44004 commit 2be80d7
Showing 9 changed files with 164 additions and 102 deletions.
File renamed without changes.
@@ -4,7 +4,8 @@
class BERTScore:
def __init__(self, model_type="distilbert-base-uncased"):
"""
BERTScore computes a similarity score for each token in the candidate sentence with each token in the reference sentence.
BERTScore computes a similarity score for each token in the candidate sentence with each
token in the reference sentence.
The final score is the average of the similarity scores of all tokens in the candidate sentence.
Args:
@@ -20,7 +21,8 @@ def compute(self, references, predictions, **kwargs):
predictions (list): List of candidate sentences.
Returns:
list: List of scores for each candidate sentence. Contains a list of scores for precisions, recalls, and F1 scores.
list: List of scores for each candidate sentence. Contains a list of scores
for precisions, recalls, and F1 scores.
"""
assert len(references) == len(
predictions
@@ -39,7 +41,8 @@ def compute(self, references, predictions, **kwargs):
class MAUVE:
def __init__(self, featurize_model_name="gpt2"):
"""
MAUVE score computes the difference between the candidate sentence distribution and the reference sentence distribution.
MAUVE score computes the difference between the candidate sentence distribution
and the reference sentence distribution.
The bigger the MAUVE score, the better.
"""
self.metric = load("mauve")
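Editorial note, not part of the commit: a minimal usage sketch of the refactored BERTScore wrapper, using the import path from the updated tests further down. The example sentences are illustrative; per the docstring above, the returned scores bundle precisions, recalls, and F1 scores per candidate sentence.

from saga_llm_evaluation_ml.helpers.embedding_metrics import BERTScore

# Illustrative data, not taken from the repository.
references = ["The cat sat on the mat."]
predictions = ["A cat was sitting on the mat."]

bert = BERTScore()  # defaults to model_type="distilbert-base-uncased"
scores = bert.compute(references=references, predictions=predictions)
# scores contains precision, recall, and F1 entries for each candidate sentence.
print(scores)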
@@ -4,7 +4,8 @@
class BLEURTScore:
def __init__(self, checkpoint="BLEURT-tiny"):
"""
BLEURT is a learnt metric that uses BERT to compute a similarity score for each token in the candidate sentence with each token in the reference sentence.
BLEURT is a learnt metric that uses BERT to compute a similarity score for
each token in the candidate sentence with each token in the reference sentence.
Args:
checkpoint (str, optional): Checkpoint to use. Defaults to BLEURT-tiny if not specified.
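Editorial note, not part of the commit: a usage sketch for BLEURTScore. The compute signature is an assumption (the method body is collapsed in this view) mirroring BERTScore.compute above, and it assumes the BLEURT-tiny checkpoint can be fetched by the underlying metric.

from saga_llm_evaluation_ml.helpers.language_metrics import BLEURTScore

bleurt = BLEURTScore()  # defaults to checkpoint="BLEURT-tiny"
# Assumed call signature, analogous to BERTScore.compute.
scores = bleurt.compute(
    references=["The cat sat on the mat."],
    predictions=["A cat was sitting on the mat."],
)
print(scores)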
@@ -4,7 +4,7 @@


class GPTScore:
def __init__(self):
def __init__(self, model="gpt2"):
"""
GPTScore is a metric that evaluates generative models on a variety of tasks.
GPTScore(h|d, a, S) = sum_{t=1}^m w_t * log p(h_t | h_{<t}, T(d, a, S), theta)
@@ -15,12 +15,17 @@ def __init__(self):
S: context information.
and theta are model parameters.
GPTScore does not require any reference text.
Args:
model (str, optional): Model name. Defaults to "gpt2".
"""
self.model = model
self.huggingface_models = [
"meta-llama/Llama-2-7b-chat-hf",
"gpt2",
"mistralai/Mistral-7B-v0.1",
]
self.openai_models = ["gpt-3.5-turbo"]
self.aspects = [
"COV",
"FAC",
@@ -44,15 +49,87 @@ def __init__(self):
"FLE",
"INQ",
]
self.models = ["meta-llama/Llama-2-7b-chat-hf", "gpt-3.5-turbo", "gpt2"]
self.models = [
"meta-llama/Llama-2-7b-chat-hf",
"gpt-3.5-turbo",
"gpt2",
"mistralai/Mistral-7B-v0.1",
]
self.tasks = ["summ", "MT", "D2T", "diag"]

def get_prompt(self, a, d, src, pred):
assert isinstance(model, str), "Model must be a string."
assert model in self.models, f"Model must be one of {self.models}."

def huggingface_logprobs(self, prompts):
"""
This method outputs the log-likelihood of the next token given the previous tokens and the prompt template.
Args:
prompts (list of str): List of prompt templates.
Returns:
list: List of log-likelihoods for each candidate sentence.
"""

tokenizer = AutoTokenizer.from_pretrained(self.model)
llm = AutoModelForCausalLM.from_pretrained(self.model)
inputs = tokenizer(prompts, return_tensors="pt")

outputs = llm.generate(
**inputs,
max_new_tokens=50,
return_dict_in_generate=True,
output_scores=True,
)

logprobs = np.array(
llm.compute_transition_scores(
outputs.sequences, outputs.scores, normalize_logits=True
).tolist()
)

return logprobs
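Editorial note, not part of the commit: with normalize_logits=True, compute_transition_scores yields one log-probability per generated token, so each row of the array above can be read back as per-token probabilities. A tiny sketch with hypothetical numbers:

import numpy as np

# Hypothetical log-probabilities for three generated tokens of one prompt.
row = np.array([-0.4, -1.2, -0.7])
token_probs = np.exp(row)         # per-token probabilities
mean_logprob = float(row.mean())  # the quantity GPTScore.compute averages later
print(token_probs, mean_logprob)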

def openai_logprobs(self, prompts, api_key):
"""
This method outputs the log-likelihood of the next token given the previous tokens and the prompt template.
Args:
prompts (list of str): List of prompt templates.
api_key (str): OpenAI API key.
Returns:
list: List of log-likelihoods for each candidate sentence.
"""
openai.api_key = api_key
outputs = openai.Completion.create(
model=self.model,
prompt=prompts,
logprobs=5,
)

logprobs = outputs["choices"][0]["logprobs"]

return logprobs

def get_prompts(self, aspect, task, sources, preds):
"""
This method returns a list of prompt templates given a task description, and an aspect to evaluate.
Args:
aspect (str): Aspect to evaluate.
task (str): Task description.
sources (list of str): Source texts.
preds (list of str): Candidate sentences.
Returns:
list: List of prompt templates.
"""
return [
self.get_prompt(aspect, task, src, pred)
for (src, pred) in zip(sources, preds)
]

def get_prompt(self, aspect, task, src, pred):
"""
This method returns a prompt template given a task description, and an aspect to evaluate.
Args:
a (str): Aspect to evaluate.
d (str): Task description.
aspect (str): Aspect to evaluate.
task (str): Task description.
src (str): Source text.
pred (str): Candidate sentence.
Returns:
@@ -94,21 +171,25 @@ def get_prompt(self, a, d, src, pred):
}

# Check that the corresponding entry exists in the prompt template
assert a in templates[d], f"Aspect {a} is not available for task {d}."
assert (
aspect in templates[task]
), f"Aspect {aspect} is not available for task {task}."
# Check that the prompt template is not empty
assert templates[d][
a
], f"Prompt template for aspect {a} and task {d} is non-existent. Please specify a prompt template."
assert templates[task][
aspect
], f"Prompt template for aspect {aspect} and task {task} is non-existent. Please specify a prompt template."

return templates[d][a]
return templates[task][aspect]

def compute(
self, sources, preds, model="gpt2", prompts=None, a=None, d=None, api_key=None
self, sources, preds, prompts=None, aspect=None, task=None, api_key=None
):
"""
This method computes GPTScore for a list of candidate sentences given a task description, an aspect to evaluate and context information.
This method computes GPTScore for a list of candidate sentences given a task description,
an aspect to evaluate and context information.
The possible values for aspect are:
- (COV): Semantic coverage. How many semantic content units from the reference text are covered by the generated text?
- (COV): Semantic coverage. How many semantic content units from the reference text
are covered by the generated text?
- (FAC): Factuality. Does the generated text preserve the factual statements of the source text?
- (FLU): Fluency. Is the generated text well-written and grammatical?
- (CON): Consistency. Is the generated text consistent in the information it provides?
@@ -139,10 +220,9 @@ def compute(
Args:
sources (list of str): Source texts.
preds (list of str): Candidate sentences.
model (str): Model name. If None, a default model is used.
prompt (str): Prompt template. If None, a default prompt template is used.
a (list): List of aspects to evaluate.
d (str): Task description.
prompts (str): Prompt template. If None, a default prompt template is used.
aspect (list): List of aspects to evaluate.
task (str): Task description.
api_key (str): OpenAI API key.
Returns:
@@ -155,75 +235,46 @@ def compute(
preds[0], str
), "Prediction must be a list of strings."

assert isinstance(model, str), "Model must be a string."
assert model in self.models, f"Model must be one of {self.models}."
assert isinstance(self.model, str), "Model must be a string."
assert self.model in self.models, f"Model must be one of {self.models}."

# If prompt is given, check that it is a list of string
if prompts:
assert isinstance(prompts, list) and isinstance(
prompts[0], str
), "Prompts must be a list of strings."
assert not a, "Aspect must not be given if prompt is given."
assert not d, "Task must not be given if prompt is given."
assert not aspect, "Aspect must not be given if prompt is given."
assert not task, "Task must not be given if prompt is given."
else:
# If prompt is not given, check that task and aspect are given
assert a, "Aspect must be given if prompt is not given."
assert d, "Task must be given if prompt is not given."
assert aspect, "Aspect must be given if prompt is not given."
assert task, "Task must be given if prompt is not given."

# If aspect is given, check that it is a string
if a:
assert isinstance(a, str), "Aspect must be a string."
assert a in self.aspects, f"Aspect must be one of {self.aspects}."
if aspect:
assert isinstance(aspect, str), "Aspect must be a string."
assert aspect in self.aspects, f"Aspect must be one of {self.aspects}."

# If task is given, check that it is a string
if d:
assert isinstance(d, str), "Task must be a string."
assert d in self.tasks, f"Task must be one of {self.tasks}."
if task:
assert isinstance(task, str), "Task must be a string."
assert task in self.tasks, f"Task must be one of {self.tasks}."

# Generative LLM is given a prompt template and some context information
prompts = (
prompts
if prompts
else [
self.get_prompt(a, d, src, pred) for (src, pred) in zip(sources, preds)
]
)
prompts = prompts or self.get_prompts(aspect, task, sources, preds)

# Model predicts log-likelihood of the next token given the previous tokens and the prompt template
if model in self.huggingface_models:
tokenizer = AutoTokenizer.from_pretrained(model)
llm = AutoModelForCausalLM.from_pretrained(model)
inputs = tokenizer(prompts, return_tensors="pt")

outputs = llm.generate(
**inputs,
max_new_tokens=50,
return_dict_in_generate=True,
output_scores=True,
)

transition_scores = llm.compute_transition_scores(
outputs.sequences, outputs.scores, normalize_logits=True
)

logprobs = np.array(transition_scores.tolist())

elif model == "gpt-3.5-turbo":
openai.api_key = api_key
response = openai.Completion.create(
model=model,
prompt=prompts,
logprobs=5,
)
if self.model in self.huggingface_models:
logprobs = self.huggingface_logprobs(prompts)

logprobs = response["choices"][0]["logprobs"]
elif self.model in self.openai_models:
logprobs = self.openai_logprobs(prompts, api_key)

# Compute GPTScores
scores = []
for i, pred in enumerate(preds):
pred_tokens = pred.split()
pred_logprobs = logprobs[i][: len(pred_tokens)]
score = np.mean(pred_logprobs)
scores.append(score)
scores.append(np.mean(pred_logprobs))

return scores
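Editorial note, not part of the commit: a usage sketch of the refactored GPTScore API, where the model is now chosen in the constructor rather than in compute. The import path is an assumption (only the class body appears in this diff), and the aspect/task pair assumes a matching entry in the collapsed templates dict.

from saga_llm_evaluation_ml.helpers.llm_metrics import GPTScore  # assumed module path

sources = ["Hi, how can I help you today?"]
preds = ["Hello, what can I do for you?"]

scorer = GPTScore(model="gpt2")  # "gpt2" is a Hugging Face model, so no api_key is needed
scores = scorer.compute(
    sources,
    preds,
    aspect="FLU",  # fluency, one of the aspects listed in the docstring
    task="diag",   # dialogue, one of self.tasks
)
# scores[i] is the mean of the first len(preds[i].split()) generation log-probabilities,
# so higher (less negative) values indicate text the model finds more likely.
print(scores)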
@@ -18,7 +18,7 @@ class MetadataExtractor:
def __init__(self):
self.metadata_extractor = MetafeatureExtractorsRunner()

def addWordRegexMatchesCount(self, regex_rule, name=None):
def add_word_regex_matches_count(self, regex_rule, name=None):
"""
Adds a regex rule to the metadata extractor.
For a given regex return the number of words matching the regex.
@@ -30,7 +30,7 @@ def addWordRegexMatchesCount(self, regex_rule, name=None):
WordRegexMatchesCount(regex=regex_rule, name=name)
)

def addRegexMatchCount(self, regex_rule, name=None):
def add_regex_match_count(self, regex_rule, name=None):
"""
Adds a regex rule to the metadata extractor.
For a given regex return the number of matches it has in the text.
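Editorial note, not part of the commit: the two renamed helpers differ only in what they count, whole words matching the regex versus every raw match in the text. A short sketch mirroring the updated tests below; the exact metafeature names in the result depend on the underlying extractors.

from saga_llm_evaluation_ml.helpers.utils import MetadataExtractor

extractor = MetadataExtractor()
extractor.add_word_regex_matches_count("the")  # counts words matching the regex
extractor.add_regex_match_count("the")         # counts every match of the regex in the text

metadata = extractor.compute("The cat sat on the mat.")
print(metadata)  # computed metafeatures, including the two regex-based counts above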
2 changes: 1 addition & 1 deletion tests/test_embedding_metrics.py
@@ -1,6 +1,6 @@
import unittest

from saga_llm_evaluation_ml.model.helpers.embedding_metrics import BERTScore, MAUVE
from saga_llm_evaluation_ml.helpers.embedding_metrics import BERTScore, MAUVE


class TestBERTScore(unittest.TestCase):
12 changes: 7 additions & 5 deletions tests/test_helpers.py
@@ -1,6 +1,6 @@
import unittest

from saga_llm_evaluation_ml.model.helpers.utils import MetadataExtractor
from saga_llm_evaluation_ml.helpers.utils import MetadataExtractor


class TestMetadataExtractor(unittest.TestCase):
@@ -25,8 +25,8 @@ def test_add_regex(self):
"""Tests that the MetadataExtractor class extracts the correct metadata when regex rules are added."""
text = "The cat sat on the mat."
extractor = MetadataExtractor()
extractor.addWordRegexMatchesCount("the")
extractor.addRegexMatchCount("the")
extractor.add_word_regex_matches_count("the")
extractor.add_regex_match_count("the")
metadata = extractor.compute(text)

# Test a few metadata values
@@ -45,8 +45,10 @@ def test_add_regex(self):
len_metadata = len(metadata)

# Check that the metadata is longer when multiple regex rules are added
extractor.addWordRegexMatchesCount("cat", name="word_regex_matches_count_cat")
extractor.addRegexMatchCount("cat", name="regex_match_count_cat")
extractor.add_word_regex_matches_count(
"cat", name="word_regex_matches_count_cat"
)
extractor.add_regex_match_count("cat", name="regex_match_count_cat")
metadata = extractor.compute(text)

self.assertGreater(len(metadata), len_metadata)
2 changes: 1 addition & 1 deletion tests/test_language_metrics.py
@@ -1,6 +1,6 @@
import unittest

from saga_llm_evaluation_ml.model.helpers.language_metrics import BLEURTScore
from saga_llm_evaluation_ml.helpers.language_metrics import BLEURTScore


class TestBLEURTScore(unittest.TestCase):