feat(gptscore): revamp to make multiple predictions at once
LucieNvz committed Oct 19, 2023
1 parent 09d2cba commit ce999f8
Showing 2 changed files with 160 additions and 114 deletions.
195 changes: 106 additions & 89 deletions saga_llm_evaluation_ml/model/helpers/llm_metrics.py
@@ -16,9 +16,91 @@ def __init__(self):
and theta are model parameters.
GPTScore does not require any reference text.
"""
self.huggingface_models = ["meta-llama/Llama-2-7b-chat-hf", "gpt2", "mistralai/Mistral-7B-v0.1"]
self.aspects = [
"COV",
"FAC",
"FLU",
"CON",
"INF",
"COH",
"REL",
"ACC",
"MQM",
"INT",
"ENG",
"SPE",
"COR",
"SEM",
"UND",
"ERR",
"DIV",
"DEP",
"LIK",
"FLE",
"INQ",
]
self.models = ["meta-llama/Llama-2-7b-chat-hf", "gpt-3.5-turbo", "gpt2"]
self.tasks = ["summ", "MT", "D2T", "diag"]

def get_prompt(self, a, d, src, pred):
"""
This method returns a prompt built from the template matching the given aspect and task, filled with the source text and candidate sentence.
Args:
a (str): Aspect to evaluate.
d (str): Task description.
src (str): Source text.
pred (str): Candidate sentence.
Returns:
str: Prompt template.
"""

templates = {
"summ": {
"FAC": f"Generate a summary with consistent facts for the following text: {src}\n\nTl;dr{pred}",
"COV": f"Generate a summary with as much semantic coverage as possible for the following text: {src}\n\nTl;dr{pred}",
"CON": f"Generate factually consistent summary for the following text: {src}\n\nTl;dr{pred}",
"INF": f"Generate an informative summary that captures the key points of the following text:{src}\n\nTl;dr{pred}",
"COH": f"Generate a coherent summary for the following text: {src}\n\nTl;dr{pred}",
"REL": f"Generate a relevant summary with consistent details for the following text: {src}\n\nTl;dr{pred}",
"FLU": f"Generate a fluent and grammatical summary for the following text: {src}\n\nTl;dr{pred}",
},
"MT": {
"ACC": f"Rewrite the following text with its core information and consistent facts:{src} In other words, {pred}",
"FLU": f"Rewrite the following text to make it more grammatical and well-written:{src} In other words,{pred}",
"MQM": f"Rewrite the following text into high-quality text with its core information:{src} In other words,{pred}",
},
"D2T": {
"INF": f"Convert the following text to another expression that preserves key information:\n\n{src} In other words, {pred}",
"NAT": f"Convert the following text into another expression that is human-like and natural:\n\n{src} In other words, {pred}",
"FLU": f"Convert the following text into another expression that preserves key information and is human-like and natural:\n\n{src} In other words, {pred}",
},
"diag": {
"COH": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI coherent and maintains a good conversation flow throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"DIV": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is there diversity in the AI responses? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"FLE": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI flexible and adaptable to human and their interests? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"UND": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI seem to understand the human? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"INQ": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI inquisitive throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"CON": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI consistent in the information it provides throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"INF": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI informative throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"LIK": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI display a likeable personality? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"DEP": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI discuss topics in depth? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"ERR": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI able to recover from errors that it makes? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
},
}

# Check that the corresponding entry exists in the prompt template
assert a in templates[d], f"Aspect {a} is not available for task {d}."
# Check that the prompt template is not empty
assert templates[d][
a
], f"Prompt template for aspect {a} and task {d} is non-existent. Please specify a prompt template."


return templates[d][a]

def compute(
self, src, pred, model="gpt2", prompt=None, a=None, d=None, api_key=None
self, sources, preds, model="gpt2", prompts=None, a=None, d=None, api_key=None
):
"""
This method computes GPTScore for a list of candidate sentences given a task description, an aspect to evaluate and context information.
@@ -52,8 +134,8 @@ def compute(
- (diag): Dialogue. Generate an engaging and informative response based on the dialogue history.
Args:
src (str): Source text.
pred (str): Candidate sentence.
sources (list of str): Source texts.
preds (list of str): Candidate sentences.
model (str): Model name. If None, a default model is used.
prompt (str): Prompt template. If None, a default prompt template is used.
a (list): List of aspects to evaluate.
@@ -63,50 +145,15 @@ def compute(
Returns:
list: List of scores for each candidate sentence.
"""
prompts = {
"summ": {
"FAC": f"Generate a summary with consistent facts for the following text: {src}\n\nTl;dr{pred}",
"COV": f"Generate a summary with as much semantic coverage as possible for the following text: {src}\n\nTl;dr{pred}",
"CON": f"Generate factually consistent summary for the following text: {src}\n\nTl;dr{pred}",
"INF": f"Generate an informative summary that captures the key points of the following text:{src}\n\nTl;dr{pred}",
"COH": f"Generate a coherent summary for the following text: {src}\n\nTl;dr{pred}",
"REL": f"Generate a relevant summary with consistent details for the following text: {src}\n\nTl;dr{pred}",
"FLU": f"Generate a fluent and grammatical summary for the following text: {src}\n\nTl;dr{pred}",
},
"MT": {
"ACC": f"Rewrite the following text with its core information and consistent facts:{src} In other words, {pred}",
"FLU": f"Rewrite the following text to make it more grammatical and well-written:{src} In other words,{pred}",
"MQM": f"Rewrite the following text into high-quality text with its core information:{src} In other words,{pred}",
},
"D2T": {
"INF": f"Convert the following text to another expression that preserves key information:\n\n{src} In other words, {pred}",
"NAT": f"Convert the following text into another expression that is human-like and natural:\n\n{src} In other words, {pred}",
"FLU": f"Convert the following text into another expression that preserves key information and is human-like and natural:\n\n{src} In other words, {pred}",
},
"diag": {
"COH": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI coherent and maintains a good conversation flow throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"DIV": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is there diversity in the AI responses? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"FLE": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI flexible and adaptable to human and their interests? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"UND": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI seem to understand the human? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"INQ": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI inquisitive throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"CON": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI consistent in the information it provides throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"INF": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI informative throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"LIK": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI display a likeable personality? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"DEP": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI discuss topics in depth? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"ERR": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI able to recover from errors that it makes? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
},
}
assert isinstance(sources, list) and isinstance(sources[0], str), "Sources must be a list of strings."
assert isinstance(preds, list) and isinstance(preds[0], str), "Predictions must be a list of strings."

assert isinstance(src, str), "Source must be a string."
assert isinstance(pred, str), "Prediction must be a string."
assert isinstance(model, str), "Model must be a string."
# If model is not in the list of models, raise an error
models = ["meta-llama/Llama-2-7b-chat-hf", "gpt-3.5-turbo", "gpt2"]
assert model in models, f"Model must be one of {models}."
assert model in self.models, f"Model must be one of {self.models}."

# If prompt is given, check that it is a string
if prompt:
assert isinstance(prompt, str), "Prompt must be a string."
# If prompts are given, check that they form a list of strings
if prompts:
assert isinstance(prompts, list) and isinstance(prompts[0], str), "Prompts must be a list of strings."
assert not a, "Aspect must not be given if prompt is given."
assert not d, "Task must not be given if prompt is given."
else:
@@ -117,53 +164,21 @@ def compute(
# If aspect is given, check that it is a string
if a:
assert isinstance(a, str), "Aspect must be a string."
aspects = [
"COV",
"FAC",
"FLU",
"CON",
"INF",
"COH",
"REL",
"ACC",
"MQM",
"INT",
"ENG",
"SPE",
"COR",
"SEM",
"UND",
"ERR",
"DIV",
"DEP",
"LIK",
"FLE",
"INQ",
]
assert a in aspects, f"Aspect must be one of {aspects}."
assert a in self.aspects, f"Aspect must be one of {self.aspects}."

# If task is given, check that it is a string
if d:
assert isinstance(d, str), "Task must be a string."
tasks = ["summ", "MT", "D2T", "diag"]
assert d in tasks, f"Task must be one of {tasks}."

if a and d:
# Check that the corresponding entry exists in the prompt template
assert a in prompts[d], f"Aspect {a} is not available for task {d}."
# Check that the prompt template is not empty
assert prompts[d][
a
], f"Prompt template for aspect {a} and task {d} is non-existent. Please specify a prompt template."
assert d in self.tasks, f"Task must be one of {self.tasks}."

# Generative LLM is given a prompt template and some context information
prompt = prompt if prompt else prompts[d][a]
prompts = prompts if prompts else [self.get_prompt(a, d, src, pred) for (src, pred) in zip(sources, preds)]

# Model predicts log-likelihood of the next token given the previous tokens and the prompt template
if model == "meta-llama/Llama-2-7b-chat-hf" or model == "gpt2":
if model in self.huggingface_models:
tokenizer = AutoTokenizer.from_pretrained(model)
llm = AutoModelForCausalLM.from_pretrained(model)
inputs = tokenizer(prompt, return_tensors="pt")
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token  # batching needs a pad token (gpt2 defines none)
inputs = tokenizer(prompts, return_tensors="pt", padding=True)

outputs = llm.generate(
**inputs,
@@ -176,22 +191,24 @@ def compute(
outputs.sequences, outputs.scores, normalize_logits=True
)

logprobs = np.array(transition_scores[0].tolist())
print(logprobs)
logprobs = np.array(transition_scores.tolist())

elif model == "gpt-3.5-turbo":
openai.api_key = api_key
response = openai.Completion.create(
model=model,
prompt=prompt,
prompt=prompts,
logprobs=5,
)

# One choice is returned per prompt; keep each choice's per-token log-probabilities
logprobs = [choice["logprobs"]["token_logprobs"] for choice in response["choices"]]

# Compute GPTScore
score = 0
for i, _ in enumerate(pred.split()):
score += logprobs[i]
# Compute GPTScores
scores = []
for i, pred in enumerate(preds):
pred_tokens = pred.split()
pred_logprobs = logprobs[i][: len(pred_tokens)]
score = np.mean(pred_logprobs)
scores.append(score)

return score
return scores
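
For reference, the sketch below shows how the revamped batch interface could be called after this commit. It is a minimal usage example, not part of the commit: the class name GPTScore, the import path (inferred from the changed file's location), and the sample texts are assumptions; aspect "FAC" and task "summ" come from the templates defined above.

from saga_llm_evaluation_ml.model.helpers.llm_metrics import GPTScore  # assumed class name and import path

metric = GPTScore()

# Batched inputs: one source text and one candidate summary per example (illustrative only)
sources = [
    "The cat sat on the mat and refused to move all afternoon.",
    "The new library branch opens next Monday with extended hours.",
]
preds = [
    "A cat stayed on the mat for the whole afternoon.",
    "The library opens Monday with longer hours.",
]

# Evaluate factual consistency ("FAC") for the summarization task ("summ").
# One prompt per (source, prediction) pair is built via get_prompt, and one
# score (average token log-probability) is returned per candidate.
scores = metric.compute(sources, preds, model="gpt2", a="FAC", d="summ")
print(scores)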
