feat(gptscore): revamp to make multiple predictions at once
LucieNvz committed Oct 19, 2023
1 parent 09d2cba commit ce999f8
Showing 2 changed files with 160 additions and 114 deletions.
195 changes: 106 additions & 89 deletions saga_llm_evaluation_ml/model/helpers/llm_metrics.py
@@ -16,9 +16,91 @@ def __init__(self):
and theta are model parameters.
GPTScore does not require any reference text.
"""
self.huggingface_models = ["meta-llama/Llama-2-7b-chat-hf", "gpt2", "mistralai/Mistral-7B-v0.1"]
self.aspects = [
"COV",
"FAC",
"FLU",
"CON",
"INF",
"COH",
"REL",
"ACC",
"MQM",
"INT",
"ENG",
"SPE",
"COR",
"SEM",
"UND",
"ERR",
"DIV",
"DEP",
"LIK",
"FLE",
"INQ",
]
self.models = ["meta-llama/Llama-2-7b-chat-hf", "gpt-3.5-turbo", "gpt2"]
self.tasks = ["summ", "MT", "D2T", "diag"]

def get_prompt(self, a, d, src, pred):
"""
This method returns a prompt built from the template matching the given aspect and task, filled with the source text and candidate sentence.
Args:
a (str): Aspect to evaluate.
d (str): Task description.
src (str): Source text.
pred (str): Candidate sentence.
Returns:
str: Prompt template.
"""

templates = {
"summ": {
"FAC": f"Generate a summary with consistent facts for the following text: {src}\n\nTl;dr{pred}",
"COV": f"Generate a summary with as much semantic coverage as possible for the following text: {src}\n\nTl;dr{pred}",
"CON": f"Generate factually consistent summary for the following text: {src}\n\nTl;dr{pred}",
"INF": f"Generate an informative summary that captures the key points of the following text:{src}\n\nTl;dr{pred}",
"COH": f"Generate a coherent summary for the following text: {src}\n\nTl;dr{pred}",
"REL": f"Generate a relevant summary with consistent details for the following text: {src}\n\nTl;dr{pred}",
"FLU": f"Generate a fluent and grammatical summary for the following text: {src}\n\nTl;dr{pred}",
},
"MT": {
"ACC": f"Rewrite the following text with its core information and consistent facts:{src} In other words, {pred}",
"FLU": f"Rewrite the following text to make it more grammatical and well-written:{src} In other words,{pred}",
"MQM": f"Rewrite the following text into high-quality text with its core information:{src} In other words,{pred}",
},
"D2T": {
"INF": f"Convert the following text to another expression that preserves key information:\n\n{src} In other words, {pred}",
"NAT": f"Convert the following text into another expression that is human-like and natural:\n\n{src} In other words, {pred}",
"FLU": f"Convert the following text into another expression that preserves key information and is human-like and natural:\n\n{src} In other words, {pred}",
},
"diag": {
"COH": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI coherent and maintains a good conversation flow throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"DIV": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is there diversity in the AI responses? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"FLE": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI flexible and adaptable to human and their interests? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"UND": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI seem to understand the human? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"INQ": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI inquisitive throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"CON": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI consistent in the information it provides throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"INF": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI informative throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"LIK": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI display a likeable personality? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"DEP": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI discuss topics in depth? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"ERR": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI able to recover from errors that it makes? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
},
}

# Check that the corresponding entry exists in the prompt template
assert a in templates[d], f"Aspect {a} is not available for task {d}."
# Check that the prompt template is not empty
assert templates[d][
a
], f"Prompt template for aspect {a} and task {d} is non-existent. Please specify a prompt template."


return templates[d][a]

def compute(
self, src, pred, model="gpt2", prompt=None, a=None, d=None, api_key=None
self, sources, preds, model="gpt2", prompts=None, a=None, d=None, api_key=None
):
"""
This method computes GPTScore for a list of candidate sentences given a task description, an aspect to evaluate and context information.
@@ -52,8 +134,8 @@ def compute(
- (diag): Dialogue. Generate an engaging and informative response based on the dialogue history.
Args:
src (str): Source text.
pred (str): Candidate sentence.
sources (list of str): Source texts.
preds (list of str): Candidate sentences.
model (str): Model name. If None, a default model is used.
prompt (str): Prompt template. If None, a default prompt template is used.
a (list): List of aspects to evaluate.
@@ -63,50 +145,15 @@ def compute(
Returns:
list: List of scores for each candidate sentence.
"""
prompts = {
"summ": {
"FAC": f"Generate a summary with consistent facts for the following text: {src}\n\nTl;dr{pred}",
"COV": f"Generate a summary with as much semantic coverage as possible for the following text: {src}\n\nTl;dr{pred}",
"CON": f"Generate factually consistent summary for the following text: {src}\n\nTl;dr{pred}",
"INF": f"Generate an informative summary that captures the key points of the following text:{src}\n\nTl;dr{pred}",
"COH": f"Generate a coherent summary for the following text: {src}\n\nTl;dr{pred}",
"REL": f"Generate a relevant summary with consistent details for the following text: {src}\n\nTl;dr{pred}",
"FLU": f"Generate a fluent and grammatical summary for the following text: {src}\n\nTl;dr{pred}",
},
"MT": {
"ACC": f"Rewrite the following text with its core information and consistent facts:{src} In other words, {pred}",
"FLU": f"Rewrite the following text to make it more grammatical and well-written:{src} In other words,{pred}",
"MQM": f"Rewrite the following text into high-quality text with its core information:{src} In other words,{pred}",
},
"D2T": {
"INF": f"Convert the following text to another expression that preserves key information:\n\n{src} In other words, {pred}",
"NAT": f"Convert the following text into another expression that is human-like and natural:\n\n{src} In other words, {pred}",
"FLU": f"Convert the following text into another expression that preserves key information and is human-like and natural:\n\n{src} In other words, {pred}",
},
"diag": {
"COH": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI coherent and maintains a good conversation flow throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"DIV": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is there diversity in the AI responses? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"FLE": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI flexible and adaptable to human and their interests? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"UND": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI seem to understand the human? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"INQ": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI inquisitive throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"CON": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI consistent in the information it provides throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"INF": f"Answer the question based on the conversation between a human and AI.\nQuestion: Are the responses of AI informative throughout the conversation? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"LIK": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI display a likeable personality? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"DEP": f"Answer the question based on the conversation between a human and AI.\nQuestion: Does the AI discuss topics in depth? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
"ERR": f"Answer the question based on the conversation between a human and AI.\nQuestion: Is the AI able to recover from errors that it makes? (a) Yes. (b) No.\nConversation: {src + pred}\nAnswer: Yes.",
},
}
assert isinstance(sources, list) and isinstance(sources[0], str), "Sources must be a list of strings."
assert isinstance(preds, list) and isinstance(preds[0], str), "Predictions must be a list of strings."

assert isinstance(src, str), "Source must be a string."
assert isinstance(pred, str), "Prediction must be a string."
assert isinstance(model, str), "Model must be a string."
# If model is not in the list of models, raise an error
models = ["meta-llama/Llama-2-7b-chat-hf", "gpt-3.5-turbo", "gpt2"]
assert model in models, f"Model must be one of {models}."
assert model in self.models, f"Model must be one of {self.models}."

# If prompt is given, check that it is a string
if prompt:
assert isinstance(prompt, str), "Prompt must be a string."
# If prompts are given, check that they form a list of strings
if prompts:
assert isinstance(prompts, list) and isinstance(prompts[0], str), "Prompts must be a list of strings."
assert not a, "Aspect must not be given if prompt is given."
assert not d, "Task must not be given if prompt is given."
else:
@@ -117,53 +164,21 @@ def compute(
# If aspect is given, check that it is a string
if a:
assert isinstance(a, str), "Aspect must be a string."
aspects = [
"COV",
"FAC",
"FLU",
"CON",
"INF",
"COH",
"REL",
"ACC",
"MQM",
"INT",
"ENG",
"SPE",
"COR",
"SEM",
"UND",
"ERR",
"DIV",
"DEP",
"LIK",
"FLE",
"INQ",
]
assert a in aspects, f"Aspect must be one of {aspects}."
assert a in self.aspects, f"Aspect must be one of {self.aspects}."

# If task is given, check that it is a string
if d:
assert isinstance(d, str), "Task must be a string."
tasks = ["summ", "MT", "D2T", "diag"]
assert d in tasks, f"Task must be one of {tasks}."

if a and d:
# Check that the corresponding entry exists in the prompt template
assert a in prompts[d], f"Aspect {a} is not available for task {d}."
# Check that the prompt template is not empty
assert prompts[d][
a
], f"Prompt template for aspect {a} and task {d} is non-existent. Please specify a prompt template."
assert d in self.tasks, f"Task must be one of {self.tasks}."

# Generative LLM is given a prompt template and some context information
prompt = prompt if prompt else prompts[d][a]
prompts = prompts if prompts else [self.get_prompt(a, d, src, pred) for (src, pred) in zip(sources, preds)]

# Model predicts log-likelihood of the next token given the previous tokens and the prompt template
if model == "meta-llama/Llama-2-7b-chat-hf" or model == "gpt2":
if model in self.huggingface_models:
tokenizer = AutoTokenizer.from_pretrained(model)
llm = AutoModelForCausalLM.from_pretrained(model)
inputs = tokenizer(prompt, return_tensors="pt")
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token  # batching needs a pad token (gpt2 defines none)
inputs = tokenizer(prompts, return_tensors="pt", padding=True)

outputs = llm.generate(
**inputs,
@@ -176,22 +191,24 @@ def compute(
outputs.sequences, outputs.scores, normalize_logits=True
)

logprobs = np.array(transition_scores[0].tolist())
print(logprobs)
logprobs = np.array(transition_scores.tolist())

elif model == "gpt-3.5-turbo":
openai.api_key = api_key
response = openai.Completion.create(
model=model,
prompt=prompt,
prompt=prompts,
logprobs=5,
)

# One choice is returned per prompt; keep each choice's per-token log-probabilities
logprobs = [choice["logprobs"]["token_logprobs"] for choice in response["choices"]]

# Compute GPTScore
score = 0
for i, _ in enumerate(pred.split()):
score += logprobs[i]
# Compute GPTScores
scores = []
for i, pred in enumerate(preds):
pred_tokens = pred.split()
pred_logprobs = logprobs[i][: len(pred_tokens)]
score = np.mean(pred_logprobs)
scores.append(score)

return score
return scores
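
For reference, the sketch below shows how the revamped batch interface could be called after this commit. It is a minimal usage example, not part of the commit: the class name GPTScore, the import path (inferred from the changed file's location), and the sample texts are assumptions; aspect "FAC" and task "summ" come from the templates defined above.

from saga_llm_evaluation_ml.model.helpers.llm_metrics import GPTScore  # assumed class name and import path

metric = GPTScore()

# Batched inputs: one source text and one candidate summary per example (illustrative only)
sources = [
    "The cat sat on the mat and refused to move all afternoon.",
    "The new library branch opens next Monday with extended hours.",
]
preds = [
    "A cat stayed on the mat for the whole afternoon.",
    "The library opens Monday with longer hours.",
]

# Evaluate factual consistency ("FAC") for the summarization task ("summ").
# One prompt per (source, prediction) pair is built via get_prompt, and one
# score (average token log-probability) is returned per candidate.
scores = metric.compute(sources, preds, model="gpt2", a="FAC", d="summ")
print(scores)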
