refactor(structure): refactor file structure for pylint
LucieNvz committed Oct 19, 2023
1 parent 4f44004 commit 2be80d7
Showing 9 changed files with 164 additions and 102 deletions.
File renamed without changes.
@@ -4,7 +4,8 @@
class BERTScore:
def __init__(self, model_type="distilbert-base-uncased"):
"""
BERTScore computes a similarity score for each token in the candidate sentence with each token in the reference sentence.
BERTScore computes a similarity score for each token in the candidate sentence with each
token in the reference sentence.
The final score is the average of the similarity scores of all tokens in the candidate sentence.
Args:
@@ -20,7 +21,8 @@ def compute(self, references, predictions, **kwargs):
predictions (list): List of candidate sentences.
Returns:
list: List of scores for each candidate sentence. Contains a list of scores for precisions, recalls, and F1 scores.
list: List of scores for each candidate sentence. Contains a list of scores
for precisions, recalls, and F1 scores.
"""
assert len(references) == len(
predictions
@@ -39,7 +41,8 @@ def compute(self, references, predictions, **kwargs):
class MAUVE:
def __init__(self, featurize_model_name="gpt2"):
"""
MAUVE score computes the difference between the candidate sentence distribution and the reference sentence distribution.
MAUVE score computes the difference between the candidate sentence distribution
and the reference sentence distribution.
The bigger the MAUVE score, the better.
"""
self.metric = load("mauve")
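Editorial note, not part of the commit: a minimal usage sketch of the refactored BERTScore wrapper, using the import path from the updated tests further down. The example sentences are illustrative; per the docstring above, the returned scores bundle precisions, recalls, and F1 scores per candidate sentence.

from saga_llm_evaluation_ml.helpers.embedding_metrics import BERTScore

# Illustrative data, not taken from the repository.
references = ["The cat sat on the mat."]
predictions = ["A cat was sitting on the mat."]

bert = BERTScore()  # defaults to model_type="distilbert-base-uncased"
scores = bert.compute(references=references, predictions=predictions)
# scores contains precision, recall, and F1 entries for each candidate sentence.
print(scores)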
@@ -4,7 +4,8 @@
class BLEURTScore:
def __init__(self, checkpoint="BLEURT-tiny"):
"""
BLEURT is a learnt metric that uses BERT to compute a similarity score for each token in the candidate sentence with each token in the reference sentence.
BLEURT is a learnt metric that uses BERT to compute a similarity score for
each token in the candidate sentence with each token in the reference sentence.
Args:
checkpoint (str, optional): Checkpoint to use. Defaults to BLEURT-tiny if not specified.
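Editorial note, not part of the commit: a usage sketch for BLEURTScore. The compute signature is an assumption (the method body is collapsed in this view) mirroring BERTScore.compute above, and it assumes the BLEURT-tiny checkpoint can be fetched by the underlying metric.

from saga_llm_evaluation_ml.helpers.language_metrics import BLEURTScore

bleurt = BLEURTScore()  # defaults to checkpoint="BLEURT-tiny"
# Assumed call signature, analogous to BERTScore.compute.
scores = bleurt.compute(
    references=["The cat sat on the mat."],
    predictions=["A cat was sitting on the mat."],
)
print(scores)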
@@ -4,7 +4,7 @@


class GPTScore:
def __init__(self):
def __init__(self, model="gpt2"):
"""
GPTScore is a metric that evaluates generative models on a variety of tasks.
GPTScore(h|d, a, S) = sum_{t=1}^m w_t * log p(h_t | h_{<t}, T(d, a, S), theta)
@@ -15,12 +15,17 @@ def __init__(self):
S: context information.
and theta are model parameters.
GPTScore does not require any reference text.
Args:
model (str, optional): Model name. Defaults to "gpt2".
"""
self.model = model
self.huggingface_models = [
"meta-llama/Llama-2-7b-chat-hf",
"gpt2",
"mistralai/Mistral-7B-v0.1",
]
self.openai_models = ["gpt-3.5-turbo"]
self.aspects = [
"COV",
"FAC",
@@ -44,15 +49,87 @@ def __init__(self):
"FLE",
"INQ",
]
self.models = ["meta-llama/Llama-2-7b-chat-hf", "gpt-3.5-turbo", "gpt2"]
self.models = [
"meta-llama/Llama-2-7b-chat-hf",
"gpt-3.5-turbo",
"gpt2",
"mistralai/Mistral-7B-v0.1",
]
self.tasks = ["summ", "MT", "D2T", "diag"]

def get_prompt(self, a, d, src, pred):
assert isinstance(model, str), "Model must be a string."
assert model in self.models, f"Model must be one of {self.models}."

def huggingface_logprobs(self, prompts):
"""
This method outputs the log-likelihood of the next token given the previous tokens and the prompt template.
Args:
prompts (list of str): List of prompt templates.
Returns:
list: List of log-likelihoods for each candidate sentence.
"""

tokenizer = AutoTokenizer.from_pretrained(self.model)
llm = AutoModelForCausalLM.from_pretrained(self.model)
inputs = tokenizer(prompts, return_tensors="pt")

outputs = llm.generate(
**inputs,
max_new_tokens=50,
return_dict_in_generate=True,
output_scores=True,
)

logprobs = np.array(
llm.compute_transition_scores(
outputs.sequences, outputs.scores, normalize_logits=True
).tolist()
)

return logprobs
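Editorial note, not part of the commit: with normalize_logits=True, compute_transition_scores yields one log-probability per generated token, so each row of the array above can be read back as per-token probabilities. A tiny sketch with hypothetical numbers:

import numpy as np

# Hypothetical log-probabilities for three generated tokens of one prompt.
row = np.array([-0.4, -1.2, -0.7])
token_probs = np.exp(row)         # per-token probabilities
mean_logprob = float(row.mean())  # the quantity GPTScore.compute averages later
print(token_probs, mean_logprob)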

def openai_logprobs(self, prompts, api_key):
"""
This method outputs the log-likelihood of the next token given the previous tokens and the prompt template.
Args:
prompts (list of str): List of prompt templates.
api_key (str): OpenAI API key.
Returns:
list: List of log-likelihoods for each candidate sentence.
"""
openai.api_key = api_key
outputs = openai.Completion.create(
model=self.model,
prompt=prompts,
logprobs=5,
)

logprobs = outputs["choices"][0]["logprobs"]

return logprobs

def get_prompts(self, aspect, task, sources, preds):
"""
This method returns a list of prompt templates given a task description, and an aspect to evaluate.
Args:
aspect (str): Aspect to evaluate.
task (str): Task description.
sources (list of str): Source texts.
preds (list of str): Candidate sentences.
Returns:
list: List of prompt templates.
"""
return [
self.get_prompt(aspect, task, src, pred)
for (src, pred) in zip(sources, preds)
]

def get_prompt(self, aspect, task, src, pred):
"""
This method returns a prompt template given a task description, and an aspect to evaluate.
Args:
a (str): Aspect to evaluate.
d (str): Task description.
aspect (str): Aspect to evaluate.
task (str): Task description.
src (str): Source text.
pred (str): Candidate sentence.
Returns:
@@ -94,21 +171,25 @@ def get_prompt(self, a, d, src, pred):
}

# Check that the corresponding entry exists in the prompt template
assert a in templates[d], f"Aspect {a} is not available for task {d}."
assert (
aspect in templates[task]
), f"Aspect {aspect} is not available for task {task}."
# Check that the prompt template is not empty
assert templates[d][
a
], f"Prompt template for aspect {a} and task {d} is non-existent. Please specify a prompt template."
assert templates[task][
aspect
], f"Prompt template for aspect {aspect} and task {task} is non-existent. Please specify a prompt template."

return templates[d][a]
return templates[task][aspect]

def compute(
self, sources, preds, model="gpt2", prompts=None, a=None, d=None, api_key=None
self, sources, preds, prompts=None, aspect=None, task=None, api_key=None
):
"""
This method computes GPTScore for a list of candidate sentences given a task description, an aspect to evaluate and context information.
This method computes GPTScore for a list of candidate sentences given a task description,
an aspect to evaluate and context information.
The possible values for aspect are:
- (COV): Semantic coverage. How many semantic content units from the reference text are covered by the generated text?
- (COV): Semantic coverage. How many semantic content units from the reference text
are covered by the generated text?
- (FAC): Factuality. Does the generated text preserve the factual statements of the source text?
- (FLU): Fluency. Is the generated text well-written and grammatical?
- (CON): Consistency. Is the generated text consistent in the information it provides?
@@ -139,10 +220,9 @@ def compute(
Args:
sources (list of str): Source texts.
preds (list of str): Candidate sentences.
model (str): Model name. If None, a default model is used.
prompt (str): Prompt template. If None, a default prompt template is used.
a (list): List of aspects to evaluate.
d (str): Task description.
prompts (str): Prompt template. If None, a default prompt template is used.
aspect (list): List of aspects to evaluate.
task (str): Task description.
api_key (str): OpenAI API key.
Returns:
@@ -155,75 +235,46 @@ def compute(
preds[0], str
), "Prediction must be a list of strings."

assert isinstance(model, str), "Model must be a string."
assert model in self.models, f"Model must be one of {self.models}."
assert isinstance(self.model, str), "Model must be a string."
assert self.model in self.models, f"Model must be one of {self.models}."

# If prompt is given, check that it is a list of string
if prompts:
assert isinstance(prompts, list) and isinstance(
prompts[0], str
), "Prompts must be a list of strings."
assert not a, "Aspect must not be given if prompt is given."
assert not d, "Task must not be given if prompt is given."
assert not aspect, "Aspect must not be given if prompt is given."
assert not task, "Task must not be given if prompt is given."
else:
# If prompt is not given, check that task and aspect are given
assert a, "Aspect must be given if prompt is not given."
assert d, "Task must be given if prompt is not given."
assert aspect, "Aspect must be given if prompt is not given."
assert task, "Task must be given if prompt is not given."

# If aspect is given, check that it is a string
if a:
assert isinstance(a, str), "Aspect must be a string."
assert a in self.aspects, f"Aspect must be one of {self.aspects}."
if aspect:
assert isinstance(aspect, str), "Aspect must be a string."
assert aspect in self.aspects, f"Aspect must be one of {self.aspects}."

# If task is given, check that it is a string
if d:
assert isinstance(d, str), "Task must be a string."
assert d in self.tasks, f"Task must be one of {self.tasks}."
if task:
assert isinstance(task, str), "Task must be a string."
assert task in self.tasks, f"Task must be one of {self.tasks}."

# Generative LLM is given a prompt template and some context information
prompts = (
prompts
if prompts
else [
self.get_prompt(a, d, src, pred) for (src, pred) in zip(sources, preds)
]
)
prompts = prompts or self.get_prompts(aspect, task, sources, preds)

# Model predicts log-likelihood of the next token given the previous tokens and the prompt template
if model in self.huggingface_models:
tokenizer = AutoTokenizer.from_pretrained(model)
llm = AutoModelForCausalLM.from_pretrained(model)
inputs = tokenizer(prompts, return_tensors="pt")

outputs = llm.generate(
**inputs,
max_new_tokens=50,
return_dict_in_generate=True,
output_scores=True,
)

transition_scores = llm.compute_transition_scores(
outputs.sequences, outputs.scores, normalize_logits=True
)

logprobs = np.array(transition_scores.tolist())

elif model == "gpt-3.5-turbo":
openai.api_key = api_key
response = openai.Completion.create(
model=model,
prompt=prompts,
logprobs=5,
)
if self.model in self.huggingface_models:
logprobs = self.huggingface_logprobs(prompts)

logprobs = response["choices"][0]["logprobs"]
elif self.model in self.openai_models:
logprobs = self.openai_logprobs(prompts, api_key)

# Compute GPTScores
scores = []
for i, pred in enumerate(preds):
pred_tokens = pred.split()
pred_logprobs = logprobs[i][: len(pred_tokens)]
score = np.mean(pred_logprobs)
scores.append(score)
scores.append(np.mean(pred_logprobs))

return scores
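Editorial note, not part of the commit: a usage sketch of the refactored GPTScore API, where the model is now chosen in the constructor rather than in compute. The import path is an assumption (only the class body appears in this diff), and the aspect/task pair assumes a matching entry in the collapsed templates dict.

from saga_llm_evaluation_ml.helpers.llm_metrics import GPTScore  # assumed module path

sources = ["Hi, how can I help you today?"]
preds = ["Hello, what can I do for you?"]

scorer = GPTScore(model="gpt2")  # "gpt2" is a Hugging Face model, so no api_key is needed
scores = scorer.compute(
    sources,
    preds,
    aspect="FLU",  # fluency, one of the aspects listed in the docstring
    task="diag",   # dialogue, one of self.tasks
)
# scores[i] is the mean of the first len(preds[i].split()) generation log-probabilities,
# so higher (less negative) values indicate text the model finds more likely.
print(scores)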
@@ -18,7 +18,7 @@ class MetadataExtractor:
def __init__(self):
self.metadata_extractor = MetafeatureExtractorsRunner()

def addWordRegexMatchesCount(self, regex_rule, name=None):
def add_word_regex_matches_count(self, regex_rule, name=None):
"""
Adds a regex rule to the metadata extractor.
For a given regex return the number of words matching the regex.
@@ -30,7 +30,7 @@ def addWordRegexMatchesCount(self, regex_rule, name=None):
WordRegexMatchesCount(regex=regex_rule, name=name)
)

def addRegexMatchCount(self, regex_rule, name=None):
def add_regex_match_count(self, regex_rule, name=None):
"""
Adds a regex rule to the metadata extractor.
For a given regex return the number of matches it has in the text.
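Editorial note, not part of the commit: the two renamed helpers differ only in what they count, whole words matching the regex versus every raw match in the text. A short sketch mirroring the updated tests below; the exact metafeature names in the result depend on the underlying extractors.

from saga_llm_evaluation_ml.helpers.utils import MetadataExtractor

extractor = MetadataExtractor()
extractor.add_word_regex_matches_count("the")  # counts words matching the regex
extractor.add_regex_match_count("the")         # counts every match of the regex in the text

metadata = extractor.compute("The cat sat on the mat.")
print(metadata)  # computed metafeatures, including the two regex-based counts above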
2 changes: 1 addition & 1 deletion tests/test_embedding_metrics.py
@@ -1,6 +1,6 @@
import unittest

from saga_llm_evaluation_ml.model.helpers.embedding_metrics import BERTScore, MAUVE
from saga_llm_evaluation_ml.helpers.embedding_metrics import BERTScore, MAUVE


class TestBERTScore(unittest.TestCase):
12 changes: 7 additions & 5 deletions tests/test_helpers.py
@@ -1,6 +1,6 @@
import unittest

from saga_llm_evaluation_ml.model.helpers.utils import MetadataExtractor
from saga_llm_evaluation_ml.helpers.utils import MetadataExtractor


class TestMetadataExtractor(unittest.TestCase):
@@ -25,8 +25,8 @@ def test_add_regex(self):
"""Tests that the MetadataExtractor class extracts the correct metadata when regex rules are added."""
text = "The cat sat on the mat."
extractor = MetadataExtractor()
extractor.addWordRegexMatchesCount("the")
extractor.addRegexMatchCount("the")
extractor.add_word_regex_matches_count("the")
extractor.add_regex_match_count("the")
metadata = extractor.compute(text)

# Test a few metadata values
@@ -45,8 +45,10 @@ def test_add_regex(self):
len_metadata = len(metadata)

# Check that the metadata is longer when multiple regex rules are added
extractor.addWordRegexMatchesCount("cat", name="word_regex_matches_count_cat")
extractor.addRegexMatchCount("cat", name="regex_match_count_cat")
extractor.add_word_regex_matches_count(
"cat", name="word_regex_matches_count_cat"
)
extractor.add_regex_match_count("cat", name="regex_match_count_cat")
metadata = extractor.compute(text)

self.assertGreater(len(metadata), len_metadata)
2 changes: 1 addition & 1 deletion tests/test_language_metrics.py
@@ -1,6 +1,6 @@
import unittest

from saga_llm_evaluation_ml.model.helpers.language_metrics import BLEURTScore
from saga_llm_evaluation_ml.helpers.language_metrics import BLEURTScore


class TestBLEURTScore(unittest.TestCase):