adds llm as judge using transformers #223

Merged: 23 commits on Aug 14, 2024
Changes from 14 commits
1 change: 0 additions & 1 deletion pyproject.toml
@@ -92,7 +92,6 @@ tests = ["pytest==7.4.0"]
dev = ["lighteval[accelerate,quality,tests]"]
extended_tasks = [
"langdetect", # ifeval
"openai", # mt-bench
]

[project.urls]
5 changes: 4 additions & 1 deletion src/lighteval/logging/evaluation_tracker.py
@@ -57,7 +57,10 @@ class EnhancedJSONEncoder(json.JSONEncoder):

def default(self, o):
if is_dataclass(o):
return asdict(o)
try:
return asdict(o)
except Exception:
return str(o)
if callable(o):
return o.__name__
if isinstance(o, Enum):
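Note on the evaluation_tracker change: `asdict()` deep-copies every field, so dataclasses holding non-copyable members (locks, generators, open clients) could crash result serialization. A minimal standalone sketch of the patched fallback, assuming nothing beyond the lines above (the `TrackerState` dataclass is hypothetical, not part of lighteval):

```python
import json
import threading
from dataclasses import asdict, dataclass, field, is_dataclass


class EnhancedJSONEncoder(json.JSONEncoder):
    """Minimal re-creation of the patched encoder: fall back to str() when asdict() fails."""

    def default(self, o):
        if is_dataclass(o):
            try:
                return asdict(o)
            except Exception:
                # asdict() deep-copies field values, which raises for locks, generators, ...
                return str(o)
        if callable(o):
            return o.__name__
        return super().default(o)


@dataclass
class TrackerState:  # hypothetical dataclass used only for this demo
    name: str
    lock: threading.Lock = field(default_factory=threading.Lock)


# Before the patch this raised TypeError from the deep copy; now it serializes via str(...).
print(json.dumps(TrackerState("demo"), cls=EnhancedJSONEncoder))
```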
136 changes: 74 additions & 62 deletions src/lighteval/metrics/llm_as_judge.py
@@ -25,36 +25,28 @@
import json
import re
import time
from typing import Optional
from typing import Any, Optional

from lighteval.logging.hierarchical_logger import hlog_warn
from lighteval.utils import NO_OPENAI_ERROR_MSG, is_openai_available


class JudgeOpenAI:
class JudgeLM:
"""
A class representing a judge for evaluating answers using the OpenAI API.
A class representing a judge for evaluating answers, using either an OpenAI-compatible API or a local Transformers pipeline.

Args:
model (str): The name of the OpenAI model to use.
seed (int): The seed value for generating random responses.
temperature (float): The temperature value for controlling the randomness of the responses.
model (str): The name of the model to use.
templates_path (str): The path to the JSON file containing the templates for prompts.
multi_turn (bool): Whether to use multi-turn prompts

Attributes:
client: An instance of the OpenAI client.
model (str): The name of the OpenAI model.
seed (int): The seed value, passed to the API when generating responses.
temperature (float): The temperature value, passed to the API when generating responses.
model (str): The name of the model.
templates (dict): A dictionary containing the templates for prompts.
one_score_pattern (re.Pattern): A regular expression pattern for extracting scores from the response.
one_score_pattern_backup (re.Pattern): A backup regular expression pattern for extracting scores.
API_MAX_RETRY (int): The maximum number of API retries.
API_RETRY_SLEEP (int): The sleep time between API retries.
max_tokens (int): The maximum number of tokens allowed in the response.

Methods:
evaluate_answer: Evaluates an answer using the OpenAI API.
evaluate_answer: Evaluates an answer using the OpenAI API or Transformers library.
__get_prompts_multi_turn: Generates prompts for multi-turn conversations.
__get_prompts_single_turn: Generates prompts for single-turn conversations.
__process_judge_response: Processes the judge's response and extracts the score.
@@ -63,18 +55,14 @@ class JudgeOpenAI:
def __init__(
self,
model: str,
seed: int,
temperature: float,
templates_path: str,
openai_api_key: str,
multi_turn: bool = False,
use_transformers: bool = False,
url: Optional[str] = None,
api_key: Optional[str] = None,
):
self.client = None # loaded lazily
self.openai_api_key = openai_api_key
self.model = model
self.seed = seed
self.temperature = temperature
self.multi_turn = multi_turn
self.model = model

data = []
with open(templates_path, "r") as f:
@@ -89,40 +77,60 @@ def __init__(
# the second is for the backup case: [score]
self.one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
self.one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")

self.API_MAX_RETRY = 16
self.API_RETRY_SLEEP = 10
self.max_tokens = 2048
self.API_MAX_RETRY = 3
self.API_RETRY_SLEEP = 1

self.client = None
self.pipe = None
self.use_transformers = use_transformers
self.url = url
self.api_key = api_key

def lazy_load_client(self):
if not self.use_transformers:
if self.client is None:
from openai import OpenAI

if self.url is None:
self.client = OpenAI(api_key=self.api_key)
else:
self.client = OpenAI(base_url=self.url, api_key=self.api_key)
else:
if self.pipe is None:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

transformers_model = AutoModelForCausalLM.from_pretrained(
self.model, torch_dtype=torch.bfloat16, trust_remote_code=False, device_map="cuda"
)
tokenizer = AutoTokenizer.from_pretrained(self.model)
self.pipe = pipeline(
"text-generation",
model=transformers_model,
tokenizer=tokenizer,
max_new_tokens=50,
)

def evaluate_answer(
self, questions: list[str], answers: list[str], references: list[str]
) -> tuple[int, list[dict[str, str]], str]:
) -> tuple[list[int], list[list[dict[str, str]]], list[str | None | Any]]:
"""
Evaluates an answer using the OpenAI API.
Evaluates an answer using the configured judge backend (an OpenAI-compatible API or a local Transformers pipeline).

Args:
questions (list[str]): A list of questions (can be a list because of multi-turn conversations)
answers (list[str]): A list of answers, one for each question.
references (list[str]): A list of reference answers, one for each question (sometimes not available)

Returns:
A tuple containing the scores, prompts, and judgments.

Raises:
Exception: If an error occurs during the API call.
"""
if self.client is None:
if not is_openai_available():
raise ImportError(NO_OPENAI_ERROR_MSG)

from openai import OpenAI

self.client = OpenAI(api_key=self.openai_api_key)
# lazy loading of the pipeline
self.lazy_load_client()

prompts = [
self.__get_prompts_single_turn(
questions[0], answers[0], references[0] if references is not None and len(references) > 0 else None
questions[0], answers[0], references[0] if references and len(references) > 0 else None
)
]

@@ -132,28 +140,15 @@ def evaluate_answer(
)
prompts.append(prompts_multi_turn)

responses = []
judgments = []
for prompt in prompts:
for _ in range(self.API_MAX_RETRY):
try:
response = self.client.chat.completions.create(
model=self.model,
seed=self.seed,
temperature=self.temperature,
messages=prompt,
max_tokens=self.max_tokens,
n=1,
)
responses.append(response)
break
except Exception as e:
hlog_warn(f"{type(e), e}")
time.sleep(self.API_RETRY_SLEEP)

if len(responses) == 0:
raise Exception("Failed to get response from the API")

judgments = [response.choices[0].message.content for response in responses]
if self.client is not None:
response = self.__call_openai_api(prompt)
else:
response = self.pipe(prompt)[0]["generated_text"]
response = response[-1]["content"]
judgments.append(response)

scores = [self.__process_judge_response(judgment) for judgment in judgments]

return scores, prompts, judgments
@@ -235,3 +230,20 @@ def __process_judge_response(self, judgment: str) -> int:
rating = -1

return rating

def __call_openai_api(self, prompt):
for _ in range(self.API_MAX_RETRY):
try:
response = self.client.chat.completions.create(
model=self.model,
# seed=self.seed,
# temperature=self.temperature,
messages=prompt,
max_tokens=512,
n=1,
)
return response.choices[0].message.content
except Exception as e:
hlog_warn(f"{type(e), e}")
time.sleep(self.API_RETRY_SLEEP)
raise Exception("Failed to get response from the API")
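For reviewers trying the new class outside the harness, a minimal usage sketch of the two backends; the model names, template path, and API key below are placeholders, and the HF Inference API URL/token handling mirrors what JudgeLLM in metrics_sample.py passes in further down:

```python
from lighteval.metrics.llm_as_judge import JudgeLM

# Judge backed by a local Transformers pipeline (model name is a placeholder).
local_judge = JudgeLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    templates_path="src/lighteval/metrics/judge_prompts.jsonl",
    multi_turn=False,
    use_transformers=True,
)

# Judge backed by an OpenAI-compatible endpoint, here the HF serverless Inference API.
api_judge = JudgeLM(
    model="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
    templates_path="src/lighteval/metrics/judge_prompts.jsonl",
    multi_turn=True,
    use_transformers=False,
    url="https://api-inference.huggingface.co/v1/",
    api_key="hf_...",  # JudgeLLM reads this from the HF_TOKEN environment variable
)

# evaluate_answer returns per-turn scores, the prompts sent to the judge, and the raw judgments.
scores, prompts, judgments = local_judge.evaluate_answer(
    questions=["What is the capital of France?"],
    answers=["Paris."],
    references=[],
)
```

Note that the OpenAI-only knobs (seed, temperature) are gone from the constructor; the API path now sends max_tokens=512 and retries at most API_MAX_RETRY=3 times.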
37 changes: 33 additions & 4 deletions src/lighteval/metrics/metrics.py
@@ -228,9 +228,9 @@ class Metrics(Enum):
corpus_level_fn=np.mean,
higher_is_better=True,
)
llm_judge_multi_turn_openai = SampleLevelMetricGrouping(
llm_judge_multi_turn_gpt3p5 = SampleLevelMetricGrouping(
metric_name=["single_turn", "multi_turn"],
higher_is_better=True,
higher_is_better={"single_turn": True, "multi_turn": True},
category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
@@ -243,9 +243,24 @@
"multi_turn": np.mean,
},
)
llm_judge_openai = SampleLevelMetricGrouping(
llm_judge_multi_turn_llama_3_405b = SampleLevelMetricGrouping(
Review comment: Don't forget to update the README with the new metrics

metric_name=["single_turn", "multi_turn"],
higher_is_better={"single_turn": True, "multi_turn": True},
category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
judge_model_name="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
multi_turn=True,
).compute,
corpus_level_fn={
"single_turn": np.mean,
"multi_turn": np.mean,
},
)
llm_judge_gpt3p5 = SampleLevelMetricGrouping(
metric_name=["judge_score"],
higher_is_better=True,
higher_is_better={"judge_score": True},
category=MetricCategory.LLM_AS_JUDGE,
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
Expand All @@ -257,6 +272,20 @@ class Metrics(Enum):
"judge_score": np.mean,
},
)
llm_judge_llama_3_405b = SampleLevelMetricGrouping(
metric_name=["judge_score"],
higher_is_better={"judge_score": True},
category=MetricCategory.LLM_AS_JUDGE,
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
judge_model_name="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
multi_turn=False,
).compute,
corpus_level_fn={
"judge_score": np.mean,
},
)
loglikelihood_acc = SampleLevelMetric(
metric_name="acc",
sample_level_fn=LoglikelihoodAcc().compute,
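A quick illustration of what the dict-valued higher_is_better and corpus_level_fn buy for the grouped judge metrics: each name in metric_name gets its own direction and its own aggregator over the per-sample dict[str, float] that JudgeLLM.compute returns (the sample values here are invented):

```python
import numpy as np

# Invented per-sample outputs of JudgeLLM.compute for the multi-turn metric group.
sample_scores = [
    {"single_turn": 8.0, "multi_turn": 7.0},
    {"single_turn": 6.0, "multi_turn": 9.0},
]

corpus_level_fn = {"single_turn": np.mean, "multi_turn": np.mean}
higher_is_better = {"single_turn": True, "multi_turn": True}

# Aggregate each metric name separately, mirroring the metric grouping above.
corpus_scores = {
    name: float(fn([sample[name] for sample in sample_scores]))
    for name, fn in corpus_level_fn.items()
}
print(corpus_scores)  # {'single_turn': 7.0, 'multi_turn': 8.0}
```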
34 changes: 23 additions & 11 deletions src/lighteval/metrics/metrics_sample.py
@@ -29,6 +29,7 @@

import nltk
import numpy as np
from huggingface_hub import HfApi
from nltk.metrics.distance import edit_distance
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer
Expand All @@ -40,7 +41,7 @@
from lighteval.metrics.imports.bert_scorer import BERTScorer
from lighteval.metrics.imports.data_stats_metric import DataStatsMetric
from lighteval.metrics.imports.summac import SummaCZS
from lighteval.metrics.llm_as_judge import JudgeOpenAI
from lighteval.metrics.llm_as_judge import JudgeLM
from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip
from lighteval.tasks.requests import Doc
from lighteval.utils import as_list
@@ -626,22 +627,33 @@ def edit_similarity(self, s1, s2):


class JudgeLLM:
available_models = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]

def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
if judge_model_name not in self.available_models:
raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
def __init__(
self, judge_model_name: str, template_path: str, multi_turn: bool = False, use_transformers: bool = False
) -> None:
if judge_model_name in self.available_models_openai:
api_key = os.getenv("OPENAI_API_KEY")
url = None
elif not use_transformers:
api_key = os.getenv("HF_TOKEN")
url = "https://api-inference.huggingface.co/v1/"
else:
api = HfApi()
models = api.list_models(model_name=judge_model_name)
url = None
api_key = None
if not models:
raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
self.multi_turn = multi_turn

self.judge = JudgeOpenAI(
self.judge = JudgeLM(
model=judge_model_name,
seed=42,
temperature=0.0,
templates_path=template_path,
openai_api_key=OPENAI_API_KEY,
multi_turn=multi_turn,
use_transformers=use_transformers,
api_key=api_key,
url=url,
)

def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:
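The routing added to JudgeLLM.__init__ is the crux of this file's change, so here is a simplified standalone sketch of it (it drops the HfApi existence check and error handling; the function name is ours, not lighteval's):

```python
import os

OPENAI_JUDGES = {"gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"}


def resolve_judge_backend(judge_model_name: str, use_transformers: bool) -> dict:
    """Pick the credentials/endpoint the JudgeLM instance should be constructed with."""
    if judge_model_name in OPENAI_JUDGES:
        # Known OpenAI judges keep using the OpenAI API directly.
        return {"url": None, "api_key": os.getenv("OPENAI_API_KEY")}
    if not use_transformers:
        # Any other model name goes through the OpenAI-compatible HF Inference API.
        return {"url": "https://api-inference.huggingface.co/v1/", "api_key": os.getenv("HF_TOKEN")}
    # Otherwise the judge model is loaded locally with a Transformers pipeline.
    return {"url": None, "api_key": None}


print(resolve_judge_backend("gpt-4o", use_transformers=False))
print(resolve_judge_backend("meta-llama/Meta-Llama-3.1-405B-Instruct-FP8", use_transformers=False))
print(resolve_judge_backend("meta-llama/Meta-Llama-3.1-8B-Instruct", use_transformers=True))
```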
1 change: 0 additions & 1 deletion src/lighteval/models/base_model.py
@@ -351,7 +351,6 @@ def greedy_until_multi_turn( # noqa: C901
max_generated_tokens = request.generation_size
context = request.context[0]
max_context_size_allowed = self.max_length - max_generated_tokens

model_inputs = self.tokenizer(
context,
padding=True,
3 changes: 2 additions & 1 deletion src/lighteval/tasks/extended/mt_bench/main.py
@@ -23,6 +23,7 @@
# ruff: noqa: F405, F403, F401, I001
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.metrics.metrics import Metrics


def mt_bench_prompt(line, task_name: str = None):
@@ -55,7 +56,7 @@ def mt_bench_prompt(line, task_name: str = None):
evaluation_splits=["train"],
few_shots_split="",
few_shots_select="random",
metric=["llm_judge_multi_turn_openai"],
metric=[Metrics.llm_judge_multi_turn_llama_3_405b],
generation_size=1024,
stop_sequence=[],
)