Use inference endpoints as judge #237

Closed · wants to merge 2 commits
19 changes: 11 additions & 8 deletions src/lighteval/metrics/llm_as_judge.py
@@ -31,19 +31,20 @@
 from lighteval.utils import NO_OPENAI_ERROR_MSG, is_openai_available
 
 
-class JudgeOpenAI:
+class JudgeEndpoint:
     """
-    A class representing a judge for evaluating answers using the OpenAI API.
+    A class representing a judge for evaluating answers using the OpenAI API or the Inference Endpoints API.
 
     Args:
-        model (str): The name of the OpenAI model to use.
+        model (str): The name of the model to use.
         seed (int): The seed value for generating random responses.
         temperature (float): The temperature value for controlling the randomness of the responses.
         templates_path (str): The path to the JSON file containing the templates for prompts.
+        api_key (str): The API key used to create/connect to the endpoint.
 
     Attributes:
-        client: An instance of the OpenAI client.
-        model (str): The name of the OpenAI model.
+        client: An instance of the endpoint client.
+        model (str): The name of the endpoint model.
         seed (int): The seed value, passed to the API when generating responses.
         temperature (float): The temperature value, passed to the API when generating responses.
         templates (dict): A dictionary containing the templates for prompts.
@@ -63,15 +64,17 @@ class JudgeOpenAI:
     def __init__(
         self,
         model: str,
+        url: str,
         seed: int,
         temperature: float,
         templates_path: str,
-        openai_api_key: str,
+        api_key: str,
         multi_turn: bool = False,
     ):
         self.client = None  # loaded lazily
-        self.openai_api_key = openai_api_key
+        self.api_key = api_key
         self.model = model
+        self.url = url  # None for OpenAI, endpoint URL for an inference endpoint
         self.seed = seed
         self.temperature = temperature
         self.multi_turn = multi_turn
@@ -118,7 +121,7 @@ def evaluate_answer(
 
         from openai import OpenAI
 
-        self.client = OpenAI(api_key=self.openai_api_key)
+        self.client = OpenAI(base_url=self.url, api_key=self.api_key)
 
         prompts = [
             self.__get_prompts_single_turn(
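
The mechanism behind this change is that the openai client accepts a base_url, so the same code path can talk either to api.openai.com (base_url=None) or to any OpenAI-compatible server, such as a text-generation-inference (TGI) container. A minimal sketch of that behavior outside lighteval, assuming a TGI endpoint served at http://localhost:3000/v1:

    # Minimal sketch: one client, two backends. The localhost URL is an
    # assumption; point it at wherever your endpoint is served.
    import os

    from openai import OpenAI

    # Hosted judge: with base_url=None the client defaults to https://api.openai.com/v1
    openai_judge = OpenAI(base_url=None, api_key=os.getenv("OPENAI_API_KEY"))

    # Self-hosted judge: same client, with base_url pointed at the endpoint
    local_judge = OpenAI(base_url="http://localhost:3000/v1", api_key=os.getenv("HF_TOKEN"))

    response = local_judge.chat.completions.create(
        model="tgi",  # TGI serves a single model, so the name is a placeholder
        messages=[{"role": "user", "content": "Rate the following answer from 1 to 10: ..."}],
        temperature=0.0,
        seed=42,
    )
    print(response.choices[0].message.content)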
19 changes: 17 additions & 2 deletions src/lighteval/metrics/metrics.py
@@ -234,7 +234,22 @@ class Metrics(Enum):
         category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
-            judge_model_name="gpt-3.5-turbo",
+            judge_model_name_or_url="gpt-3.5-turbo",
             template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
             multi_turn=True,
         ).compute,
         corpus_level_fn={
             "single_turn": np.mean,
             "multi_turn": np.mean,
         },
     )
+    llm_judge_multi_turn_local_endpoint = SampleLevelMetricGrouping(
+        metric_name=["single_turn", "multi_turn"],
+        higher_is_better=True,
+        category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
+        use_case=MetricUseCase.SUMMARIZATION,
+        sample_level_fn=JudgeLLM(
+            judge_model_name_or_url="http://localhost:3000/v1",  # replace with your endpoint URL if needed
+            template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
+            multi_turn=True,
+        ).compute,
@@ -249,7 +264,7 @@ class Metrics(Enum):
         category=MetricCategory.LLM_AS_JUDGE,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
-            judge_model_name="gpt-3.5-turbo",
+            judge_model_name_or_url="gpt-3.5-turbo",
             template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
             multi_turn=False,
         ).compute,
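
The new llm_judge_multi_turn_local_endpoint entry doubles as a template: to judge against your own endpoint, define an analogous metric and pass its URL as judge_model_name_or_url. A hypothetical sketch, where the metric name and endpoint URL are illustrative and the import paths are assumptions that may vary across lighteval versions:

    # Hypothetical user-defined judge metric backed by a private endpoint,
    # mirroring the llm_judge_multi_turn_local_endpoint entry above.
    import numpy as np

    from lighteval.metrics.metrics_sample import JudgeLLM
    from lighteval.metrics.utils import (  # assumed import path
        MetricCategory,
        MetricUseCase,
        SampleLevelMetricGrouping,
    )

    llm_judge_my_endpoint = SampleLevelMetricGrouping(
        metric_name=["single_turn", "multi_turn"],
        higher_is_better=True,
        category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
        use_case=MetricUseCase.SUMMARIZATION,
        sample_level_fn=JudgeLLM(
            judge_model_name_or_url="https://my-endpoint.example.com/v1",  # hypothetical URL
            template_path="judge_prompts.jsonl",  # path to your prompt templates
            multi_turn=True,
        ).compute,
        corpus_level_fn={
            "single_turn": np.mean,
            "multi_turn": np.mean,
        },
    )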
25 changes: 15 additions & 10 deletions src/lighteval/metrics/metrics_sample.py
@@ -40,7 +40,7 @@
 from lighteval.metrics.imports.bert_scorer import BERTScorer
 from lighteval.metrics.imports.data_stats_metric import DataStatsMetric
 from lighteval.metrics.imports.summac import SummaCZS
-from lighteval.metrics.llm_as_judge import JudgeOpenAI
+from lighteval.metrics.llm_as_judge import JudgeEndpoint
 from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip
 from lighteval.tasks.requests import Doc
 from lighteval.utils import as_list
@@ -622,21 +622,26 @@ def edit_similarity(self, s1, s2):
 
 
 class JudgeLLM:
-    available_models = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
+    available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
 
-    def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
-        if judge_model_name not in self.available_models:
-            raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
-
+    def __init__(self, judge_model_name_or_url: str, template_path: str, multi_turn: bool = False):
+        if judge_model_name_or_url in self.available_models_openai:
+            API_KEY = os.getenv("OPENAI_API_KEY")
+            url = None
+            model = judge_model_name_or_url
+        else:
+            API_KEY = os.getenv("HF_TOKEN")
+            url = judge_model_name_or_url
+            model = "tgi"
 
-        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
         self.multi_turn = multi_turn
 
-        self.judge = JudgeOpenAI(
-            model=judge_model_name,
+        self.judge = JudgeEndpoint(
+            model=model,
+            url=url,
             seed=42,
             temperature=0.0,
             templates_path=template_path,
-            openai_api_key=OPENAI_API_KEY,
+            api_key=API_KEY,
             multi_turn=multi_turn,
         )
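
The dispatch in JudgeLLM.__init__ is the heart of the change: a judge spec that matches a known OpenAI model name keeps routing to the OpenAI API with OPENAI_API_KEY, while any other value is treated as the base URL of an OpenAI-compatible endpoint authenticated with HF_TOKEN. Restated as a standalone helper (illustrative, not part of the diff):

    # Standalone restatement of the new routing rule.
    import os
    from typing import Optional, Tuple

    OPENAI_JUDGE_MODELS = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]


    def resolve_judge_backend(spec: str) -> Tuple[Optional[str], str, Optional[str]]:
        """Return (url, model, api_key) for a judge spec.

        Known OpenAI model names route to the OpenAI API; any other value is
        treated as the base URL of an OpenAI-compatible inference endpoint.
        """
        if spec in OPENAI_JUDGE_MODELS:
            return None, spec, os.getenv("OPENAI_API_KEY")
        return spec, "tgi", os.getenv("HF_TOKEN")


    # resolve_judge_backend("gpt-4")                    -> (None, "gpt-4", <OPENAI_API_KEY>)
    # resolve_judge_backend("http://localhost:3000/v1") -> (url, "tgi", <HF_TOKEN>)

One consequence of this dispatch worth noting: a misspelled OpenAI model name no longer raises the old ValueError; it is silently treated as an endpoint URL and only fails at request time.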
