diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index 5b70e9d5..4163d73f 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -31,19 +31,21 @@
 from lighteval.utils import NO_OPENAI_ERROR_MSG, is_openai_available
 
 
-class JudgeOpenAI:
+class JudgeEndpoint:
     """
-    A class representing a judge for evaluating answers using the OpenAI API.
+    A class representing a judge for evaluating answers using the OpenAI API or the Inference Endpoints API.
 
     Args:
-        model (str): The name of the OpenAI model to use.
+        model (str): The name of the model to use.
+        url (str): Endpoint to go to (open ai or inference endpoint)
         seed (int): The seed value for generating random responses.
         temperature (float): The temperature value for controlling the randomness of the responses.
         templates_path (str): The path to the JSON file containing the templates for prompts.
+        api_key (str): The API key to use to create/connect to the endpoint
 
     Attributes:
-        client: An instance of the OpenAI client.
-        model (str): The name of the OpenAI model.
+        client: An instance of the endpoint client.
+        model (str): The name of the endpoint model.
         seed (int): The seed value, passed to the API when generating responses.
         temperature (float): The temperature value, passed to the API when generating responses.
         templates (dict): A dictionary containing the templates for prompts.
@@ -63,15 +65,17 @@ class JudgeOpenAI:
     def __init__(
         self,
         model: str,
+        url: str,
         seed: int,
         temperature: float,
         templates_path: str,
-        openai_api_key: str,
+        api_key: str,
         multi_turn: bool = False,
     ):
         self.client = None  # loaded lazily
-        self.openai_api_key = openai_api_key
+        self.api_key = api_key
         self.model = model
+        self.url = url  # None for Open AI, value for Inference endpoint
         self.seed = seed
         self.temperature = temperature
         self.multi_turn = multi_turn
@@ -118,7 +122,7 @@ def evaluate_answer(
 
             from openai import OpenAI
 
-            self.client = OpenAI(api_key=self.openai_api_key)
+            self.client = OpenAI(base_url=self.url, api_key=self.api_key)
 
         prompts = [
             self.__get_prompts_single_turn(
diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index 8b06e45c..a6bab8b1 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -228,7 +228,7 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
-    llm_judge_multi_turn_openai = SampleLevelMetricGrouping(
+    llm_judge_multi_turn_gpt3p5 = SampleLevelMetricGrouping(
         metric_name=["single_turn", "multi_turn"],
         higher_is_better=True,
         category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
@@ -243,6 +243,21 @@ class Metrics(Enum):
             "multi_turn": np.mean,
         },
     )
+    llm_judge_multi_turn_llama3_405 = SampleLevelMetricGrouping(
+        metric_name=["single_turn", "multi_turn"],
+        higher_is_better=True,
+        category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
+        use_case=MetricUseCase.SUMMARIZATION,
+        sample_level_fn=JudgeLLM(
+            judge_model_name="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
+            template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
+            multi_turn=True,
+        ).compute,
+        corpus_level_fn={
+            "single_turn": np.mean,
+            "multi_turn": np.mean,
+        },
+    )
     llm_judge_openai = SampleLevelMetricGrouping(
         metric_name=["judge_score"],
         higher_is_better=True,
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index b7876dbc..b58a9e70 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -40,7 +40,7 @@
 from lighteval.metrics.imports.bert_scorer import BERTScorer
 from lighteval.metrics.imports.data_stats_metric import DataStatsMetric
 from lighteval.metrics.imports.summac import SummaCZS
-from lighteval.metrics.llm_as_judge import JudgeOpenAI
+from lighteval.metrics.llm_as_judge import JudgeEndpoint
 from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip
 from lighteval.tasks.requests import Doc
 from lighteval.utils import as_list
@@ -622,21 +622,24 @@ def edit_similarity(self, s1, s2):
 
 
 class JudgeLLM:
-    available_models = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
+    available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
 
     def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
-        if judge_model_name not in self.available_models:
-            raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
+        if judge_model_name in self.available_models_openai:
+            API_KEY = os.getenv("OPENAI_API_KEY")
+            url = None
+        else:
+            API_KEY = os.getenv("HF_TOKEN")
+            url = "https://api-inference.huggingface.co/v1/"
 
-        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
         self.multi_turn = multi_turn
-
-        self.judge = JudgeOpenAI(
+        self.judge = JudgeEndpoint(
             model=judge_model_name,
+            url=url,
             seed=42,
             temperature=0.0,
             templates_path=template_path,
-            openai_api_key=OPENAI_API_KEY,
+            api_key=API_KEY,
             multi_turn=multi_turn,
         )
 
diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py
index 77b8f3ee..51f857d3 100644
--- a/src/lighteval/tasks/extended/mt_bench/main.py
+++ b/src/lighteval/tasks/extended/mt_bench/main.py
@@ -55,7 +55,7 @@ def mt_bench_prompt(line, task_name: str = None):
     evaluation_splits=["train"],
     few_shots_split="",
     few_shots_select="random",
-    metric=["llm_judge_multi_turn_openai"],
+    metric=["llm_judge_multi_turn_gpt3p5", "llm_judge_multi_turn_llama3_405"],
     generation_size=1024,
     stop_sequence=[],
 )
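
Note (not part of the patch): the standalone Python sketch below only illustrates the routing the hunks above introduce — judge models in the GPT list keep the default OpenAI client, any other judge model is sent to the Hugging Face OpenAI-compatible inference endpoint. The model list, environment variables, and endpoint URL are taken verbatim from the diff; the helper name build_judge_client and the example chat-completion call are assumptions for illustration, not lighteval API.

# Minimal sketch of the backend selection performed by JudgeLLM above.
# Assumes OPENAI_API_KEY / HF_TOKEN are exported and that the serverless
# endpoint speaks the OpenAI chat-completions protocol.
import os

from openai import OpenAI

# Mirrors JudgeLLM.available_models_openai from the diff.
AVAILABLE_MODELS_OPENAI = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]


def build_judge_client(judge_model_name: str) -> OpenAI:
    # Hypothetical helper: GPT models go to OpenAI, everything else to the
    # Hugging Face OpenAI-compatible inference endpoint used in the diff.
    if judge_model_name in AVAILABLE_MODELS_OPENAI:
        api_key = os.getenv("OPENAI_API_KEY")
        url = None  # None keeps the client's default OpenAI base_url
    else:
        api_key = os.getenv("HF_TOKEN")
        url = "https://api-inference.huggingface.co/v1/"
    return OpenAI(base_url=url, api_key=api_key)


if __name__ == "__main__":
    judge_model = "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"
    client = build_judge_client(judge_model)
    # Deterministic settings, mirroring the seed/temperature the judge passes
    # to the API; the prompt content here is only a placeholder.
    response = client.chat.completions.create(
        model=judge_model,
        messages=[{"role": "user", "content": "Rate the following answer from 1 to 10: ..."}],
        temperature=0.0,
        seed=42,
        max_tokens=256,
    )
    print(response.choices[0].message.content)

Keeping a single OpenAI-client code path and only swapping base_url (OpenAI(base_url=self.url, api_key=self.api_key) in the first hunk) is what lets the renamed JudgeEndpoint serve both backends without branching inside evaluate_answer.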