From fd9dc34a6eadbad3de6464ed8b2d788a697604c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine?=
Date: Thu, 25 Jul 2024 10:22:00 +0200
Subject: [PATCH 1/2] added endpoint through openai lib

---
 src/lighteval/metrics/llm_as_judge.py   | 19 +++++++++++--------
 src/lighteval/metrics/metrics.py        | 19 +++++++++++++++++--
 src/lighteval/metrics/metrics_sample.py | 25 +++++++++++++++----------
 3 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index 5b70e9d5..9b6f43f7 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -31,19 +31,20 @@
 from lighteval.utils import NO_OPENAI_ERROR_MSG, is_openai_available
 
 
-class JudgeOpenAI:
+class JudgeEndpoint:
     """
-    A class representing a judge for evaluating answers using the OpenAI API.
+    A class representing a judge for evaluating answers using the OpenAI API or the Inference Endpoints API.
 
     Args:
-        model (str): The name of the OpenAI model to use.
+        model (str): The name of the model to use.
         seed (int): The seed value for generating random responses.
         temperature (float): The temperature value for controlling the randomness of the responses.
         templates_path (str): The path to the JSON file containing the templates for prompts.
+        api_key (str): The API key used to create/connect to the endpoint.
 
     Attributes:
-        client: An instance of the OpenAI client.
-        model (str): The name of the OpenAI model.
+        client: An instance of the endpoint client.
+        model (str): The name of the endpoint model.
         seed (int): The seed value, passed to the API when generating responses.
         temperature (float): The temperature value, passed to the API when generating responses.
         templates (dict): A dictionary containing the templates for prompts.
@@ -63,15 +64,17 @@ class JudgeOpenAI:
     def __init__(
         self,
         model: str,
+        url: str,
         seed: int,
         temperature: float,
         templates_path: str,
-        openai_api_key: str,
+        api_key: str,
         multi_turn: bool = False,
     ):
         self.client = None  # loaded lazily
-        self.openai_api_key = openai_api_key
+        self.api_key = api_key
         self.model = model
+        self.url = url  # None for OpenAI, the URL of the inference endpoint otherwise
         self.seed = seed
         self.temperature = temperature
         self.multi_turn = multi_turn
@@ -118,7 +121,7 @@ def evaluate_answer(
 
         from openai import OpenAI
 
-        self.client = OpenAI(api_key=self.openai_api_key)
+        self.client = OpenAI(base_url=self.url, api_key=self.api_key)
 
         prompts = [
             self.__get_prompts_single_turn(
diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index 8b06e45c..b1a0b1c9 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -234,7 +234,22 @@ class Metrics(Enum):
         category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
-            judge_model_name="gpt-3.5-turbo",
+            judge_model_name_or_url="gpt-3.5-turbo",
+            template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
+            multi_turn=True,
+        ).compute,
+        corpus_level_fn={
+            "single_turn": np.mean,
+            "multi_turn": np.mean,
+        },
+    )
+    llm_judge_multi_turn_local_endpoint = SampleLevelMetricGrouping(
+        metric_name=["single_turn", "multi_turn"],
+        higher_is_better=True,
+        category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
+        use_case=MetricUseCase.SUMMARIZATION,
+        sample_level_fn=JudgeLLM(
+            judge_model_name_or_url="http://localhost:3000/v1",  # replace with your endpoint URL if needed
             template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
             multi_turn=True,
         ).compute,
@@ -249,7 +264,7 @@ class Metrics(Enum):
         category=MetricCategory.LLM_AS_JUDGE,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
-            judge_model_name="gpt-3.5-turbo",
+            judge_model_name_or_url="gpt-3.5-turbo",
             template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
             multi_turn=False,
         ).compute,
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index b7876dbc..bbefe392 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -40,7 +40,7 @@
 from lighteval.metrics.imports.bert_scorer import BERTScorer
 from lighteval.metrics.imports.data_stats_metric import DataStatsMetric
 from lighteval.metrics.imports.summac import SummaCZS
-from lighteval.metrics.llm_as_judge import JudgeOpenAI
+from lighteval.metrics.llm_as_judge import JudgeEndpoint
 from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip
 from lighteval.tasks.requests import Doc
 from lighteval.utils import as_list
@@ -622,21 +622,26 @@ def edit_similarity(self, s1, s2):
 
 
 class JudgeLLM:
-    available_models = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
+    available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
 
-    def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
-        if judge_model_name not in self.available_models:
-            raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
+    def __init__(self, judge_model_name_or_url: str, template_path: str, multi_turn: bool = False):
+        if judge_model_name_or_url in self.available_models_openai:
+            API_KEY = os.getenv("OPENAI_API_KEY")
+            url = None
+            model = judge_model_name_or_url
+        else:
+            API_KEY = os.getenv("HF_TOKEN")
+            url = judge_model_name_or_url
+            model = "tgi"
 
-        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
         self.multi_turn = multi_turn
-
-        self.judge = JudgeOpenAI(
-            model=judge_model_name,
+        self.judge = JudgeEndpoint(
+            model=model,
+            url=url,
             seed=42,
             temperature=0.0,
             templates_path=template_path,
-            openai_api_key=OPENAI_API_KEY,
+            api_key=API_KEY,
             multi_turn=multi_turn,
         )

From 4067ee46ecdb58ed6b4df2f2a591ae300ac075a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine?=
Date: Thu, 25 Jul 2024 10:39:16 +0200
Subject: [PATCH 2/2] updated inference endpoint system

---
 src/lighteval/metrics/llm_as_judge.py         |  1 +
 src/lighteval/metrics/metrics.py              | 10 +++++-----
 src/lighteval/metrics/metrics_sample.py       | 10 ++++------
 src/lighteval/tasks/extended/mt_bench/main.py |  2 +-
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index 9b6f43f7..4163d73f 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -37,6 +37,7 @@ class JudgeEndpoint:
 
     Args:
         model (str): The name of the model to use.
+        url (str): The endpoint URL to send requests to (None for OpenAI, an Inference Endpoints URL otherwise).
         seed (int): The seed value for generating random responses.
         temperature (float): The temperature value for controlling the randomness of the responses.
         templates_path (str): The path to the JSON file containing the templates for prompts.
diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index b1a0b1c9..a6bab8b1 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -228,13 +228,13 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
-    llm_judge_multi_turn_openai = SampleLevelMetricGrouping(
+    llm_judge_multi_turn_gpt3p5 = SampleLevelMetricGrouping(
         metric_name=["single_turn", "multi_turn"],
         higher_is_better=True,
         category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
-            judge_model_name_or_url="gpt-3.5-turbo",
+            judge_model_name="gpt-3.5-turbo",
             template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
             multi_turn=True,
         ).compute,
@@ -243,13 +243,13 @@ class Metrics(Enum):
             "multi_turn": np.mean,
         },
     )
-    llm_judge_multi_turn_local_endpoint = SampleLevelMetricGrouping(
+    llm_judge_multi_turn_llama3_405 = SampleLevelMetricGrouping(
         metric_name=["single_turn", "multi_turn"],
         higher_is_better=True,
         category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
-            judge_model_name_or_url="http://localhost:3000/v1",  # replace with your endpoint URL if needed
+            judge_model_name="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
             template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
             multi_turn=True,
         ).compute,
@@ -264,7 +264,7 @@ class Metrics(Enum):
         category=MetricCategory.LLM_AS_JUDGE,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
-            judge_model_name_or_url="gpt-3.5-turbo",
+            judge_model_name="gpt-3.5-turbo",
             template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
             multi_turn=False,
         ).compute,
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index bbefe392..b58a9e70 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -624,19 +624,17 @@ def edit_similarity(self, s1, s2):
 class JudgeLLM:
     available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
 
-    def __init__(self, judge_model_name_or_url: str, template_path: str, multi_turn: bool = False):
-        if judge_model_name_or_url in self.available_models_openai:
+    def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
+        if judge_model_name in self.available_models_openai:
             API_KEY = os.getenv("OPENAI_API_KEY")
             url = None
-            model = judge_model_name_or_url
         else:
             API_KEY = os.getenv("HF_TOKEN")
-            url = judge_model_name_or_url
-            model = "tgi"
+            url = "https://api-inference.huggingface.co/v1/"
 
         self.multi_turn = multi_turn
         self.judge = JudgeEndpoint(
-            model=model,
+            model=judge_model_name,
             url=url,
             seed=42,
             temperature=0.0,
diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py
index 77b8f3ee..51f857d3 100644
--- a/src/lighteval/tasks/extended/mt_bench/main.py
+++ b/src/lighteval/tasks/extended/mt_bench/main.py
@@ -55,7 +55,7 @@ def mt_bench_prompt(line, task_name: str = None):
     evaluation_splits=["train"],
     few_shots_split="",
     few_shots_select="random",
-    metric=["llm_judge_multi_turn_openai"],
+    metric=["llm_judge_multi_turn_gpt3p5", "llm_judge_multi_turn_llama3_405"],
     generation_size=1024,
     stop_sequence=[],
 )
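
Reviewer note: below is a minimal sketch of the judge routing the two patches converge on, for quick reference. It assumes OPENAI_API_KEY and HF_TOKEN are set in the environment; resolve_judge_endpoint is a hypothetical helper that only mirrors the selection logic of JudgeLLM.__init__ after PATCH 2/2 and does not contact any endpoint.

import os

# Mirrors JudgeLLM.__init__ after PATCH 2/2: a known OpenAI model name keeps
# url=None (the OpenAI client then falls back to its default base URL); any
# other name is treated as a Hugging Face model id served through the
# OpenAI-compatible serverless Inference API.
AVAILABLE_MODELS_OPENAI = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]


def resolve_judge_endpoint(judge_model_name: str):
    # Returns (model, url, api_key) exactly as JudgeEndpoint would receive them.
    if judge_model_name in AVAILABLE_MODELS_OPENAI:
        return judge_model_name, None, os.getenv("OPENAI_API_KEY")
    return judge_model_name, "https://api-inference.huggingface.co/v1/", os.getenv("HF_TOKEN")


# The two judges wired into mt_bench by this PR:
print(resolve_judge_endpoint("gpt-3.5-turbo"))
print(resolve_judge_endpoint("meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"))

JudgeEndpoint then passes the resolved pair straight to OpenAI(base_url=url, api_key=api_key), which is why a single client code path can serve both backends.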