Use inference endpoints as judge #237

Closed · wants to merge 2 commits
19 changes: 11 additions & 8 deletions src/lighteval/metrics/llm_as_judge.py
@@ -31,19 +31,20 @@
 from lighteval.utils import NO_OPENAI_ERROR_MSG, is_openai_available
 
 
-class JudgeOpenAI:
+class JudgeEndpoint:
     """
-    A class representing a judge for evaluating answers using the OpenAI API.
+    A class representing a judge for evaluating answers using the OpenAI API or the Inference Endpoints API.
 
     Args:
-        model (str): The name of the OpenAI model to use.
+        model (str): The name of the model to use.
         seed (int): The seed value for generating random responses.
         temperature (float): The temperature value for controlling the randomness of the responses.
         templates_path (str): The path to the JSON file containing the templates for prompts.
+        api_key (str): The API key used to create/connect to the endpoint.
 
     Attributes:
-        client: An instance of the OpenAI client.
-        model (str): The name of the OpenAI model.
+        client: An instance of the endpoint client.
+        model (str): The name of the endpoint model.
         seed (int): The seed value, passed to the API when generating responses.
         temperature (float): The temperature value, passed to the API when generating responses.
         templates (dict): A dictionary containing the templates for prompts.
@@ -63,15 +64,17 @@ class JudgeOpenAI:
     def __init__(
         self,
         model: str,
+        url: str,
         seed: int,
         temperature: float,
         templates_path: str,
-        openai_api_key: str,
+        api_key: str,
         multi_turn: bool = False,
     ):
         self.client = None  # loaded lazily
-        self.openai_api_key = openai_api_key
+        self.api_key = api_key
         self.model = model
+        self.url = url  # None for OpenAI, endpoint URL for an inference endpoint
         self.seed = seed
         self.temperature = temperature
         self.multi_turn = multi_turn
@@ -118,7 +121,7 @@ def evaluate_answer(
 
         from openai import OpenAI
 
-        self.client = OpenAI(api_key=self.openai_api_key)
+        self.client = OpenAI(base_url=self.url, api_key=self.api_key)
 
         prompts = [
             self.__get_prompts_single_turn(
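
The mechanism behind this change is that the openai client accepts a base_url, so the same code path can talk either to api.openai.com (base_url=None) or to any OpenAI-compatible server, such as a text-generation-inference (TGI) container. A minimal sketch of that behavior outside lighteval, assuming a TGI endpoint served at http://localhost:3000/v1:

    # Minimal sketch: one client, two backends. The localhost URL is an
    # assumption; point it at wherever your endpoint is served.
    import os

    from openai import OpenAI

    # Hosted judge: with base_url=None the client defaults to https://api.openai.com/v1
    openai_judge = OpenAI(base_url=None, api_key=os.getenv("OPENAI_API_KEY"))

    # Self-hosted judge: same client, with base_url pointed at the endpoint
    local_judge = OpenAI(base_url="http://localhost:3000/v1", api_key=os.getenv("HF_TOKEN"))

    response = local_judge.chat.completions.create(
        model="tgi",  # TGI serves a single model, so the name is a placeholder
        messages=[{"role": "user", "content": "Rate the following answer from 1 to 10: ..."}],
        temperature=0.0,
        seed=42,
    )
    print(response.choices[0].message.content)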
19 changes: 17 additions & 2 deletions src/lighteval/metrics/metrics.py
@@ -234,7 +234,22 @@ class Metrics(Enum):
         category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
-            judge_model_name="gpt-3.5-turbo",
+            judge_model_name_or_url="gpt-3.5-turbo",
             template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
             multi_turn=True,
         ).compute,
         corpus_level_fn={
             "single_turn": np.mean,
             "multi_turn": np.mean,
         },
     )
+    llm_judge_multi_turn_local_endpoint = SampleLevelMetricGrouping(
+        metric_name=["single_turn", "multi_turn"],
+        higher_is_better=True,
+        category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
+        use_case=MetricUseCase.SUMMARIZATION,
+        sample_level_fn=JudgeLLM(
+            judge_model_name_or_url="http://localhost:3000/v1",  # replace with your endpoint URL if needed
+            template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
+            multi_turn=True,
+        ).compute,
@@ -249,7 +264,7 @@ class Metrics(Enum):
         category=MetricCategory.LLM_AS_JUDGE,
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
-            judge_model_name="gpt-3.5-turbo",
+            judge_model_name_or_url="gpt-3.5-turbo",
             template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
             multi_turn=False,
         ).compute,
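
The new llm_judge_multi_turn_local_endpoint entry doubles as a template: to judge against your own endpoint, define an analogous metric and pass its URL as judge_model_name_or_url. A hypothetical sketch, where the metric name and endpoint URL are illustrative and the import paths are assumptions that may vary across lighteval versions:

    # Hypothetical user-defined judge metric backed by a private endpoint,
    # mirroring the llm_judge_multi_turn_local_endpoint entry above.
    import numpy as np

    from lighteval.metrics.metrics_sample import JudgeLLM
    from lighteval.metrics.utils import (  # assumed import path
        MetricCategory,
        MetricUseCase,
        SampleLevelMetricGrouping,
    )

    llm_judge_my_endpoint = SampleLevelMetricGrouping(
        metric_name=["single_turn", "multi_turn"],
        higher_is_better=True,
        category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN,
        use_case=MetricUseCase.SUMMARIZATION,
        sample_level_fn=JudgeLLM(
            judge_model_name_or_url="https://my-endpoint.example.com/v1",  # hypothetical URL
            template_path="judge_prompts.jsonl",  # path to your prompt templates
            multi_turn=True,
        ).compute,
        corpus_level_fn={
            "single_turn": np.mean,
            "multi_turn": np.mean,
        },
    )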
25 changes: 15 additions & 10 deletions src/lighteval/metrics/metrics_sample.py
@@ -40,7 +40,7 @@
 from lighteval.metrics.imports.bert_scorer import BERTScorer
 from lighteval.metrics.imports.data_stats_metric import DataStatsMetric
 from lighteval.metrics.imports.summac import SummaCZS
-from lighteval.metrics.llm_as_judge import JudgeOpenAI
+from lighteval.metrics.llm_as_judge import JudgeEndpoint
 from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip
 from lighteval.tasks.requests import Doc
 from lighteval.utils import as_list
@@ -622,21 +622,26 @@ def edit_similarity(self, s1, s2):
 
 
 class JudgeLLM:
-    available_models = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
+    available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
 
-    def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
-        if judge_model_name not in self.available_models:
-            raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
-
+    def __init__(self, judge_model_name_or_url: str, template_path: str, multi_turn: bool = False):
+        if judge_model_name_or_url in self.available_models_openai:
+            API_KEY = os.getenv("OPENAI_API_KEY")
+            url = None
+            model = judge_model_name_or_url
+        else:
+            API_KEY = os.getenv("HF_TOKEN")
+            url = judge_model_name_or_url
+            model = "tgi"
 
-        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
         self.multi_turn = multi_turn
 
-        self.judge = JudgeOpenAI(
-            model=judge_model_name,
+        self.judge = JudgeEndpoint(
+            model=model,
+            url=url,
             seed=42,
             temperature=0.0,
             templates_path=template_path,
-            openai_api_key=OPENAI_API_KEY,
+            api_key=API_KEY,
             multi_turn=multi_turn,
         )
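
The dispatch in JudgeLLM.__init__ is the heart of the change: a judge spec that matches a known OpenAI model name keeps routing to the OpenAI API with OPENAI_API_KEY, while any other value is treated as the base URL of an OpenAI-compatible endpoint authenticated with HF_TOKEN. Restated as a standalone helper (illustrative, not part of the diff):

    # Standalone restatement of the new routing rule.
    import os
    from typing import Optional, Tuple

    OPENAI_JUDGE_MODELS = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]


    def resolve_judge_backend(spec: str) -> Tuple[Optional[str], str, Optional[str]]:
        """Return (url, model, api_key) for a judge spec.

        Known OpenAI model names route to the OpenAI API; any other value is
        treated as the base URL of an OpenAI-compatible inference endpoint.
        """
        if spec in OPENAI_JUDGE_MODELS:
            return None, spec, os.getenv("OPENAI_API_KEY")
        return spec, "tgi", os.getenv("HF_TOKEN")


    # resolve_judge_backend("gpt-4")                    -> (None, "gpt-4", <OPENAI_API_KEY>)
    # resolve_judge_backend("http://localhost:3000/v1") -> (url, "tgi", <HF_TOKEN>)

One consequence of this dispatch worth noting: a misspelled OpenAI model name no longer raises the old ValueError; it is silently treated as an endpoint URL and only fails at request time.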
