From fdb22c7df00ad796c6070b5122d99e65627a180a Mon Sep 17 00:00:00 2001 From: anilaltuner Date: Fri, 26 Apr 2024 11:24:12 +0300 Subject: [PATCH 01/15] Transformers as Judge added --- src/lighteval/metrics/llm_as_judge.py | 114 ++++++++++++++---------- src/lighteval/metrics/metrics_sample.py | 12 +-- 2 files changed, 73 insertions(+), 53 deletions(-) diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py index 12b637a3..a7f31e08 100644 --- a/src/lighteval/metrics/llm_as_judge.py +++ b/src/lighteval/metrics/llm_as_judge.py @@ -25,29 +25,30 @@ import json import re import time -from typing import Optional +from typing import Optional, Any from openai import OpenAI from lighteval.logging.hierarchical_logger import hlog_warn +from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline -class JudgeOpenAI: +class JudgeLM: """ - A class representing a judge for evaluating answers using the OpenAI API. + A class representing a judge for evaluating answers using the Transformers library. Args: - model (str): The name of the OpenAI model to use. + model (str): The name of the model to use. seed (int): The seed value for generating random responses. temperature (float): The temperature value for controlling the randomness of the responses. templates_path (str): The path to the JSON file containing the templates for prompts. Attributes: - client: An instance of the OpenAI client. - model (str): The name of the OpenAI model. + model (str): The name of the model. seed (int): The seed value, passed to the API when generating responses. temperature (float): The temperature value, passed to the API when generating responses. templates (dict): A dictionary containing the templates for prompts. + judge_type (str): Judge type based on used model. one_score_pattern (re.Pattern): A regular expression pattern for extracting scores from the response. one_score_pattern_backup (re.Pattern): A backup regular expression pattern for extracting scores. API_MAX_RETRY (int): The maximum number of API retries. @@ -55,22 +56,23 @@ class JudgeOpenAI: max_tokens (int): The maximum number of tokens allowed in the response. Methods: - evaluate_answer: Evaluates an answer using the OpenAI API. + evaluate_answer: Evaluates an answer using the OpenAI API or Transformers library. __get_prompts_multi_turn: Generates prompts for multi-turn conversations. __get_prompts_single_turn: Generates prompts for single-turn conversations. __process_judge_response: Processes the judge's response and extracts the score. 
""" def __init__( - self, - model: str, - seed: int, - temperature: float, - templates_path: str, - openai_api_key: str, - multi_turn: bool = False, + self, + model: str, + seed: int, + temperature: float, + templates_path: str, + judge_type: str, + openai_api_key: Optional[str] = None, + multi_turn: bool = False, ): - self.client = OpenAI(api_key=openai_api_key) + self.model = model self.seed = seed self.temperature = temperature @@ -90,31 +92,44 @@ def __init__( self.one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]") self.one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]") - self.API_MAX_RETRY = 16 - self.API_RETRY_SLEEP = 10 - self.max_tokens = 2048 + if judge_type == "openai": + self.client = OpenAI(api_key=openai_api_key) + self.API_MAX_RETRY = 16 + self.API_RETRY_SLEEP = 10 + self.max_tokens = 2048 + else: + transformers_model = AutoModelForCausalLM.from_pretrained(model, + torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model) + self.pipe = pipeline( + "text-generation", + model=transformers_model, + tokenizer=tokenizer, + ) + self.generation_args = { + "max_new_tokens": 500, + "return_full_text": False, + "temperature": temperature, + "do_sample": False, + } def evaluate_answer( - self, questions: list[str], answers: list[str], references: list[str] - ) -> tuple[int, list[dict[str, str]], str]: + self, questions: list[str], answers: list[str], references: list[str] + ) -> tuple[list[int], list[list[dict[str, str]]], list[str | None | Any]]: """ - Evaluates an answer using the OpenAI API. + Evaluates an answer using the OpenAI API or Transformers. Args: questions (list[str]): A list of questions (can be a list because of multi-turn conversations) answers (list[str]): A list of answers, one for each question. references (list[str]): A list of reference answers, one for each question (sometimes not available) - single_turn (bool): Indicates whether the conversation is single-turn or multi-turn. Returns: A tuple containing the score, prompts, and judgment. - - Raises: - Exception: If an error occurs during the API call. 
""" prompts = [ self.__get_prompts_single_turn( - questions[0], answers[0], references[0] if references is not None and len(references) > 0 else None + questions[0], answers[0], references[0] if references and len(references) > 0 else None ) ] @@ -124,34 +139,37 @@ def evaluate_answer( ) prompts.append(prompts_multi_turn) - responses = [] + judgments = [] for prompt in prompts: - for _ in range(self.API_MAX_RETRY): - try: - response = self.client.chat.completions.create( - model=self.model, - seed=self.seed, - temperature=self.temperature, - messages=prompt, - max_tokens=self.max_tokens, - n=1, - ) - responses.append(response) - break - except Exception as e: - hlog_warn(f"{type(e), e}") - time.sleep(self.API_RETRY_SLEEP) - - if len(responses) == 0: - raise Exception("Failed to get response from the API") - - judgments = [response.choices[0].message.content for response in responses] + if hasattr(self, 'client'): + response = self.__call_openai_api(prompt) + else: + response = self.pipe(prompt)[0]['generated_text'] + judgments.append(response) + scores = [self.__process_judge_response(judgment) for judgment in judgments] return scores, prompts, judgments + def __call_openai_api(self, prompt): + for _ in range(self.API_MAX_RETRY): + try: + response = self.client.chat.completions.create( + model=self.model, + seed=self.seed, + temperature=self.temperature, + messages=prompt, + max_tokens=self.max_tokens, + n=1, + ) + return response.choices[0].message.content + except Exception as e: + hlog_warn(f"{type(e), e}") + time.sleep(self.API_RETRY_SLEEP) + raise Exception("Failed to get response from the API") + def __get_prompts_multi_turn( - self, questions: list[str], answers: list[str], references: Optional[list[str]] + self, questions: list[str], answers: list[str], references: Optional[list[str]] ) -> list[dict[str, str]]: """ Generates prompts for multi-turn conversations. The prompts are generated based on the templates. 
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index a3809adb..53581f7b 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -39,7 +39,7 @@ from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.imports.data_stats_metric import DataStatsMetric from lighteval.metrics.imports.summac import SummaCZS -from lighteval.metrics.llm_as_judge import JudgeOpenAI +from lighteval.metrics.llm_as_judge import JudgeLM from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip from lighteval.tasks.requests import Doc from lighteval.utils import as_list @@ -619,28 +619,30 @@ def edit_similarity(self, s1, s2): edist = edit_distance(s1, s2) return 1.0 - edist / max(len(s1), len(s2)) if len(s1) > 0 and len(s2) > 0 else 0 - class JudgeLLM: available_models = ["gpt-3.5-turbo"] def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False): if judge_model_name not in self.available_models: - raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric") + judge_type = "openai" + else: + judge_type = "transformers" OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") self.multi_turn = multi_turn try: - self.judge = JudgeOpenAI( + self.judge = JudgeLM( model=judge_model_name, seed=42, temperature=0.0, templates_path=template_path, + judge_type=judge_type, openai_api_key=OPENAI_API_KEY, multi_turn=multi_turn, ) except Exception as e: - print(f"Could not initialize the JudgeOpenAI model:\n{e}") + print(f"Could not initialize the JudgeLLM model:\n{e}") self.judge = None def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: From 6906b04f3775becd0217ea9c0568b7a543b908f5 Mon Sep 17 00:00:00 2001 From: anilaltuner Date: Fri, 26 Apr 2024 11:24:58 +0300 Subject: [PATCH 02/15] Transformers as Judge added --- src/lighteval/metrics/metrics_sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 53581f7b..6b071d54 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -642,7 +642,7 @@ def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = multi_turn=multi_turn, ) except Exception as e: - print(f"Could not initialize the JudgeLLM model:\n{e}") + print(f"Could not initialize the JudgeLM model:\n{e}") self.judge = None def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: From 8a33c121edab14619d4d0afb87ed5a19eec34a1c Mon Sep 17 00:00:00 2001 From: anilaltuner Date: Tue, 30 Apr 2024 16:58:29 +0300 Subject: [PATCH 03/15] Formatting fix --- src/lighteval/metrics/llm_as_judge.py | 34 ++++++++++++------------- src/lighteval/metrics/metrics_sample.py | 1 + 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py index a7f31e08..9d717d0f 100644 --- a/src/lighteval/metrics/llm_as_judge.py +++ b/src/lighteval/metrics/llm_as_judge.py @@ -25,12 +25,12 @@ import json import re import time -from typing import Optional, Any +from typing import Any, Optional from openai import OpenAI +from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline from lighteval.logging.hierarchical_logger import hlog_warn -from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline class JudgeLM: @@ -63,16 +63,15 
@@ class JudgeLM: """ def __init__( - self, - model: str, - seed: int, - temperature: float, - templates_path: str, - judge_type: str, - openai_api_key: Optional[str] = None, - multi_turn: bool = False, + self, + model: str, + seed: int, + temperature: float, + templates_path: str, + judge_type: str, + openai_api_key: Optional[str] = None, + multi_turn: bool = False, ): - self.model = model self.seed = seed self.temperature = temperature @@ -98,8 +97,9 @@ def __init__( self.API_RETRY_SLEEP = 10 self.max_tokens = 2048 else: - transformers_model = AutoModelForCausalLM.from_pretrained(model, - torch_dtype="auto", trust_remote_code=True) + transformers_model = AutoModelForCausalLM.from_pretrained( + model, torch_dtype="auto", trust_remote_code=True + ) tokenizer = AutoTokenizer.from_pretrained(model) self.pipe = pipeline( "text-generation", @@ -114,7 +114,7 @@ def __init__( } def evaluate_answer( - self, questions: list[str], answers: list[str], references: list[str] + self, questions: list[str], answers: list[str], references: list[str] ) -> tuple[list[int], list[list[dict[str, str]]], list[str | None | Any]]: """ Evaluates an answer using the OpenAI API or Transformers. @@ -141,10 +141,10 @@ def evaluate_answer( judgments = [] for prompt in prompts: - if hasattr(self, 'client'): + if hasattr(self, "client"): response = self.__call_openai_api(prompt) else: - response = self.pipe(prompt)[0]['generated_text'] + response = self.pipe(prompt)[0]["generated_text"] judgments.append(response) scores = [self.__process_judge_response(judgment) for judgment in judgments] @@ -169,7 +169,7 @@ def __call_openai_api(self, prompt): raise Exception("Failed to get response from the API") def __get_prompts_multi_turn( - self, questions: list[str], answers: list[str], references: Optional[list[str]] + self, questions: list[str], answers: list[str], references: Optional[list[str]] ) -> list[dict[str, str]]: """ Generates prompts for multi-turn conversations. The prompts are generated based on the templates. 
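For reference, a minimal sketch of how the [[score]] patterns compiled in JudgeLM.__init__ turn a judge reply into a numeric rating; the example reply is invented, and the -1 fallback mirrors the behaviour of __process_judge_response shown later in the series:

import ast
import re

# Patterns copied from JudgeLM.__init__ above.
one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")

judgment = "The answer is concise and factually correct. Rating: [[8]]"  # example reply

match = one_score_pattern.search(judgment) or one_score_pattern_backup.search(judgment)
rating = ast.literal_eval(match.group(1)) if match else -1  # -1 marks an unparsable judgment
print(rating)  # 8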
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 6b071d54..d9f828a1 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -619,6 +619,7 @@ def edit_similarity(self, s1, s2): edist = edit_distance(s1, s2) return 1.0 - edist / max(len(s1), len(s2)) if len(s1) > 0 and len(s2) > 0 else 0 + class JudgeLLM: available_models = ["gpt-3.5-turbo"] From aef34a7c206e743d0c446d5fda7396bfa94f9539 Mon Sep 17 00:00:00 2001 From: anilaltuner Date: Thu, 23 May 2024 11:52:05 +0300 Subject: [PATCH 04/15] Check model from HfApi --- src/lighteval/metrics/metrics_sample.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index d9f828a1..dc4a75ba 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -28,6 +28,7 @@ import nltk import numpy as np +from huggingface_hub import HfApi from nltk.metrics.distance import edit_distance from nltk.tokenize import word_tokenize from nltk.tokenize.treebank import TreebankWordTokenizer @@ -621,13 +622,18 @@ def edit_similarity(self, s1, s2): class JudgeLLM: - available_models = ["gpt-3.5-turbo"] + gpt_models = ["gpt-3.5-turbo"] def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False): - if judge_model_name not in self.available_models: + if judge_model_name in self.gpt_models: judge_type = "openai" else: - judge_type = "transformers" + api = HfApi() + models = api.list_models(model_name=judge_model_name) + if models: + judge_type = "transformers" + else: + raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric") OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") self.multi_turn = multi_turn From 1e0a72007de42418ad5765ce4c792ecce1aa7708 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 11 Jul 2024 14:01:27 +0200 Subject: [PATCH 05/15] fixes after merge with main --- pyproject.toml | 2 +- src/lighteval/metrics/llm_as_judge.py | 4 ++++ src/lighteval/metrics/metrics.py | 8 ++++---- src/lighteval/metrics/metrics_sample.py | 6 +++--- src/lighteval/tasks/extended/mt_bench/main.py | 2 +- src/lighteval/tasks/lighteval_task.py | 14 +------------- src/lighteval/tasks/registry.py | 1 + src/lighteval/tasks/requests.py | 1 + src/lighteval/utils.py | 2 +- 9 files changed, 17 insertions(+), 23 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b771942d..53639da8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,7 +92,7 @@ tests = ["pytest==7.4.0"] dev = ["lighteval[accelerate,quality,tests]"] extended_tasks = [ "langdetect", # ifeval - "openai", # mt-bench + #"openai", # mt-bench ] [project.urls] diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py index ca9e6b8e..8ebe82a2 100644 --- a/src/lighteval/metrics/llm_as_judge.py +++ b/src/lighteval/metrics/llm_as_judge.py @@ -92,11 +92,15 @@ def __init__( self.one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]") if judge_type == "openai": + from openai import OpenAI + self.client = OpenAI(api_key=openai_api_key) self.API_MAX_RETRY = 16 self.API_RETRY_SLEEP = 10 self.max_tokens = 2048 else: + from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline + transformers_model = AutoModelForCausalLM.from_pretrained( model, torch_dtype="auto", trust_remote_code=True ) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 
3c83625c..88f3492b 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -227,13 +227,13 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - llm_judge_multi_turn_openai = SampleLevelMetricGrouping( + llm_judge_multi_turn = SampleLevelMetricGrouping( metric=["single_turn", "multi_turn"], higher_is_better=True, category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN, use_case=MetricUseCase.SUMMARIZATION, sample_level_fn=JudgeLLM( - judge_model_name="gpt-3.5-turbo", + judge_model_name="gpt2", template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"), multi_turn=True, ).compute, @@ -242,13 +242,13 @@ class Metrics(Enum): "multi_turn": np.mean, }, ) - llm_judge_openai = SampleLevelMetricGrouping( + llm_judge = SampleLevelMetricGrouping( metric=["judge_score"], higher_is_better=True, category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.SUMMARIZATION, sample_level_fn=JudgeLLM( - judge_model_name="gpt-3.5-turbo", + judge_model_name="gpt2", template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"), multi_turn=False, ).compute, diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index c6dbb965..054eb817 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -639,13 +639,13 @@ def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") self.multi_turn = multi_turn - self.judge = JudgeOpenAI( + self.judge = JudgeLM( model=judge_model_name, + judge_type=judge_type, + openai_api_key=OPENAI_API_KEY, seed=42, temperature=0.0, templates_path=template_path, - openai_api_key=OPENAI_API_KEY, - multi_turn=multi_turn, ) def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index 77b8f3ee..8fb2a2a6 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -55,7 +55,7 @@ def mt_bench_prompt(line, task_name: str = None): evaluation_splits=["train"], few_shots_split="", few_shots_select="random", - metric=["llm_judge_multi_turn_openai"], + metric=["llm_judge_multi_turn"], generation_size=1024, stop_sequence=[], ) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index b92cb8fa..3ea76121 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -21,7 +21,6 @@ # SOFTWARE. 
import collections -import os import random from dataclasses import dataclass from multiprocessing import Pool @@ -54,7 +53,7 @@ RequestType, TaskExampleId, ) -from lighteval.utils import NO_OPENAI_ERROR_MSG, as_list, is_openai_available +from lighteval.utils import as_list if TYPE_CHECKING: @@ -184,17 +183,6 @@ def __init__( # noqa: C901 if len(ignored) > 0: hlog_warn(f"[WARNING] Not implemented yet: ignoring the metric {' ,'.join(ignored)} for task {self.name}.") - if any( - Metrics[metric].value.category in [MetricCategory.LLM_AS_JUDGE, MetricCategory.LLM_AS_JUDGE_MULTI_TURN] - for metric in self.metrics - ): - if not is_openai_available(): - raise ImportError(NO_OPENAI_ERROR_MSG) - if os.getenv("OPENAI_API_KEY") is None: - raise ValueError( - "Using llm as judge metric but no OPEN_API_KEY were found, please set it with: export OPEN_API_KEY={yourkey}" - ) - current_categories = [Metrics[metric].value.category for metric in self.metrics] self.has_metric_category = {category: (category in current_categories) for category in MetricCategory} # Sub-optimal system - we might want to store metric parametrisation in a yaml conf for example diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index ef575b7e..4726b19b 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -103,6 +103,7 @@ def get_task_class( return custom_tasks_registry[task_name] hlog_warn(f"{task_name} not found in provided tasks") hlog_warn(pformat(self.TASK_REGISTRY)) + raise ValueError( f"Cannot find tasks {task_name} in task list or in custom task registry ({custom_tasks_registry})" ) diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index 6dd30786..2bd69023 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -136,6 +136,7 @@ class GreedyUntilMultiTurnRequest(Request): stop_sequence: str generation_size: int request_type = RequestType.GREEDY_UNTIL_MULTI_TURN + use_logits: bool = False class TaskExampleId(NamedTuple): diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py index 3e032d1f..16235785 100644 --- a/src/lighteval/utils.py +++ b/src/lighteval/utils.py @@ -209,7 +209,7 @@ def is_openai_available() -> bool: def can_load_extended_tasks() -> bool: imports = [] - for package in ["langdetect", "openai"]: + for package in ["langdetect"]: imports.append(importlib.util.find_spec(package)) return all(cur_import is not None for cur_import in imports) From a54b88831c3083a133b7e1f1ca0c6e5be57faa81 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 24 Jul 2024 14:39:15 +0000 Subject: [PATCH 06/15] linting --- pyproject.toml | 1 - src/lighteval/metrics/llm_as_judge.py | 86 ++++++------------------- src/lighteval/metrics/metrics.py | 4 +- src/lighteval/metrics/metrics_sample.py | 25 ++----- src/lighteval/models/base_model.py | 1 - src/lighteval/utils.py | 7 -- 6 files changed, 27 insertions(+), 97 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 53639da8..7e3390de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,7 +92,6 @@ tests = ["pytest==7.4.0"] dev = ["lighteval[accelerate,quality,tests]"] extended_tasks = [ "langdetect", # ifeval - #"openai", # mt-bench ] [project.urls] diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py index 8ebe82a2..bcd6d773 100644 --- a/src/lighteval/metrics/llm_as_judge.py +++ b/src/lighteval/metrics/llm_as_judge.py @@ -24,12 +24,8 @@ import ast import json import re -import time from typing import Any, Optional 
-from lighteval.logging.hierarchical_logger import hlog_warn -from lighteval.utils import NO_OPENAI_ERROR_MSG, is_openai_available - class JudgeLM: """ @@ -63,19 +59,12 @@ class JudgeLM: def __init__( self, model: str, - seed: int, - temperature: float, templates_path: str, - judge_type: str, - openai_api_key: Optional[str] = None, multi_turn: bool = False, ): - self.client = None # loaded lazily - self.openai_api_key = openai_api_key - self.model = model - self.seed = seed - self.temperature = temperature self.multi_turn = multi_turn + self.pipe = None + self.model = model data = [] with open(templates_path, "r") as f: @@ -91,37 +80,11 @@ def __init__( self.one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]") self.one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]") - if judge_type == "openai": - from openai import OpenAI - - self.client = OpenAI(api_key=openai_api_key) - self.API_MAX_RETRY = 16 - self.API_RETRY_SLEEP = 10 - self.max_tokens = 2048 - else: - from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline - - transformers_model = AutoModelForCausalLM.from_pretrained( - model, torch_dtype="auto", trust_remote_code=True - ) - tokenizer = AutoTokenizer.from_pretrained(model) - self.pipe = pipeline( - "text-generation", - model=transformers_model, - tokenizer=tokenizer, - ) - self.generation_args = { - "max_new_tokens": 500, - "return_full_text": False, - "temperature": temperature, - "do_sample": False, - } - def evaluate_answer( self, questions: list[str], answers: list[str], references: list[str] ) -> tuple[list[int], list[list[dict[str, str]]], list[str | None | Any]]: """ - Evaluates an answer using the OpenAI API or Transformers. + Evaluates an answer using Transformers. Args: questions (list[str]): A list of questions (can be a list because of multi-turn conversations) @@ -131,13 +94,21 @@ def evaluate_answer( Returns: A tuple containing the score, prompts, and judgment. 
""" - if self.client is None: - if not is_openai_available(): - raise ImportError(NO_OPENAI_ERROR_MSG) - - from openai import OpenAI + # lazy loading of the pipeline + if self.pipe is None: + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline - self.client = OpenAI(api_key=self.openai_api_key) + transformers_model = AutoModelForCausalLM.from_pretrained( + self.model, torch_dtype=torch.bfloat16, trust_remote_code=False, device_map="cuda" + ) + tokenizer = AutoTokenizer.from_pretrained(self.model) + self.pipe = pipeline( + "text-generation", + model=transformers_model, + tokenizer=tokenizer, + max_new_tokens=50, + ) prompts = [ self.__get_prompts_single_turn( @@ -153,33 +124,14 @@ def evaluate_answer( judgments = [] for prompt in prompts: - if hasattr(self, "client"): - response = self.__call_openai_api(prompt) - else: - response = self.pipe(prompt)[0]["generated_text"] + response = self.pipe(prompt)[0]["generated_text"] + response = response[-1]["content"] judgments.append(response) scores = [self.__process_judge_response(judgment) for judgment in judgments] return scores, prompts, judgments - def __call_openai_api(self, prompt): - for _ in range(self.API_MAX_RETRY): - try: - response = self.client.chat.completions.create( - model=self.model, - seed=self.seed, - temperature=self.temperature, - messages=prompt, - max_tokens=self.max_tokens, - n=1, - ) - return response.choices[0].message.content - except Exception as e: - hlog_warn(f"{type(e), e}") - time.sleep(self.API_RETRY_SLEEP) - raise Exception("Failed to get response from the API") - def __get_prompts_multi_turn( self, questions: list[str], answers: list[str], references: Optional[list[str]] ) -> list[dict[str, str]]: diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 88f3492b..8d559076 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -233,7 +233,7 @@ class Metrics(Enum): category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN, use_case=MetricUseCase.SUMMARIZATION, sample_level_fn=JudgeLLM( - judge_model_name="gpt2", + judge_model_name="HuggingFaceH4/zephyr-7b-alpha", template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"), multi_turn=True, ).compute, @@ -248,7 +248,7 @@ class Metrics(Enum): category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.SUMMARIZATION, sample_level_fn=JudgeLLM( - judge_model_name="gpt2", + judge_model_name="HuggingFaceH4/zephyr-7b-alpha", template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"), multi_turn=False, ).compute, diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 054eb817..1d9ebaa7 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -24,7 +24,6 @@ using simple function (min, mean, max, ...) at the corpus level. Most metrics fall under this category. 
""" -import os from typing import Union import nltk @@ -623,29 +622,17 @@ def edit_similarity(self, s1, s2): class JudgeLLM: - gpt_models = ["gpt-3.5-turbo"] - def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False): - if judge_model_name in self.gpt_models: - judge_type = "openai" - else: - api = HfApi() - models = api.list_models(model_name=judge_model_name) - if models: - judge_type = "transformers" - else: - raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric") - - OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") - self.multi_turn = multi_turn + api = HfApi() + models = api.list_models(model_name=judge_model_name) + if not models: + raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric") + self.multi_turn = multi_turn self.judge = JudgeLM( model=judge_model_name, - judge_type=judge_type, - openai_api_key=OPENAI_API_KEY, - seed=42, - temperature=0.0, templates_path=template_path, + multi_turn=multi_turn, ) def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 3e483d44..d383aae7 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -355,7 +355,6 @@ def greedy_until_multi_turn( # noqa: C901 max_generated_tokens = request.generation_size context = request.context[0] max_context_size_allowed = self.max_length - max_generated_tokens - model_inputs = self.tokenizer( context, padding=True, diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py index 16235785..c529db59 100644 --- a/src/lighteval/utils.py +++ b/src/lighteval/utils.py @@ -200,13 +200,6 @@ def is_tensorboardX_available() -> bool: ) -def is_openai_available() -> bool: - return importlib.util.find_spec("openai") is not None - - -NO_OPENAI_ERROR_MSG = "You are trying to use an Open AI LLM as a judge, for which you need `openai`, which is not available in your environment. Please install it using pip." - - def can_load_extended_tasks() -> bool: imports = [] for package in ["langdetect"]: From ba8c7bf133d2512548cbb4278622ad5d9a898c94 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 24 Jul 2024 15:20:07 +0000 Subject: [PATCH 07/15] fix doc --- src/lighteval/metrics/llm_as_judge.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py index bcd6d773..2bcdcb7e 100644 --- a/src/lighteval/metrics/llm_as_judge.py +++ b/src/lighteval/metrics/llm_as_judge.py @@ -33,21 +33,14 @@ class JudgeLM: Args: model (str): The name of the model to use. - seed (int): The seed value for generating random responses. - temperature (float): The temperature value for controlling the randomness of the responses. templates_path (str): The path to the JSON file containing the templates for prompts. + multi_turn (bool): Whether to use multi-turn prompts Attributes: model (str): The name of the model. - seed (int): The seed value, passed to the API when generating responses. - temperature (float): The temperature value, passed to the API when generating responses. templates (dict): A dictionary containing the templates for prompts. - judge_type (str): Judge type based on used model. one_score_pattern (re.Pattern): A regular expression pattern for extracting scores from the response. one_score_pattern_backup (re.Pattern): A backup regular expression pattern for extracting scores. 
- API_MAX_RETRY (int): The maximum number of API retries. - API_RETRY_SLEEP (int): The sleep time between API retries. - max_tokens (int): The maximum number of tokens allowed in the response. Methods: evaluate_answer: Evaluates an answer using the OpenAI API or Transformers library. From e6c9a9981971ffc4104f80a27aa14be6dc9aefa0 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 24 Jul 2024 15:54:38 +0000 Subject: [PATCH 08/15] merge --- src/lighteval/metrics/metrics.py | 4 ++-- src/lighteval/tasks/extended/mt_bench/main.py | 3 ++- src/lighteval/tasks/lighteval_task.py | 10 ---------- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index c5788e87..66b2f55f 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -228,7 +228,7 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - llm_judge_multi_turn_openai = SampleLevelMetricGrouping( + llm_judge_multi_turn = SampleLevelMetricGrouping( metric_name=["single_turn", "multi_turn"], higher_is_better=True, category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN, @@ -243,7 +243,7 @@ class Metrics(Enum): "multi_turn": np.mean, }, ) - llm_judge_openai = SampleLevelMetricGrouping( + llm_judge = SampleLevelMetricGrouping( metric_name=["judge_score"], higher_is_better=True, category=MetricCategory.LLM_AS_JUDGE, diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index 8fb2a2a6..fa7a874d 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -23,6 +23,7 @@ # ruff: noqa: F405, F403, F401, I001 from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc +from lighteval.metrics.metrics import Metrics def mt_bench_prompt(line, task_name: str = None): @@ -55,7 +56,7 @@ def mt_bench_prompt(line, task_name: str = None): evaluation_splits=["train"], few_shots_split="", few_shots_select="random", - metric=["llm_judge_multi_turn"], + metric=[Metrics.llm_judge_multi_turn], generation_size=1024, stop_sequence=[], ) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 40388e10..f27c894a 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -188,16 +188,6 @@ def __init__( # noqa: C901 current_categories = [metric.category for metric in self.metrics] self.has_metric_category = {category: (category in current_categories) for category in MetricCategory} - if ( - self.has_metric_category[MetricCategory.LLM_AS_JUDGE] - or self.has_metric_category[MetricCategory.LLM_AS_JUDGE_MULTI_TURN] - ): - if not is_openai_available(): - raise ImportError(NO_OPENAI_ERROR_MSG) - if os.getenv("OPENAI_API_KEY") is None: - raise ValueError( - "Using llm as judge metric but no OPEN_API_KEY were found, please set it with: export OPEN_API_KEY={yourkey}" - ) # We assume num_samples always contains 1 (for base generative evals) self.num_samples = [1] From a75d0576eda2c2db9d52527ad3034070c7c461a1 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 25 Jul 2024 12:52:07 +0000 Subject: [PATCH 09/15] readd openai and adds llama 3 405b as judge --- src/lighteval/logging/evaluation_tracker.py | 5 +- src/lighteval/metrics/llm_as_judge.py | 79 +++++++++++++++---- src/lighteval/metrics/metrics.py | 41 ++++++++-- src/lighteval/metrics/metrics_sample.py | 27 +++++-- src/lighteval/tasks/extended/mt_bench/main.py | 2 +- 5 files 
changed, 124 insertions(+), 30 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 370758d0..e6d1846f 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -57,7 +57,10 @@ class EnhancedJSONEncoder(json.JSONEncoder): def default(self, o): if is_dataclass(o): - return asdict(o) + try: + return asdict(o) + except Exception: + return str(o) if callable(o): return o.__name__ if isinstance(o, Enum): diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py index 2bcdcb7e..3e56c2f0 100644 --- a/src/lighteval/metrics/llm_as_judge.py +++ b/src/lighteval/metrics/llm_as_judge.py @@ -24,8 +24,11 @@ import ast import json import re +import time from typing import Any, Optional +from lighteval.logging.hierarchical_logger import hlog_warn + class JudgeLM: """ @@ -54,9 +57,11 @@ def __init__( model: str, templates_path: str, multi_turn: bool = False, + use_transformers: bool = False, + url: Optional[str] = None, + api_key: Optional[str] = None, ): self.multi_turn = multi_turn - self.pipe = None self.model = model data = [] @@ -72,6 +77,39 @@ def __init__( # the second is for the backup case: [score] self.one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]") self.one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]") + self.API_MAX_RETRY = 3 + self.API_RETRY_SLEEP = 1 + + self.client = None + self.pipe = None + self.use_transformers = use_transformers + self.url = url + self.api_key = api_key + + def lazy_load_client(self): + if not self.use_transformers: + if self.client is None: + from openai import OpenAI + + if self.url is None: + self.client = OpenAI(api_key=self.api_key) + else: + self.client = OpenAI(base_url=self.url, api_key=self.api_key) + else: + if self.pipe is None: + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline + + transformers_model = AutoModelForCausalLM.from_pretrained( + self.model, torch_dtype=torch.bfloat16, trust_remote_code=False, device_map="cuda" + ) + tokenizer = AutoTokenizer.from_pretrained(self.model) + self.pipe = pipeline( + "text-generation", + model=transformers_model, + tokenizer=tokenizer, + max_new_tokens=50, + ) def evaluate_answer( self, questions: list[str], answers: list[str], references: list[str] @@ -88,20 +126,7 @@ def evaluate_answer( A tuple containing the score, prompts, and judgment. 
""" # lazy loading of the pipeline - if self.pipe is None: - import torch - from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline - - transformers_model = AutoModelForCausalLM.from_pretrained( - self.model, torch_dtype=torch.bfloat16, trust_remote_code=False, device_map="cuda" - ) - tokenizer = AutoTokenizer.from_pretrained(self.model) - self.pipe = pipeline( - "text-generation", - model=transformers_model, - tokenizer=tokenizer, - max_new_tokens=50, - ) + self.lazy_load_client() prompts = [ self.__get_prompts_single_turn( @@ -117,8 +142,11 @@ def evaluate_answer( judgments = [] for prompt in prompts: - response = self.pipe(prompt)[0]["generated_text"] - response = response[-1]["content"] + if self.client is not None: + response = self.__call_openai_api(prompt) + else: + response = self.pipe(prompt)[0]["generated_text"] + response = response[-1]["content"] judgments.append(response) scores = [self.__process_judge_response(judgment) for judgment in judgments] @@ -202,3 +230,20 @@ def __process_judge_response(self, judgment: str) -> int: rating = -1 return rating + + def __call_openai_api(self, prompt): + for _ in range(self.API_MAX_RETRY): + try: + response = self.client.chat.completions.create( + model=self.model, + # seed=self.seed, + # temperature=self.temperature, + messages=prompt, + max_tokens=512, + n=1, + ) + return response.choices[0].message.content + except Exception as e: + hlog_warn(f"{type(e), e}") + time.sleep(self.API_RETRY_SLEEP) + raise Exception("Failed to get response from the API") diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 66b2f55f..068e086e 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -228,13 +228,28 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - llm_judge_multi_turn = SampleLevelMetricGrouping( + llm_judge_multi_turn_gpt3p5 = SampleLevelMetricGrouping( metric_name=["single_turn", "multi_turn"], - higher_is_better=True, + higher_is_better={"single_turn": True, "multi_turn": True}, + category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN, + use_case=MetricUseCase.SUMMARIZATION, + sample_level_fn=JudgeLLM( + judge_model_name="gpt-3.5-turbo", + template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"), + multi_turn=True, + ).compute, + corpus_level_fn={ + "single_turn": np.mean, + "multi_turn": np.mean, + }, + ) + llm_judge_multi_turn_llama_3_405b = SampleLevelMetricGrouping( + metric_name=["single_turn", "multi_turn"], + higher_is_better={"single_turn": True, "multi_turn": True}, category=MetricCategory.LLM_AS_JUDGE_MULTI_TURN, use_case=MetricUseCase.SUMMARIZATION, sample_level_fn=JudgeLLM( - judge_model_name="HuggingFaceH4/zephyr-7b-alpha", + judge_model_name="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8", template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"), multi_turn=True, ).compute, @@ -243,13 +258,27 @@ class Metrics(Enum): "multi_turn": np.mean, }, ) - llm_judge = SampleLevelMetricGrouping( + llm_judge_gpt3p5 = SampleLevelMetricGrouping( metric_name=["judge_score"], - higher_is_better=True, + higher_is_better={"judge_score": True}, + category=MetricCategory.LLM_AS_JUDGE, + use_case=MetricUseCase.SUMMARIZATION, + sample_level_fn=JudgeLLM( + judge_model_name="gpt-3.5-turbo", + template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"), + multi_turn=False, + ).compute, + corpus_level_fn={ + "judge_score": np.mean, + }, + ) + llm_judge_llama_3_405b = SampleLevelMetricGrouping( + 
metric_name=["judge_score"], + higher_is_better={"judge_score": True}, category=MetricCategory.LLM_AS_JUDGE, use_case=MetricUseCase.SUMMARIZATION, sample_level_fn=JudgeLLM( - judge_model_name="HuggingFaceH4/zephyr-7b-alpha", + judge_model_name="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8", template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"), multi_turn=False, ).compute, diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 6d872fb0..385f15aa 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -24,6 +24,7 @@ using simple function (min, mean, max, ...) at the corpus level. Most metrics fall under this category. """ +import os from typing import Union import nltk @@ -626,17 +627,33 @@ def edit_similarity(self, s1, s2): class JudgeLLM: - def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False): - api = HfApi() - models = api.list_models(model_name=judge_model_name) - if not models: - raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric") + available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"] + + def __init__( + self, judge_model_name: str, template_path: str, multi_turn: bool = False, use_transformers: bool = False + ) -> None: + if judge_model_name in self.available_models_openai: + api_key = os.getenv("OPENAI_API_KEY") + url = None + elif not use_transformers: + api_key = os.getenv("HF_TOKEN") + url = "https://api-inference.huggingface.co/v1/" + else: + api = HfApi() + models = api.list_models(model_name=judge_model_name) + url = None + api_key = None + if not models: + raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric") self.multi_turn = multi_turn self.judge = JudgeLM( model=judge_model_name, templates_path=template_path, multi_turn=multi_turn, + use_transformers=use_transformers, + api_key=api_key, + url=url, ) def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]: diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index fa7a874d..84e63d8a 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -56,7 +56,7 @@ def mt_bench_prompt(line, task_name: str = None): evaluation_splits=["train"], few_shots_split="", few_shots_select="random", - metric=[Metrics.llm_judge_multi_turn], + metric=[Metrics.llm_judge_multi_turn_llama_3_405b], generation_size=1024, stop_sequence=[], ) From 0ce15da07b0e15edad7e126c980d4cf8d2807c52 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 30 Jul 2024 12:25:52 +0000 Subject: [PATCH 10/15] fixing PR comments --- pyproject.toml | 1 + src/lighteval/metrics/llm_as_judge.py | 52 +++++++++++++------ src/lighteval/metrics/metrics_sample.py | 1 - src/lighteval/models/base_model.py | 1 + src/lighteval/tasks/extended/mt_bench/main.py | 2 +- src/lighteval/tasks/registry.py | 1 - src/lighteval/utils.py | 9 +++- 7 files changed, 46 insertions(+), 21 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7576d40e..e301d7af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,6 +92,7 @@ tests = ["pytest==7.4.0"] dev = ["lighteval[accelerate,quality,tests]"] extended_tasks = [ "langdetect", # ifeval + "openai", # llm as a judge using openai models ] [project.urls] diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py index 
3e56c2f0..ab18e80d 100644 --- a/src/lighteval/metrics/llm_as_judge.py +++ b/src/lighteval/metrics/llm_as_judge.py @@ -27,29 +27,43 @@ import time from typing import Any, Optional +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline + from lighteval.logging.hierarchical_logger import hlog_warn class JudgeLM: """ - A class representing a judge for evaluating answers using the Transformers library. + A class representing a judge for evaluating answers using either the OpeanAI or Transformers library. Args: model (str): The name of the model to use. templates_path (str): The path to the JSON file containing the templates for prompts. multi_turn (bool): Whether to use multi-turn prompts + url (Optional[str]): The URL for the OpenAI API. + api_key (Optional[str]): The API key for the OpenAI API (either OpenAI or HF key). Attributes: model (str): The name of the model. templates (dict): A dictionary containing the templates for prompts. one_score_pattern (re.Pattern): A regular expression pattern for extracting scores from the response. one_score_pattern_backup (re.Pattern): A backup regular expression pattern for extracting scores. + API_MAX_RETRY (int): The maximum number of retries for the API. + API_RETRY_SLEEP (int): The sleep time between retries. + client (Optional[OpenAI]): The OpenAI client. + pipe (Optional[pipeline]): The Transformers pipeline. + use_transformers (bool): Whether to use the Transformers library. + url (Optional[str]): The URL for the OpenAI API. + api_key (Optional[str]): The API key for the OpenAI API (either OpenAI or HF key). Methods: evaluate_answer: Evaluates an answer using the OpenAI API or Transformers library. __get_prompts_multi_turn: Generates prompts for multi-turn conversations. __get_prompts_single_turn: Generates prompts for single-turn conversations. __process_judge_response: Processes the judge's response and extracts the score. + __call_openai_api: Calls the OpenAI API to get the judge's response. + __lazy_load_client: Lazy loads the OpenAI client or Transformers pipeline. """ def __init__( @@ -57,7 +71,6 @@ def __init__( model: str, templates_path: str, multi_turn: bool = False, - use_transformers: bool = False, url: Optional[str] = None, api_key: Optional[str] = None, ): @@ -82,24 +95,21 @@ def __init__( self.client = None self.pipe = None - self.use_transformers = use_transformers - self.url = url - self.api_key = api_key - def lazy_load_client(self): - if not self.use_transformers: - if self.client is None: - from openai import OpenAI + if url is not None and api_key is None: + raise ValueError("API key must be provided if using a custom URL. 
`export HF_TOKEN=your_token`") - if self.url is None: - self.client = OpenAI(api_key=self.api_key) - else: - self.client = OpenAI(base_url=self.url, api_key=self.api_key) + if url is None and api_key is None: + self.use_transformers = True else: - if self.pipe is None: - import torch - from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline + self.use_transformers = False + + self.url = url + self.api_key = api_key + def __lazy_load_client(self): + if self.use_transformers: + if self.pipe is None: transformers_model = AutoModelForCausalLM.from_pretrained( self.model, torch_dtype=torch.bfloat16, trust_remote_code=False, device_map="cuda" ) @@ -110,6 +120,14 @@ def lazy_load_client(self): tokenizer=tokenizer, max_new_tokens=50, ) + else: + if self.client is None: + from openai import OpenAI + + if self.url is None: + self.client = OpenAI(api_key=self.api_key) + else: + self.client = OpenAI(base_url=self.url, api_key=self.api_key) def evaluate_answer( self, questions: list[str], answers: list[str], references: list[str] @@ -126,7 +144,7 @@ def evaluate_answer( A tuple containing the score, prompts, and judgment. """ # lazy loading of the pipeline - self.lazy_load_client() + self.__lazy_load_client() prompts = [ self.__get_prompts_single_turn( diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 385f15aa..c729655c 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -651,7 +651,6 @@ def __init__( model=judge_model_name, templates_path=template_path, multi_turn=multi_turn, - use_transformers=use_transformers, api_key=api_key, url=url, ) diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 79fced37..f1ba6151 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -351,6 +351,7 @@ def greedy_until_multi_turn( # noqa: C901 max_generated_tokens = request.generation_size context = request.context[0] max_context_size_allowed = self.max_length - max_generated_tokens + model_inputs = self.tokenizer( context, padding=True, diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index 84e63d8a..e6b69518 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -56,7 +56,7 @@ def mt_bench_prompt(line, task_name: str = None): evaluation_splits=["train"], few_shots_split="", few_shots_select="random", - metric=[Metrics.llm_judge_multi_turn_llama_3_405b], + metric=[Metrics.llm_judge_multi_turn_llama_gpt3p5], generation_size=1024, stop_sequence=[], ) diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index da3b06c7..28597763 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -104,7 +104,6 @@ def get_task_class( return custom_tasks_registry[task_name] hlog_warn(f"{task_name} not found in provided tasks") hlog_warn(pformat(self.TASK_REGISTRY)) - raise ValueError( f"Cannot find tasks {task_name} in task list or in custom task registry ({custom_tasks_registry})" ) diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py index c529db59..3e032d1f 100644 --- a/src/lighteval/utils.py +++ b/src/lighteval/utils.py @@ -200,9 +200,16 @@ def is_tensorboardX_available() -> bool: ) +def is_openai_available() -> bool: + return importlib.util.find_spec("openai") is not None + + +NO_OPENAI_ERROR_MSG = "You are trying to use an Open AI LLM as a judge, for which you need 
`openai`, which is not available in your environment. Please install it using pip." + + def can_load_extended_tasks() -> bool: imports = [] - for package in ["langdetect"]: + for package in ["langdetect", "openai"]: imports.append(importlib.util.find_spec(package)) return all(cur_import is not None for cur_import in imports) From bb742c99942a5ba589d8eaac463563334fe2f7f0 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Thu, 1 Aug 2024 00:16:06 +0200 Subject: [PATCH 11/15] Update src/lighteval/metrics/llm_as_judge.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- src/lighteval/metrics/llm_as_judge.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py index ab18e80d..cc662b63 100644 --- a/src/lighteval/metrics/llm_as_judge.py +++ b/src/lighteval/metrics/llm_as_judge.py @@ -99,10 +99,7 @@ def __init__( if url is not None and api_key is None: raise ValueError("API key must be provided if using a custom URL. `export HF_TOKEN=your_token`") - if url is None and api_key is None: - self.use_transformers = True - else: - self.use_transformers = False + self.use_transformers = url is None and api_key is None self.url = url self.api_key = api_key From 2b8a078ce139f48b00c0591c0dbf86e5970b349f Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 31 Jul 2024 22:25:46 +0000 Subject: [PATCH 12/15] add nits --- README.md | 5 +++++ src/lighteval/metrics/llm_as_judge.py | 8 +++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a6dfbc48..f15f6b84 100644 --- a/README.md +++ b/README.md @@ -415,6 +415,11 @@ These metrics need the model to generate an output. They are therefore slower. - `maj_at_4_math` (Lighteval): Majority choice evaluation, using the math normalisation for the predictions and gold - `quasi_exact_match_gsm8k` (Harness): Fraction of instances where the normalized prediction matches the normalized gold (normalization done for gsm8k, where latex symbols, units, etc are removed) - `maj_at_8_gsm8k` (Lighteval): Majority choice evaluation, using the gsm8k normalisation for the predictions and gold +- LLM-as-Judge: + - `llm_judge_gpt3p5`: Can be used for any generative task, the model will be scored by a GPT3.5 model using the openai API + - `llm_judge_llama_3_405b`: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the openai API + - `llm_judge_multi_turn_gpt3p5`: Can be used for any generative task, the model will be scored by a GPT3.5 model using the openai API. It is used for multiturn tasks like mt-bench. + - `llm_judge_multi_turn_llama_3_405b`: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the openai API. It is used for multiturn tasks like mt-bench. ### Metrics for specific tasks To keep compatibility with the Harness for some specific tasks, we ported their evaluations more or less as such. They include `drop` (for the DROP dataset) and `truthfulqa_mc_metrics` (for TruthfulQA). In general, except for tasks where the dataset has very different formatting than usual (another language, programming language, math, ...), we want to use standard implementations of the above metrics. It makes little sense to have 10 different versions of an exact match depending on the task. 
However, most of the above metrics are parametrizable so that you can change the normalization applied easily for experimental purposes. diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py index cc662b63..b152057b 100644 --- a/src/lighteval/metrics/llm_as_judge.py +++ b/src/lighteval/metrics/llm_as_judge.py @@ -130,7 +130,7 @@ def evaluate_answer( self, questions: list[str], answers: list[str], references: list[str] ) -> tuple[list[int], list[list[dict[str, str]]], list[str | None | Any]]: """ - Evaluates an answer using Transformers. + Evaluates an answer using either Transformers or OpenAI API. Args: questions (list[str]): A list of questions (can be a list because of multi-turn conversations) @@ -158,7 +158,7 @@ def evaluate_answer( judgments = [] for prompt in prompts: if self.client is not None: - response = self.__call_openai_api(prompt) + response = self.__call_api(prompt) else: response = self.pipe(prompt)[0]["generated_text"] response = response[-1]["content"] @@ -246,13 +246,11 @@ def __process_judge_response(self, judgment: str) -> int: return rating - def __call_openai_api(self, prompt): + def __call_api(self, prompt): for _ in range(self.API_MAX_RETRY): try: response = self.client.chat.completions.create( model=self.model, - # seed=self.seed, - # temperature=self.temperature, messages=prompt, max_tokens=512, n=1, From 3109dcda83799a74ccfef98bbdca55acfbe0b0a4 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 31 Jul 2024 22:32:07 +0000 Subject: [PATCH 13/15] fix tests --- src/lighteval/metrics/llm_as_judge.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py index b152057b..ff3dff3b 100644 --- a/src/lighteval/metrics/llm_as_judge.py +++ b/src/lighteval/metrics/llm_as_judge.py @@ -96,9 +96,6 @@ def __init__( self.client = None self.pipe = None - if url is not None and api_key is None: - raise ValueError("API key must be provided if using a custom URL. `export HF_TOKEN=your_token`") - self.use_transformers = url is None and api_key is None self.url = url From 5df8df31eb569f313c52bb5a3224bd46358798d9 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 31 Jul 2024 23:42:16 +0000 Subject: [PATCH 14/15] fix tests --- src/lighteval/tasks/extended/mt_bench/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index e6b69518..03bff898 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -56,7 +56,7 @@ def mt_bench_prompt(line, task_name: str = None): evaluation_splits=["train"], few_shots_split="", few_shots_select="random", - metric=[Metrics.llm_judge_multi_turn_llama_gpt3p5], + metric=[Metrics.llm_judge_multi_turn_gpt3p5], generation_size=1024, stop_sequence=[], ) From 97b7b42822584556b934923e3161644ebc91c4e0 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Wed, 14 Aug 2024 09:49:41 +0000 Subject: [PATCH 15/15] fix tests --- src/lighteval/tasks/lighteval_task.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index c93a1f2d..07120b71 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -22,7 +22,6 @@ import collections import inspect -import os import random from dataclasses import asdict, dataclass from multiprocessing import Pool
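To summarise where the series lands, an illustrative sketch of the three judge back-ends the final JudgeLM supports; the templates path is an assumption, while the model names, the Hugging Face inference URL, and the environment variables are taken from the diffs above:

import os

from lighteval.metrics.llm_as_judge import JudgeLM

# 1. OpenAI judge: an OpenAI client authenticated with OPENAI_API_KEY.
gpt_judge = JudgeLM(
    model="gpt-3.5-turbo",
    templates_path="src/lighteval/metrics/judge_prompts.jsonl",  # assumed path
    multi_turn=True,
    api_key=os.getenv("OPENAI_API_KEY"),
)

# 2. Hosted open-weights judge: the same OpenAI client pointed at the
#    Hugging Face inference API, authenticated with HF_TOKEN.
llama_judge = JudgeLM(
    model="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
    templates_path="src/lighteval/metrics/judge_prompts.jsonl",
    url="https://api-inference.huggingface.co/v1/",
    api_key=os.getenv("HF_TOKEN"),
)

# 3. Local judge: with no url or api_key, use_transformers is True and a
#    text-generation pipeline is lazily loaded on the first evaluate_answer call.
local_judge = JudgeLM(
    model="HuggingFaceH4/zephyr-7b-alpha",  # assumed local judge model
    templates_path="src/lighteval/metrics/judge_prompts.jsonl",
)

Tasks select between these back-ends through the Metrics enum; for example, mt-bench uses metric=[Metrics.llm_judge_multi_turn_gpt3p5] after PATCH 14/15.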