diff --git a/README.md b/README.md
index b90fc9764..dc4827735 100644
--- a/README.md
+++ b/README.md
@@ -239,6 +239,17 @@ python run_evals_accelerate.py \
     --output_dir "./evals"
 ```
 
+### Using the dummy model
+To debug or obtain random baseline scores for a given set of tasks, you can use the `dummy` model:
+```shell
+python run_evals_accelerate.py \
+    --model_args "dummy" \
+    --tasks <task parameters> \
+    --output_dir output_dir
+```
+This "model" randomly generates logprobs (for selection/accuracy tasks) and the string "random baseline" for generation tasks.
+You can also select a specific seed for the random logprob values generated by the dummy model: `--model_args "dummy,seed=123"`.
+
 ## Deep thanks
 `lighteval` was originally built on top of the great [Eleuther AI Harness](https://github.com/EleutherAI/lm-evaluation-harness) (we use the latter to power the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)). We also took a lot of inspiration from the amazing [HELM](https://crfm.stanford.edu/helm/latest/), notably for metrics.
 
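The seed option mentioned in the README addition above is what makes the random baseline reproducible: the dummy model draws every logprob from a `random.Random(seed)` generator. Below is a minimal standard-library sketch of that behaviour; the `fake_logprobs` helper is purely illustrative and not part of lighteval.

```python
import random


def fake_logprobs(seed: int, n: int) -> list[float]:
    """Draw n negative logprobs the way a seeded dummy model would."""
    rng = random.Random(seed)
    return [-rng.random() for _ in range(n)]


# Same seed, same baseline scores across runs; a different seed gives different values.
assert fake_logprobs(123, 5) == fake_logprobs(123, 5)
assert fake_logprobs(123, 5) != fake_logprobs(42, 5)
```
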
diff --git a/src/lighteval/models/dummy_model.py b/src/lighteval/models/dummy_model.py
new file mode 100644
index 000000000..08335db5f
--- /dev/null
+++ b/src/lighteval/models/dummy_model.py
@@ -0,0 +1,89 @@
+# MIT License
+#
+# Copyright (c) 2024 The HuggingFace Team
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# inspired by https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/dummy.py
+
+import random
+from typing import Optional
+
+from transformers import AutoTokenizer
+
+from lighteval.models.abstract_model import LightevalModel
+from lighteval.models.model_config import DummyModelConfig, EnvConfig
+from lighteval.models.model_output import GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn
+from lighteval.tasks.requests import (
+    GreedyUntilRequest,
+    LoglikelihoodRequest,
+    LoglikelihoodRollingRequest,
+    LoglikelihoodSingleTokenRequest,
+)
+
+
+class DummyModel(LightevalModel):
+    """Dummy model to generate random baselines."""
+
+    def __init__(
+        self,
+        config: DummyModelConfig,
+        env_config: EnvConfig,
+    ):
+        self.config = config
+        self.env_config = env_config
+        self._random = random.Random(self.config.seed)
+        self._tokenizer = None
+
+    @property
+    def tokenizer(self):
+        if not self._tokenizer:
+            self._tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        return self._tokenizer
+
+    @property
+    def add_special_tokens(self):
+        return False
+
+    @property
+    def max_length(self) -> int:
+        return 2048
+
+    def greedy_until(
+        self, requests: list[GreedyUntilRequest], override_bs: Optional[int] = None
+    ) -> list[GenerateReturn]:
+        return [GenerateReturn(result="random baseline") for _ in range(len(requests))]
+
+    def loglikelihood(
+        self, requests: list[LoglikelihoodRequest], override_bs: Optional[int] = None
+    ) -> list[LoglikelihoodReturn]:
+        return [LoglikelihoodReturn((-self._random.random(), False)) for _ in requests]
+
+    def loglikelihood_rolling(
+        self, requests: list[LoglikelihoodRollingRequest], override_bs: Optional[int] = None
+    ) -> list[LoglikelihoodReturn]:
+        return [LoglikelihoodReturn((-self._random.random(), False)) for _ in requests]
+
+    def loglikelihood_single_token(
+        self, requests: list[LoglikelihoodSingleTokenRequest], override_bs: Optional[int] = None
+    ) -> list[LoglikelihoodSingleTokenReturn]:
+        return [
+            LoglikelihoodSingleTokenReturn(result=[-self._random.random() for _ in req.tokenized_continuation])
+            for req in requests
+        ]
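As a quick smoke test of the class added above, the sketch below instantiates it directly and checks its fixed properties and the shape of its outputs. It assumes `EnvConfig` can be constructed without arguments, which may differ between lighteval versions; the `gpt2` tokenizer is only loaded lazily, so this sketch never fetches it.

```python
from lighteval.models.dummy_model import DummyModel
from lighteval.models.model_config import DummyModelConfig, EnvConfig

model = DummyModel(config=DummyModelConfig(seed=123), env_config=EnvConfig())

assert model.max_length == 2048
assert model.add_special_tokens is False

# DummyModel never inspects the request objects, so placeholders are enough to
# see the output shape: one random negative logprob per request.
scores = model.loglikelihood([None, None, None])
assert len(scores) == 3
assert all(s.result[0] <= 0 for s in scores)
```
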
diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py
index b686c9bd7..b6f4bb5d9 100644
--- a/src/lighteval/models/model_config.py
+++ b/src/lighteval/models/model_config.py
@@ -203,6 +203,11 @@ class TGIModelConfig:
     model_id: str
 
 
+@dataclass
+class DummyModelConfig:
+    seed: int = 42
+
+
 @dataclass
 class InferenceModelConfig:
     model: str
@@ -253,7 +258,16 @@ def nullable_keys() -> list[str]:
         return ["namespace", "env_vars", "image_url"]
 
 
-def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]) -> BaseModelConfig:  # noqa: C901
+def create_model_config(  # noqa: C901
+    args: Namespace, accelerator: Union["Accelerator", None]
+) -> Union[
+    BaseModelConfig,
+    AdapterModelConfig,
+    DeltaModelConfig,
+    TGIModelConfig,
+    InferenceEndpointModelConfig,
+    DummyModelConfig,
+]:
     """
     Create a model configuration based on the provided arguments.
 
@@ -262,7 +276,7 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]
         accelerator (Union[Accelerator, None]): accelerator to use for model training.
 
     Returns:
-        BaseModelConfig: model configuration.
+        Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig, DummyModelConfig]: model configuration.
 
     Raises:
         ValueError: If both an inference server address and model arguments are provided.
@@ -271,7 +285,11 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]
         ValueError: If a base model is specified when not using delta weights or adapter weights.
     """
     if args.model_args:
-        args_dict = {k.split("=")[0]: k.split("=")[1] for k in args.model_args.split(",")}
+        args_dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in args.model_args.split(",")}
+
+        if args_dict.pop("dummy", False):
+            return DummyModelConfig(**args_dict)
+
         args_dict["accelerator"] = accelerator
         args_dict["use_chat_template"] = args.use_chat_template
 
diff --git a/src/lighteval/models/model_loader.py b/src/lighteval/models/model_loader.py
index e662beac0..c72d64038 100644
--- a/src/lighteval/models/model_loader.py
+++ b/src/lighteval/models/model_loader.py
@@ -27,11 +27,13 @@
 from lighteval.models.adapter_model import AdapterModel
 from lighteval.models.base_model import BaseModel
 from lighteval.models.delta_model import DeltaModel
+from lighteval.models.dummy_model import DummyModel
 from lighteval.models.endpoint_model import InferenceEndpointModel
 from lighteval.models.model_config import (
     AdapterModelConfig,
     BaseModelConfig,
     DeltaModelConfig,
+    DummyModelConfig,
     EnvConfig,
     InferenceEndpointModelConfig,
     InferenceModelConfig,
@@ -54,9 +56,16 @@ class ModelInfo:
 
 
 def load_model(  # noqa: C901
-    config: Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig],
+    config: Union[
+        BaseModelConfig,
+        AdapterModelConfig,
+        DeltaModelConfig,
+        TGIModelConfig,
+        InferenceEndpointModelConfig,
+        DummyModelConfig,
+    ],
     env_config: EnvConfig,
-) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel, ModelClient], ModelInfo]:
+) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel, ModelClient, DummyModel], ModelInfo]:
     """Will load either a model from an inference server or a model from a checkpoint, depending
     on the config type.
 
@@ -82,6 +91,9 @@ def load_model(  # noqa: C901
     if isinstance(config, BaseModelConfig):
         return load_model_with_accelerate_or_default(config=config, env_config=env_config)
 
+    if isinstance(config, DummyModelConfig):
+        return load_dummy_model(config=config, env_config=env_config)
+
 
 def load_model_with_tgi(config: TGIModelConfig):
     if not is_tgi_available():
@@ -143,3 +155,7 @@ def load_model_with_accelerate_or_default(
     hlog(f"Model info: {model_info}")
 
     return model, model_info
+
+
+def load_dummy_model(config: DummyModelConfig, env_config: EnvConfig):
+    return DummyModel(config=config, env_config=env_config), ModelInfo(model_name="dummy", model_sha=str(config.seed))
diff --git a/src/lighteval/models/model_output.py b/src/lighteval/models/model_output.py
index 510278585..ce85c0201 100644
--- a/src/lighteval/models/model_output.py
+++ b/src/lighteval/models/model_output.py
@@ -31,8 +31,8 @@ class ModelReturn:
     result: Union[tuple, list, str]
     input_tokens: list[int] = field(default_factory=list)  # model inputs
     generated_tokens: list[int] = field(default_factory=list)  # model generations
-    truncated_tokens_count: Optional[int] = None  # How many tokens truncated
-    padded_tokens_count: Optional[int] = None  # How many tokens of padding
+    truncated_tokens_count: Optional[int] = 0  # How many tokens truncated
+    padded_tokens_count: Optional[int] = 0  # How many tokens of padding
 
     def get_result_for_eval(self):
         raise NotImplementedError()
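To tie the pieces together, here is a standalone re-creation of the parsing that the updated `create_model_config` performs on `--model_args`. The dataclass is re-declared locally so the snippet runs without lighteval installed; note that `seed=123` arrives as the string `'123'`, which `random.Random` accepts as a seed.

```python
from dataclasses import dataclass


@dataclass
class DummyModelConfig:
    seed: int = 42


def parse_model_args(model_args: str):
    # Mirrors the comprehension added above: bare flags (like "dummy") become True,
    # while "key=value" pairs keep their string values.
    args_dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")}
    if args_dict.pop("dummy", False):
        return DummyModelConfig(**args_dict)
    return args_dict  # in lighteval, this dict would go on to build the other config types


print(parse_model_args("dummy"))           # DummyModelConfig(seed=42)
print(parse_model_args("dummy,seed=123"))  # DummyModelConfig(seed='123')
```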