Merge branch 'main' into metrics_as_fn
clefourrier committed Jul 9, 2024
2 parents 7b61431 + 4651531 commit f957a9b
Showing 16 changed files with 328 additions and 183 deletions.
13 changes: 12 additions & 1 deletion README.md
@@ -42,7 +42,7 @@ Install the dependencies. For the default installation, you just need:
pip install .
```

If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`):
If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`,`tensorboardX`):

```bash
pip install '.[optional1,optional2]'
@@ -239,6 +239,17 @@ python run_evals_accelerate.py \
--output_dir "./evals"
```

### Using the dummy model
To debug or obtain random baseline scores for a given set of tasks, you can use the `dummy` model:
```shell
python run_evals_accelerate.py \
--model_args "dummy"\
--tasks <task parameters> \
--output_dir output_dir
```
This "model" randomly generates logprobs (for selection/accuracy tasks) and the string "random baseline" for generation tasks.
You can also select a specific seed for the random logprob values generated by the dummy model: `--model_args "dummy,seed=123"`.

## Deep thanks
`lighteval` was originally built on top of the great [Eleuther AI Harness](https://github.com/EleutherAI/lm-evaluation-harness) (we use the latter to power the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)). We also took a lot of inspiration from the amazing [HELM](https://crfm.stanford.edu/helm/latest/), notably for metrics.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -55,7 +55,7 @@ keywords = ["evaluation", "nlp", "llm"]
dependencies = [
# Base dependencies
"transformers>=4.38.0",
"huggingface_hub>=0.22.0",
"huggingface_hub>=0.23.0",
"torch>=2.0",
"GitPython>=3.1.41", # for logging
"datasets>=2.14.0",
@@ -86,6 +86,7 @@ nanotron = [
"nanotron",
"tensorboardX"
]
tensorboardX = ["tensorboardX"]
quality = ["ruff==v0.2.2","pre-commit"]
tests = ["pytest==7.4.0"]
dev = ["lighteval[accelerate,quality,tests]"]
1 change: 1 addition & 0 deletions run_evals_accelerate.py
@@ -48,6 +48,7 @@ def get_parser():
parser.add_argument("--push_results_to_hub", default=False, action="store_true")
parser.add_argument("--save_details", action="store_true")
parser.add_argument("--push_details_to_hub", default=False, action="store_true")
parser.add_argument("--push_results_to_tensorboard", default=False, action="store_true")
parser.add_argument(
"--public_run", default=False, action="store_true", help="Push results and details to a public repo"
)
230 changes: 106 additions & 124 deletions src/lighteval/logging/evaluation_tracker.py

Large diffs are not rendered by default.

14 changes: 10 additions & 4 deletions src/lighteval/main_accelerate.py
@@ -56,7 +56,15 @@
@htrack()
def main(args):
env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
evaluation_tracker = EvaluationTracker(hub_results_org=args.results_org, token=TOKEN)
evaluation_tracker = EvaluationTracker(
output_dir=args.output_dir,
hub_results_org=args.results_org,
push_results_to_hub=args.push_results_to_hub,
push_details_to_hub=args.push_details_to_hub,
push_results_to_tensorboard=args.push_results_to_tensorboard,
public=args.public_run,
token=TOKEN,
)
evaluation_tracker.general_config_logger.log_args_info(
args.num_fewshot_seeds, args.override_batch_size, args.max_samples, args.job_id
)
@@ -124,9 +132,7 @@ def main(args):
evaluation_tracker.details_logger.aggregate()

if args.output_dir:
evaluation_tracker.save(
args.output_dir, args.push_results_to_hub, args.push_details_to_hub, args.public_run
)
evaluation_tracker.save()

final_dict = evaluation_tracker.generate_final_dict()

8 changes: 7 additions & 1 deletion src/lighteval/main_nanotron.py
@@ -96,7 +96,13 @@ def main(
data_parallel_size=lighteval_config.parallelism.dp,
)

evaluation_tracker = EvaluationTracker(token=TOKEN)
evaluation_tracker = EvaluationTracker(
token=TOKEN,
output_dir=lighteval_config.logging.local_output_path,
hub_results_org=lighteval_config.logging.hub_repo_tensorboard,
tensorboard_metric_prefix=lighteval_config.logging.tensorboard_metric_prefix,
nanotron_run_info=nanotron_config.general,
)
evaluation_tracker.general_config_logger.log_args_info(
num_fewshot_seeds=1,
override_batch_size=None,
6 changes: 3 additions & 3 deletions src/lighteval/metrics/metrics.py
@@ -302,15 +302,15 @@ class Metrics(Enum):
sample_level_fn=LoglikelihoodPreparator().prepare,
category=MetricCategory.MULTICHOICE,
use_case=MetricUseCase.ACCURACY,
corpus_level_fn=CorpusLevelF1Score(None),
corpus_level_fn=CorpusLevelF1Score(None).compute,
higher_is_better=True,
)
loglikelihood_f1_single_token = CorpusLevelMetric(
metric_name="loglikelihood_f1",
sample_level_fn=LoglikelihoodPreparator(is_single_token=True).prepare,
category=MetricCategory.MULTICHOICE_ONE_TOKEN,
use_case=MetricUseCase.ACCURACY,
corpus_level_fn=CorpusLevelF1Score(None),
corpus_level_fn=CorpusLevelF1Score(None).compute,
higher_is_better=True,
)
mcc = CorpusLevelMetric(
@@ -386,7 +386,7 @@ class Metrics(Enum):
sample_level_fn=LoglikelihoodPreparator(is_single_token=True).prepare,
category=MetricCategory.MULTICHOICE_ONE_TOKEN,
use_case=MetricUseCase.ACCURACY,
corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3),
corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3).compute,
higher_is_better=True,
)
perfect_exact_match = SampleLevelMetric(
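The three `CorpusLevelF1Score` changes above swap the metric instance for its bound `.compute` method, so the corpus-level aggregation receives a plain callable (in keeping with the `metrics_as_fn` branch name). A minimal stand-in sketch of that pattern, not the real lighteval classes:

```python
# Stand-in illustration only: the real CorpusLevelF1Score lives in lighteval.metrics.
class CorpusLevelF1Score:
    def __init__(self, average, num_classes=2):
        self.average = average
        self.num_classes = num_classes

    def compute(self, items):
        # The real method aggregates golds/predictions into an F1 score;
        # here we just count items to keep the sketch self-contained.
        return len(items)

corpus_level_fn = CorpusLevelF1Score(None).compute  # bound method, i.e. a plain callable
print(callable(corpus_level_fn))   # True
print(corpus_level_fn([1, 2, 3]))  # 3
```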
37 changes: 17 additions & 20 deletions src/lighteval/models/base_model.py
@@ -530,27 +530,7 @@ def greedy_until(
returns_logits = batch[0].use_logits
num_samples = batch[0].num_samples

# The main question for this step is the following:
# Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk
# of loosing some meaning, or have some generations that are exceedingly short?
# The choice we go for here is to avoid truncating the prompt if we can, since it
# should have been managed by the prompt creator/few shot manager if requested by the user.
context = [c.context for c in batch]
smallest_context = min(len(c) for c in context)
biggest_context = max(len(c) for c in context)
if smallest_context > self.max_length:
hlog_warn(
f"The smallest context of your batch ({smallest_context}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in"
+ str({i.task_name for i in batch})
+ ". This is likely to lead to some errors." # noqa C401
)

if (
biggest_context > self.max_length
): # There will be truncation of at least one sample, maximum generation size will be one
max_new_tokens = 1
else: # We can't allow generation of more than max_length
max_new_tokens = min(self.max_length - biggest_context, max_new_tokens)

# See doc https://huggingface.co/docs/transformers/v4.38.2/en/pad_truncation#padding-and-truncation
# Will do left truncation and padding, as defined when creating the tokenizer
@@ -563,6 +543,23 @@
add_special_tokens=self.add_special_tokens,
).to(self.device)

# The main question for this step is the following:
# Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk
# of losing some meaning, or have some generations that are exceedingly short?
# The choice we go for here is to avoid truncating the prompt if we can, since it
# should have been managed by the prompt creator/few shot manager if requested by the user.
context_size = tokenized["input_ids"].shape[1]
if context_size > self.max_length:
hlog_warn(
f"The context size of your batch ({context_size}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in"
+ str({i.task_name for i in batch})
+ ". This is likely to lead to some errors." # noqa C401
)
# There will be truncation of at least one sample, maximum generation size will be one
max_new_tokens = 1
else: # We can't allow generation of more than max_length
max_new_tokens = min(self.max_length - context_size, max_new_tokens)

prepared_batch = Batch(
input_ids=tokenized["input_ids"],
input_lengths=[len(item == 1) for item in tokenized["attention_mask"]],
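The rewritten block above clamps `max_new_tokens` after tokenization rather than from raw string lengths. A standalone sketch of the clamping rule, with an illustrative 2048-token limit and made-up batch sizes:

```python
# Illustrative only: MAX_LENGTH and the example sizes are assumptions, not model values.
MAX_LENGTH = 2048

def clamp_max_new_tokens(context_size: int, max_new_tokens: int) -> int:
    if context_size > MAX_LENGTH:
        # At least one sample will be truncated, so only a single new token is generated.
        return 1
    # Otherwise, never generate past the model's maximum context length.
    return min(MAX_LENGTH - context_size, max_new_tokens)

print(clamp_max_new_tokens(context_size=1900, max_new_tokens=256))  # 148
print(clamp_max_new_tokens(context_size=2100, max_new_tokens=256))  # 1
```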
89 changes: 89 additions & 0 deletions src/lighteval/models/dummy_model.py
@@ -0,0 +1,89 @@
# MIT License
#
# Copyright (c) 2024 The HuggingFace Team
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# inspired by https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/dummy.py

import random
from typing import Optional

from transformers import AutoTokenizer

from lighteval.models.abstract_model import LightevalModel
from lighteval.models.model_config import DummyModelConfig, EnvConfig
from lighteval.models.model_output import GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn
from lighteval.tasks.requests import (
GreedyUntilRequest,
LoglikelihoodRequest,
LoglikelihoodRollingRequest,
LoglikelihoodSingleTokenRequest,
)


class DummyModel(LightevalModel):
"""Dummy model to generate random baselines."""

def __init__(
self,
config: DummyModelConfig,
env_config: EnvConfig,
):
self.config = config
self.env_config = env_config
self._random = random.Random(self.config.seed)
self._tokenizer = None

@property
def tokenizer(self):
if not self._tokenizer:
self._tokenizer = AutoTokenizer.from_pretrained("gpt2")
return self._tokenizer

@property
def add_special_tokens(self):
return False

@property
def max_length(self) -> int:
return 2048

def greedy_until(
self, requests: list[GreedyUntilRequest], override_bs: Optional[int] = None
) -> list[GenerateReturn]:
return [GenerateReturn(result="random baseline") for _ in range(len(requests))]

def loglikelihood(
self, requests: list[LoglikelihoodRequest], override_bs: Optional[int] = None
) -> list[LoglikelihoodReturn]:
return [LoglikelihoodReturn((-self._random.random(), False)) for _ in requests]

def loglikelihood_rolling(
self, requests: list[LoglikelihoodRollingRequest], override_bs: Optional[int] = None
) -> list[LoglikelihoodReturn]:
return [LoglikelihoodReturn((-self._random.random(), False)) for _ in requests]

def loglikelihood_single_token(
self, requests: list[LoglikelihoodSingleTokenRequest], override_bs: Optional[int] = None
) -> list[LoglikelihoodSingleTokenReturn]:
return [
LoglikelihoodSingleTokenReturn(result=[-self._random.random() for _ in req.tokenized_continuation])
for req in requests
]
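The baseline scores come from a seeded `random.Random`, which is what makes a run with `--model_args "dummy,seed=123"` reproducible; a minimal stdlib sketch of that property:

```python
import random

seed = 123
run_a, run_b = random.Random(seed), random.Random(seed)
baseline_a = [-run_a.random() for _ in range(3)]  # mimics the dummy logprob values
baseline_b = [-run_b.random() for _ in range(3)]
print(baseline_a == baseline_b)  # True: same seed, same random baseline
```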
24 changes: 21 additions & 3 deletions src/lighteval/models/model_config.py
@@ -203,6 +203,11 @@ class TGIModelConfig:
model_id: str


@dataclass
class DummyModelConfig:
seed: int = 42


@dataclass
class InferenceModelConfig:
model: str
@@ -253,7 +258,16 @@ def nullable_keys() -> list[str]:
return ["namespace", "env_vars", "image_url"]


def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]) -> BaseModelConfig: # noqa: C901
def create_model_config( # noqa: C901
args: Namespace, accelerator: Union["Accelerator", None]
) -> Union[
BaseModelConfig,
AdapterModelConfig,
DeltaModelConfig,
TGIModelConfig,
InferenceEndpointModelConfig,
DummyModelConfig,
]:
"""
Create a model configuration based on the provided arguments.
@@ -262,7 +276,7 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]
accelerator (Union[Accelerator, None]): accelerator to use for model training.
Returns:
BaseModelConfig: model configuration.
Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig, DummyModelConfig]: model configuration.
Raises:
ValueError: If both an inference server address and model arguments are provided.
@@ -271,7 +285,11 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]
ValueError: If a base model is specified when not using delta weights or adapter weights.
"""
if args.model_args:
args_dict = {k.split("=")[0]: k.split("=")[1] for k in args.model_args.split(",")}
args_dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in args.model_args.split(",")}

if args_dict.pop("dummy", False):
return DummyModelConfig(**args_dict)

args_dict["accelerator"] = accelerator
args_dict["use_chat_template"] = args.use_chat_template

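The new parsing line treats bare flags in `--model_args` as booleans and keeps `key=value` pairs as strings; a simplified rework of that comprehension with a stand-in `DummyModelConfig` (note that values such as `seed` are not cast to `int` by the comprehension itself):

```python
from dataclasses import dataclass

@dataclass
class DummyModelConfig:  # stand-in mirroring the dataclass added above
    seed: int = 42

def parse_model_args(model_args: str):
    # "key=value" pairs keep their string value; bare flags such as "dummy" become True.
    args_dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")}
    if args_dict.pop("dummy", False):
        return DummyModelConfig(**args_dict)
    return args_dict

print(parse_model_args("dummy"))           # DummyModelConfig(seed=42)
print(parse_model_args("dummy,seed=123"))  # DummyModelConfig(seed='123') -- string, not int
```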
20 changes: 18 additions & 2 deletions src/lighteval/models/model_loader.py
@@ -27,11 +27,13 @@
from lighteval.models.adapter_model import AdapterModel
from lighteval.models.base_model import BaseModel
from lighteval.models.delta_model import DeltaModel
from lighteval.models.dummy_model import DummyModel
from lighteval.models.endpoint_model import InferenceEndpointModel
from lighteval.models.model_config import (
AdapterModelConfig,
BaseModelConfig,
DeltaModelConfig,
DummyModelConfig,
EnvConfig,
InferenceEndpointModelConfig,
InferenceModelConfig,
@@ -54,9 +56,16 @@ class ModelInfo:


def load_model( # noqa: C901
config: Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig],
config: Union[
BaseModelConfig,
AdapterModelConfig,
DeltaModelConfig,
TGIModelConfig,
InferenceEndpointModelConfig,
DummyModelConfig,
],
env_config: EnvConfig,
) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel, ModelClient], ModelInfo]:
) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel, ModelClient, DummyModel], ModelInfo]:
"""Will load either a model from an inference server or a model from a checkpoint, depending
on the config type.
Expand All @@ -82,6 +91,9 @@ def load_model( # noqa: C901
if isinstance(config, BaseModelConfig):
return load_model_with_accelerate_or_default(config=config, env_config=env_config)

if isinstance(config, DummyModelConfig):
return load_dummy_model(config=config, env_config=env_config)


def load_model_with_tgi(config: TGIModelConfig):
if not is_tgi_available():
@@ -143,3 +155,7 @@ def load_model_with_accelerate_or_default(
hlog(f"Model info: {model_info}")

return model, model_info


def load_dummy_model(config: DummyModelConfig, env_config: EnvConfig):
return DummyModel(config=config, env_config=env_config), ModelInfo(model_name="dummy", model_sha=str(config.seed))
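Putting the loader and config changes together, a hypothetical end-to-end use of the dummy path (the `EnvConfig` arguments are placeholder values, not defaults required by lighteval):

```python
from lighteval.models.model_config import DummyModelConfig, EnvConfig
from lighteval.models.model_loader import load_model

config = DummyModelConfig(seed=123)
env_config = EnvConfig(token=None, cache_dir="/tmp/lighteval_cache")  # placeholder values
model, model_info = load_model(config=config, env_config=env_config)
print(model_info.model_name, model_info.model_sha)  # "dummy", "123"
```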
4 changes: 2 additions & 2 deletions src/lighteval/models/model_output.py
@@ -31,8 +31,8 @@ class ModelReturn:
result: Union[tuple, list, str]
input_tokens: list[int] = field(default_factory=list) # model inputs
generated_tokens: list[int] = field(default_factory=list) # model generations
truncated_tokens_count: Optional[int] = None # How many tokens truncated
padded_tokens_count: Optional[int] = None # How many tokens of padding
truncated_tokens_count: Optional[int] = 0 # How many tokens truncated
padded_tokens_count: Optional[int] = 0 # How many tokens of padding

def get_result_for_eval(self):
raise NotImplementedError()