Merge branch 'main' into metrics_as_fn
clefourrier committed Jul 9, 2024
2 parents 7b61431 + 4651531 commit f957a9b
Showing 16 changed files with 328 additions and 183 deletions.
13 changes: 12 additions & 1 deletion README.md
@@ -42,7 +42,7 @@ Install the dependencies. For the default installation, you just need:
pip install .
```

If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`):
If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`,`tensorboardX`):

```bash
pip install '.[optional1,optional2]'
@@ -239,6 +239,17 @@ python run_evals_accelerate.py \
--output_dir "./evals"
```

### Using the dummy model
To debug or obtain random baseline scores for a given set of tasks, you can use the `dummy` model:
```shell
python run_evals_accelerate.py \
--model_args "dummy"\
--tasks <task parameters> \
--output_dir output_dir
```
This "model" randomly generates logprobs (for selection/accuracy tasks) and the string "random baseline" for generation tasks.
You can also select a specific seed for the random logprob values generated by the dummy model: `--model_args "dummy,seed=123"`.

## Deep thanks
`lighteval` was originally built on top of the great [Eleuther AI Harness](https://github.com/EleutherAI/lm-evaluation-harness) (we use the latter to power the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)). We also took a lot of inspiration from the amazing [HELM](https://crfm.stanford.edu/helm/latest/), notably for metrics.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -55,7 +55,7 @@ keywords = ["evaluation", "nlp", "llm"]
dependencies = [
# Base dependencies
"transformers>=4.38.0",
"huggingface_hub>=0.22.0",
"huggingface_hub>=0.23.0",
"torch>=2.0",
"GitPython>=3.1.41", # for logging
"datasets>=2.14.0",
@@ -86,6 +86,7 @@ nanotron = [
"nanotron",
"tensorboardX"
]
tensorboardX = ["tensorboardX"]
quality = ["ruff==v0.2.2","pre-commit"]
tests = ["pytest==7.4.0"]
dev = ["lighteval[accelerate,quality,tests]"]
1 change: 1 addition & 0 deletions run_evals_accelerate.py
@@ -48,6 +48,7 @@ def get_parser():
parser.add_argument("--push_results_to_hub", default=False, action="store_true")
parser.add_argument("--save_details", action="store_true")
parser.add_argument("--push_details_to_hub", default=False, action="store_true")
parser.add_argument("--push_results_to_tensorboard", default=False, action="store_true")
parser.add_argument(
"--public_run", default=False, action="store_true", help="Push results and details to a public repo"
)
230 changes: 106 additions & 124 deletions src/lighteval/logging/evaluation_tracker.py

Large diffs are not rendered by default.

14 changes: 10 additions & 4 deletions src/lighteval/main_accelerate.py
@@ -56,7 +56,15 @@
@htrack()
def main(args):
env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
evaluation_tracker = EvaluationTracker(hub_results_org=args.results_org, token=TOKEN)
evaluation_tracker = EvaluationTracker(
output_dir=args.output_dir,
hub_results_org=args.results_org,
push_results_to_hub=args.push_results_to_hub,
push_details_to_hub=args.push_details_to_hub,
push_results_to_tensorboard=args.push_results_to_tensorboard,
public=args.public_run,
token=TOKEN,
)
evaluation_tracker.general_config_logger.log_args_info(
args.num_fewshot_seeds, args.override_batch_size, args.max_samples, args.job_id
)
@@ -124,9 +132,7 @@ def main(args):
evaluation_tracker.details_logger.aggregate()

if args.output_dir:
evaluation_tracker.save(
args.output_dir, args.push_results_to_hub, args.push_details_to_hub, args.public_run
)
evaluation_tracker.save()

final_dict = evaluation_tracker.generate_final_dict()

8 changes: 7 additions & 1 deletion src/lighteval/main_nanotron.py
@@ -96,7 +96,13 @@ def main(
data_parallel_size=lighteval_config.parallelism.dp,
)

evaluation_tracker = EvaluationTracker(token=TOKEN)
evaluation_tracker = EvaluationTracker(
token=TOKEN,
output_dir=lighteval_config.logging.local_output_path,
hub_results_org=lighteval_config.logging.hub_repo_tensorboard,
tensorboard_metric_prefix=lighteval_config.logging.tensorboard_metric_prefix,
nanotron_run_info=nanotron_config.general,
)
evaluation_tracker.general_config_logger.log_args_info(
num_fewshot_seeds=1,
override_batch_size=None,
6 changes: 3 additions & 3 deletions src/lighteval/metrics/metrics.py
@@ -302,15 +302,15 @@ class Metrics(Enum):
sample_level_fn=LoglikelihoodPreparator().prepare,
category=MetricCategory.MULTICHOICE,
use_case=MetricUseCase.ACCURACY,
corpus_level_fn=CorpusLevelF1Score(None),
corpus_level_fn=CorpusLevelF1Score(None).compute,
higher_is_better=True,
)
loglikelihood_f1_single_token = CorpusLevelMetric(
metric_name="loglikelihood_f1",
sample_level_fn=LoglikelihoodPreparator(is_single_token=True).prepare,
category=MetricCategory.MULTICHOICE_ONE_TOKEN,
use_case=MetricUseCase.ACCURACY,
corpus_level_fn=CorpusLevelF1Score(None),
corpus_level_fn=CorpusLevelF1Score(None).compute,
higher_is_better=True,
)
mcc = CorpusLevelMetric(
@@ -386,7 +386,7 @@ class Metrics(Enum):
sample_level_fn=LoglikelihoodPreparator(is_single_token=True).prepare,
category=MetricCategory.MULTICHOICE_ONE_TOKEN,
use_case=MetricUseCase.ACCURACY,
corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3),
corpus_level_fn=CorpusLevelF1Score(average=None, num_classes=3).compute,
higher_is_better=True,
)
perfect_exact_match = SampleLevelMetric(
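The three `CorpusLevelF1Score` changes above swap the metric instance for its bound `.compute` method, so the corpus-level aggregation receives a plain callable (in keeping with the `metrics_as_fn` branch name). A minimal stand-in sketch of that pattern, not the real lighteval classes:

```python
# Stand-in illustration only: the real CorpusLevelF1Score lives in lighteval.metrics.
class CorpusLevelF1Score:
    def __init__(self, average, num_classes=2):
        self.average = average
        self.num_classes = num_classes

    def compute(self, items):
        # The real method aggregates golds/predictions into an F1 score;
        # here we just count items to keep the sketch self-contained.
        return len(items)

corpus_level_fn = CorpusLevelF1Score(None).compute  # bound method, i.e. a plain callable
print(callable(corpus_level_fn))   # True
print(corpus_level_fn([1, 2, 3]))  # 3
```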
37 changes: 17 additions & 20 deletions src/lighteval/models/base_model.py
@@ -530,27 +530,7 @@ def greedy_until(
returns_logits = batch[0].use_logits
num_samples = batch[0].num_samples

# The main question for this step is the following:
# Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk
# of loosing some meaning, or have some generations that are exceedingly short?
# The choice we go for here is to avoid truncating the prompt if we can, since it
# should have been managed by the prompt creator/few shot manager if requested by the user.
context = [c.context for c in batch]
smallest_context = min(len(c) for c in context)
biggest_context = max(len(c) for c in context)
if smallest_context > self.max_length:
hlog_warn(
f"The smallest context of your batch ({smallest_context}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in"
+ str({i.task_name for i in batch})
+ ". This is likely to lead to some errors." # noqa C401
)

if (
biggest_context > self.max_length
): # There will be truncation of at least one sample, maximum generation size will be one
max_new_tokens = 1
else: # We can't allow generation of more than max_length
max_new_tokens = min(self.max_length - biggest_context, max_new_tokens)

# See doc https://huggingface.co/docs/transformers/v4.38.2/en/pad_truncation#padding-and-truncation
# Will do left truncation and padding, as defined when creating the tokenizer
@@ -563,6 +543,23 @@
add_special_tokens=self.add_special_tokens,
).to(self.device)

# The main question for this step is the following:
# Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk
# of losing some meaning, or have some generations that are exceedingly short?
# The choice we go for here is to avoid truncating the prompt if we can, since it
# should have been managed by the prompt creator/few shot manager if requested by the user.
context_size = tokenized["input_ids"].shape[1]
if context_size > self.max_length:
hlog_warn(
f"The context size of your batch ({context_size}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in"
+ str({i.task_name for i in batch})
+ ". This is likely to lead to some errors." # noqa C401
)
# There will be truncation of at least one sample, maximum generation size will be one
max_new_tokens = 1
else: # We can't allow generation of more than max_length
max_new_tokens = min(self.max_length - context_size, max_new_tokens)

prepared_batch = Batch(
input_ids=tokenized["input_ids"],
input_lengths=[len(item == 1) for item in tokenized["attention_mask"]],
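The rewritten block above clamps `max_new_tokens` after tokenization rather than from raw string lengths. A standalone sketch of the clamping rule, with an illustrative 2048-token limit and made-up batch sizes:

```python
# Illustrative only: MAX_LENGTH and the example sizes are assumptions, not model values.
MAX_LENGTH = 2048

def clamp_max_new_tokens(context_size: int, max_new_tokens: int) -> int:
    if context_size > MAX_LENGTH:
        # At least one sample will be truncated, so only a single new token is generated.
        return 1
    # Otherwise, never generate past the model's maximum context length.
    return min(MAX_LENGTH - context_size, max_new_tokens)

print(clamp_max_new_tokens(context_size=1900, max_new_tokens=256))  # 148
print(clamp_max_new_tokens(context_size=2100, max_new_tokens=256))  # 1
```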
89 changes: 89 additions & 0 deletions src/lighteval/models/dummy_model.py
@@ -0,0 +1,89 @@
# MIT License
#
# Copyright (c) 2024 The HuggingFace Team
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# inspired by https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/dummy.py

import random
from typing import Optional

from transformers import AutoTokenizer

from lighteval.models.abstract_model import LightevalModel
from lighteval.models.model_config import DummyModelConfig, EnvConfig
from lighteval.models.model_output import GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn
from lighteval.tasks.requests import (
GreedyUntilRequest,
LoglikelihoodRequest,
LoglikelihoodRollingRequest,
LoglikelihoodSingleTokenRequest,
)


class DummyModel(LightevalModel):
"""Dummy model to generate random baselines."""

def __init__(
self,
config: DummyModelConfig,
env_config: EnvConfig,
):
self.config = config
self.env_config = env_config
self._random = random.Random(self.config.seed)
self._tokenizer = None

@property
def tokenizer(self):
if not self._tokenizer:
self._tokenizer = AutoTokenizer.from_pretrained("gpt2")
return self._tokenizer

@property
def add_special_tokens(self):
return False

@property
def max_length(self) -> int:
return 2048

def greedy_until(
self, requests: list[GreedyUntilRequest], override_bs: Optional[int] = None
) -> list[GenerateReturn]:
return [GenerateReturn(result="random baseline") for _ in range(len(requests))]

def loglikelihood(
self, requests: list[LoglikelihoodRequest], override_bs: Optional[int] = None
) -> list[LoglikelihoodReturn]:
return [LoglikelihoodReturn((-self._random.random(), False)) for _ in requests]

def loglikelihood_rolling(
self, requests: list[LoglikelihoodRollingRequest], override_bs: Optional[int] = None
) -> list[LoglikelihoodReturn]:
return [LoglikelihoodReturn((-self._random.random(), False)) for _ in requests]

def loglikelihood_single_token(
self, requests: list[LoglikelihoodSingleTokenRequest], override_bs: Optional[int] = None
) -> list[LoglikelihoodSingleTokenReturn]:
return [
LoglikelihoodSingleTokenReturn(result=[-self._random.random() for _ in req.tokenized_continuation])
for req in requests
]
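The baseline scores come from a seeded `random.Random`, which is what makes a run with `--model_args "dummy,seed=123"` reproducible; a minimal stdlib sketch of that property:

```python
import random

seed = 123
run_a, run_b = random.Random(seed), random.Random(seed)
baseline_a = [-run_a.random() for _ in range(3)]  # mimics the dummy logprob values
baseline_b = [-run_b.random() for _ in range(3)]
print(baseline_a == baseline_b)  # True: same seed, same random baseline
```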
24 changes: 21 additions & 3 deletions src/lighteval/models/model_config.py
@@ -203,6 +203,11 @@ class TGIModelConfig:
model_id: str


@dataclass
class DummyModelConfig:
seed: int = 42


@dataclass
class InferenceModelConfig:
model: str
@@ -253,7 +258,16 @@ def nullable_keys() -> list[str]:
return ["namespace", "env_vars", "image_url"]


def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]) -> BaseModelConfig: # noqa: C901
def create_model_config( # noqa: C901
args: Namespace, accelerator: Union["Accelerator", None]
) -> Union[
BaseModelConfig,
AdapterModelConfig,
DeltaModelConfig,
TGIModelConfig,
InferenceEndpointModelConfig,
DummyModelConfig,
]:
"""
Create a model configuration based on the provided arguments.
@@ -262,7 +276,7 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]
accelerator (Union[Accelerator, None]): accelerator to use for model training.
Returns:
BaseModelConfig: model configuration.
Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig, DummyModelConfig]: model configuration.
Raises:
ValueError: If both an inference server address and model arguments are provided.
@@ -271,7 +285,11 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]
ValueError: If a base model is specified when not using delta weights or adapter weights.
"""
if args.model_args:
args_dict = {k.split("=")[0]: k.split("=")[1] for k in args.model_args.split(",")}
args_dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in args.model_args.split(",")}

if args_dict.pop("dummy", False):
return DummyModelConfig(**args_dict)

args_dict["accelerator"] = accelerator
args_dict["use_chat_template"] = args.use_chat_template

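The new parsing line treats bare flags in `--model_args` as booleans and keeps `key=value` pairs as strings; a simplified rework of that comprehension with a stand-in `DummyModelConfig` (note that values such as `seed` are not cast to `int` by the comprehension itself):

```python
from dataclasses import dataclass

@dataclass
class DummyModelConfig:  # stand-in mirroring the dataclass added above
    seed: int = 42

def parse_model_args(model_args: str):
    # "key=value" pairs keep their string value; bare flags such as "dummy" become True.
    args_dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")}
    if args_dict.pop("dummy", False):
        return DummyModelConfig(**args_dict)
    return args_dict

print(parse_model_args("dummy"))           # DummyModelConfig(seed=42)
print(parse_model_args("dummy,seed=123"))  # DummyModelConfig(seed='123') -- string, not int
```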
20 changes: 18 additions & 2 deletions src/lighteval/models/model_loader.py
@@ -27,11 +27,13 @@
from lighteval.models.adapter_model import AdapterModel
from lighteval.models.base_model import BaseModel
from lighteval.models.delta_model import DeltaModel
from lighteval.models.dummy_model import DummyModel
from lighteval.models.endpoint_model import InferenceEndpointModel
from lighteval.models.model_config import (
AdapterModelConfig,
BaseModelConfig,
DeltaModelConfig,
DummyModelConfig,
EnvConfig,
InferenceEndpointModelConfig,
InferenceModelConfig,
@@ -54,9 +56,16 @@ class ModelInfo:


def load_model( # noqa: C901
config: Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig],
config: Union[
BaseModelConfig,
AdapterModelConfig,
DeltaModelConfig,
TGIModelConfig,
InferenceEndpointModelConfig,
DummyModelConfig,
],
env_config: EnvConfig,
) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel, ModelClient], ModelInfo]:
) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel, ModelClient, DummyModel], ModelInfo]:
"""Will load either a model from an inference server or a model from a checkpoint, depending
on the config type.
Expand All @@ -82,6 +91,9 @@ def load_model( # noqa: C901
if isinstance(config, BaseModelConfig):
return load_model_with_accelerate_or_default(config=config, env_config=env_config)

if isinstance(config, DummyModelConfig):
return load_dummy_model(config=config, env_config=env_config)


def load_model_with_tgi(config: TGIModelConfig):
if not is_tgi_available():
@@ -143,3 +155,7 @@ def load_model_with_accelerate_or_default(
hlog(f"Model info: {model_info}")

return model, model_info


def load_dummy_model(config: DummyModelConfig, env_config: EnvConfig):
return DummyModel(config=config, env_config=env_config), ModelInfo(model_name="dummy", model_sha=str(config.seed))
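Putting the loader and config changes together, a hypothetical end-to-end use of the dummy path (the `EnvConfig` arguments are placeholder values, not defaults required by lighteval):

```python
from lighteval.models.model_config import DummyModelConfig, EnvConfig
from lighteval.models.model_loader import load_model

config = DummyModelConfig(seed=123)
env_config = EnvConfig(token=None, cache_dir="/tmp/lighteval_cache")  # placeholder values
model, model_info = load_model(config=config, env_config=env_config)
print(model_info.model_name, model_info.model_sha)  # "dummy", "123"
```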
4 changes: 2 additions & 2 deletions src/lighteval/models/model_output.py
@@ -31,8 +31,8 @@ class ModelReturn:
result: Union[tuple, list, str]
input_tokens: list[int] = field(default_factory=list) # model inputs
generated_tokens: list[int] = field(default_factory=list) # model generations
truncated_tokens_count: Optional[int] = None # How many tokens truncated
padded_tokens_count: Optional[int] = None # How many tokens of padding
truncated_tokens_count: Optional[int] = 0 # How many tokens truncated
padded_tokens_count: Optional[int] = 0 # How many tokens of padding

def get_result_for_eval(self):
raise NotImplementedError()