redo logging #415

Merged Dec 5, 2024 (39 commits)
Changes from 35 commits

Commits
8bda6bb
adding closed source models
NathanHB Nov 21, 2024
8122fb8
refacto CLI
NathanHB Nov 26, 2024
8e6e615
use correct parallelism manager for each model we use
NathanHB Nov 26, 2024
e85d31a
adds typer
NathanHB Nov 26, 2024
f6e18e8
adds typer
NathanHB Nov 29, 2024
372b89b
use typer as cli tool
NathanHB Nov 29, 2024
94031db
redo logging
NathanHB Nov 29, 2024
8c7f67c
redo logging
NathanHB Dec 2, 2024
c18f1be
lazy load rouge scorer
NathanHB Dec 2, 2024
4960dd5
fixes
NathanHB Dec 2, 2024
a3ac7a3
change log level of missing task
NathanHB Dec 2, 2024
00f3962
remove unused variable
NathanHB Dec 2, 2024
1360abe
fixes
NathanHB Dec 2, 2024
e99c268
fix from review
NathanHB Dec 3, 2024
85d2ef0
Merge branch 'main' into nathan-refacto-cli
NathanHB Dec 3, 2024
e950828
remove uneeded files
NathanHB Dec 3, 2024
7864c6b
Merge branch 'nathan-refacto-cli' of github.com:huggingface/lighteval…
NathanHB Dec 3, 2024
f989ed8
add typer to deps
NathanHB Dec 3, 2024
2c2748c
fix docs
NathanHB Dec 3, 2024
a80a1db
fix docs
NathanHB Dec 3, 2024
ae4caba
fix docs
NathanHB Dec 3, 2024
a79453a
fix tests
NathanHB Dec 3, 2024
3481562
fix tests
NathanHB Dec 3, 2024
b0ca7f1
fix tests
NathanHB Dec 3, 2024
39ba282
fix tests
NathanHB Dec 3, 2024
9110d96
Update src/lighteval/metrics/metrics_sample.py
NathanHB Dec 3, 2024
39d70a5
Merge branch 'nathan-refacto-cli' into nathan-refacto-logging
NathanHB Dec 3, 2024
7b9ab20
Merge branch 'nathan-refacto-logging' of github.com:huggingface/light…
NathanHB Dec 3, 2024
7339568
Update src/lighteval/metrics/metrics_sample.py
NathanHB Dec 3, 2024
6ef4e81
Merge remote-tracking branch 'origin/main' into nathan-refacto-logging
NathanHB Dec 4, 2024
5280fc0
fix dependencies
NathanHB Dec 4, 2024
1c30dec
rm hirarchical logger file
NathanHB Dec 4, 2024
38ce291
fix logging level
NathanHB Dec 4, 2024
4a1b94a
fix readme
NathanHB Dec 4, 2024
2e55920
fix dependencies and readme
NathanHB Dec 4, 2024
c96c6ca
Update src/lighteval/pipeline.py
NathanHB Dec 5, 2024
051f5e2
Update src/lighteval/tasks/registry.py
NathanHB Dec 5, 2024
aac6c82
Merge branch 'main' into nathan-refacto-logging
NathanHB Dec 5, 2024
f1013bb
fix styling
NathanHB Dec 5, 2024
24 changes: 14 additions & 10 deletions README.md
@@ -55,7 +55,7 @@ Hub, S3, or locally.
## ⚡️ Installation

```bash
pip install lighteval[accelerate]
pip install lighteval
```

Lighteval allows for many extras when installing, see [here](https://github.com/huggingface/lighteval/wiki/Installation) for a complete list.
@@ -71,20 +71,24 @@ huggingface-cli login

Lighteval offers two main entry points for model evaluation:


* `lighteval accelerate`: evaluate models on CPU or one or more GPUs using [🤗
Accelerate](https://github.com/huggingface/accelerate).
* `lighteval nanotron`: evaluate models in distributed settings using [⚡️
Nanotron](https://github.com/huggingface/nanotron).
- `lighteval accelerate` : evaluate models on CPU or one or more GPUs using [🤗
Accelerate](https://github.com/huggingface/accelerate)
- `lighteval nanotron`: evaluate models in distributed settings using [⚡️
Nanotron](https://github.com/huggingface/nanotron)
- `lighteval vllm`: evaluate models on one or more GPUs using [🚀
VLLM](https://github.com/vllm-project/vllm)
- `lighteval endpoint`
- `inference-endpoint`: evaluate models on one or more GPUs using [🔗
Inference Endpoint](https://huggingface.co/inference-endpoints/dedicated)
- `tgi`: evaluate models on one or more GPUs using [🔗 Text Generation Inference](https://huggingface.co/docs/text-generation-inference/en/index)
- `openai`: evaluate models on one or more GPUs using [🔗 OpenAI API](https://platform.openai.com/)

Here’s a quick command to evaluate using the Accelerate backend:

```shell
lighteval accelerate \
--model_args "pretrained=gpt2" \
--tasks "leaderboard|truthfulqa:mc|0|0" \
--override_batch_size 1 \
--output_dir="./evals/"
"pretrained=gpt2" \
"leaderboard|truthfulqa:mc|0|0"
```

## 🙏 Acknowledgements
1 change: 0 additions & 1 deletion docs/source/installation.mdx
@@ -25,7 +25,6 @@ appropriate extras group.

| extra name | description |
|--------------|---------------------------------------------------------------------------|
| accelerate | To use accelerate for model and data parallelism with transformers models |

Member: We'll want to offer a no-parallelism option, I think.

Member (Author): Not sure what you mean; parallelism is not enabled by default, we have to set model_parallel=True.

Member: Using accelerate is mandatory to launch lighteval, but we might want a setup with no DP/MP/PP of any kind, without GPU use.

| tgi | To use Text Generation Inference API to evaluate your model |
| nanotron | To evaluate nanotron models |
| quantization | To evaluate quantized models |
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -55,6 +55,7 @@ keywords = ["evaluation", "nlp", "llm"]
dependencies = [
# Base dependencies
"transformers>=4.38.0",
"accelerate",
"huggingface_hub>=0.23.0",
"torch>=2.0,<2.5",
"GitPython>=3.1.41", # for logging
@@ -64,7 +65,8 @@ dependencies = [
"typer",
"termcolor==2.3.0",
"pytablewriter",
"colorama",
"rich",
"colorlog",
# Extension of metrics
"aenum==3.1.15",
# Base metrics
@@ -80,7 +82,6 @@ dependencies = [
]

[project.optional-dependencies]
accelerate = ["accelerate"]
tgi = ["text-generation==0.6.0"]
optimum = ["optimum==1.12.0"]
quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]
26 changes: 26 additions & 0 deletions src/lighteval/__main__.py
@@ -19,7 +19,10 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
from logging.config import dictConfig

import colorlog
import typer

import lighteval.main_accelerate
@@ -32,6 +35,29 @@

app = typer.Typer()

logging_config = dict( # noqa C408
version=1,
formatters={
"c": {
"()": colorlog.ColoredFormatter,
"format": "[%(asctime)s] [%(log_color)s%(levelname)8s%(reset)s]: %(message)s (%(filename)s:%(lineno)s)",
"log_colors": {
"DEBUG": "cyan",
"INFO": "green",
"WARNING": "yellow",
"ERROR": "red",
"CRITICAL": "red,bg_white",
},
},
},
handlers={"h": {"class": "logging.StreamHandler", "formatter": "c", "level": logging.INFO}},
root={
"handlers": ["h"],
"level": logging.INFO,
},
)

dictConfig(logging_config)

app.command(rich_help_panel="Evaluation Backends")(lighteval.main_accelerate.accelerate)
app.command(rich_help_panel="Evaluation Utils")(lighteval.main_baseline.baseline)
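
For context on how this plays out downstream: the dictConfig call above configures only the root logger at CLI start-up, so the rest of the package can rely on plain standard-library loggers and still get the colorized output. A minimal consumer-side sketch (the function name and messages are illustrative, not taken from the PR):

```python
import logging

# A plain standard-library logger; it propagates to the root handler set up
# by dictConfig in src/lighteval/__main__.py, so its output is rendered by
# the colorlog ColoredFormatter with no extra setup in this module.
logger = logging.getLogger(__name__)


def load_task(task_name: str) -> None:
    # INFO and above pass the root level configured above (logging.INFO).
    logger.info("Loading task %s", task_name)
    # DEBUG is filtered out unless the root level is lowered, e.g. with
    # logging.getLogger().setLevel(logging.DEBUG).
    logger.debug("Resolved task %s", task_name)
```

This is the same direction as the rest of the PR: the custom hierarchical logger is removed and every module switches to `logging.getLogger(__name__)`.
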
9 changes: 6 additions & 3 deletions src/lighteval/data.py
@@ -20,14 +20,14 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import logging
import math
from typing import Iterator, Tuple

import torch
from torch.utils.data import Dataset
from torch.utils.data.distributed import DistributedSampler, T_co

from lighteval.logging.hierarchical_logger import hlog_warn
from lighteval.tasks.requests import (
GreedyUntilRequest,
LoglikelihoodRequest,
@@ -37,6 +37,9 @@
)


logger = logging.getLogger(__name__)


class DynamicBatchDataset(Dataset):
def __init__(
self,
@@ -76,7 +79,7 @@ def __init__(

def init_split_limits(self, num_dataset_splits):
if num_dataset_splits >= self.total_size:
hlog_warn(
logger.warning(
f"num_dataset_splits ({num_dataset_splits}) >= total_size ({self.total_size}), setting num_dataset_splits to 1"
)
num_dataset_splits = 1
@@ -247,7 +250,7 @@ def init_split_limits(self, num_dataset_splits):
_type_: _description_
"""
if num_dataset_splits is not None:
hlog_warn(
logger.warning(
"You cannot select the number of dataset splits for a generative evaluation at the moment. Automatically inferring."
)

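
The hunks above swap `hlog_warn` for a module-level `logger.warning` inside `init_split_limits` without touching the split logic. A reduced, self-contained sketch of that guard (the helper name `clamp_num_splits` is illustrative, not from the PR):

```python
import logging

logger = logging.getLogger(__name__)


def clamp_num_splits(num_dataset_splits: int, total_size: int) -> int:
    # Reduced sketch of the guard in DynamicBatchDataset.init_split_limits:
    # requesting more splits than there are items falls back to a single split,
    # and the warning now goes through the standard logging module.
    if num_dataset_splits >= total_size:
        logger.warning(
            "num_dataset_splits (%d) >= total_size (%d), setting num_dataset_splits to 1",
            num_dataset_splits,
            total_size,
        )
        num_dataset_splits = 1
    return num_dataset_splits
```
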
24 changes: 13 additions & 11 deletions src/lighteval/logging/evaluation_tracker.py
@@ -22,6 +22,7 @@

import copy
import json
import logging
import os
import re
import time
@@ -37,7 +38,6 @@
from fsspec import url_to_fs
from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HFSummaryWriter, hf_hub_url

from lighteval.logging.hierarchical_logger import hlog, hlog_warn
from lighteval.logging.info_loggers import (
DetailsLogger,
GeneralConfigLogger,
@@ -49,6 +49,8 @@
from lighteval.utils.utils import obj_to_markdown


logger = logging.getLogger(__name__)

if is_nanotron_available():
from nanotron.config import GeneralArgs # type: ignore

@@ -147,7 +149,7 @@ def __init__(

def save(self) -> None:
"""Saves the experiment information and results to files, and to the hub if requested."""
hlog("Saving experiment tracker")
logger.info("Saving experiment tracker")
date_id = datetime.now().isoformat().replace(":", "-")

# We first prepare data to save
@@ -202,15 +204,15 @@ def save_results(self, date_id: str, results_dict: dict):
output_dir_results = Path(self.output_dir) / "results" / self.general_config_logger.model_name
self.fs.mkdirs(output_dir_results, exist_ok=True)
output_results_file = output_dir_results / f"results_{date_id}.json"
hlog(f"Saving results to {output_results_file}")
logger.info(f"Saving results to {output_results_file}")
with self.fs.open(output_results_file, "w") as f:
f.write(json.dumps(results_dict, cls=EnhancedJSONEncoder, indent=2, ensure_ascii=False))

def save_details(self, date_id: str, details_datasets: dict[str, Dataset]):
output_dir_details = Path(self.output_dir) / "details" / self.general_config_logger.model_name
output_dir_details_sub_folder = output_dir_details / date_id
self.fs.mkdirs(output_dir_details_sub_folder, exist_ok=True)
hlog(f"Saving details to {output_dir_details_sub_folder}")
logger.info(f"Saving details to {output_dir_details_sub_folder}")
for task_name, dataset in details_datasets.items():
output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet"
with self.fs.open(str(output_file_details), "wb") as f:
@@ -255,7 +257,7 @@ def push_to_hub(

if not self.api.repo_exists(repo_id):
self.api.create_repo(repo_id, private=not (self.public), repo_type="dataset", exist_ok=True)
hlog(f"Repository {repo_id} not found, creating it.")
logger.info(f"Repository {repo_id} not found, creating it.")

# We upload it both as a json and a parquet file
result_file_base_name = f"results_{date_id}"
@@ -490,11 +492,11 @@ def push_to_tensorboard( # noqa: C901
self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail]
):
if not is_tensorboardX_available:
hlog_warn(NO_TENSORBOARDX_WARN_MSG)
logger.warning(NO_TENSORBOARDX_WARN_MSG)
return

if not is_nanotron_available():
hlog_warn("You cannot push results to tensorboard without having nanotron installed. Skipping")
logger.warning("You cannot push results to tensorboard without having nanotron installed. Skipping")
return

prefix = self.tensorboard_metric_prefix
@@ -526,14 +528,14 @@ def push_to_tensorboard( # noqa: C901
bench_suite = None
if ":" in task_name:
bench_suite = task_name.split(":")[0] # e.g. MMLU
hlog(f"bench_suite {bench_suite} in {task_name}")
logger.info(f"bench_suite {bench_suite} in {task_name}")
for metric, value in values.items():
if "stderr" in metric:
continue
if bench_suite not in bench_averages:
bench_averages[bench_suite] = {}
bench_averages[bench_suite][metric] = bench_averages[bench_suite].get(metric, []) + [float(value)]
hlog(f"Pushing {task_name} {values} to tensorboard")
logger.info(f"Pushing {task_name} {values} to tensorboard")
for metric, value in values.items():
if "stderr" in metric:
tb_context.add_scalar(f"stderr_{prefix}/{task_name}/{metric}", value, global_step=global_step)
@@ -546,7 +548,7 @@ def push_to_tensorboard( # noqa: C901
# Tasks with subtasks
for name, values in bench_averages.items():
for metric, values in values.items():
hlog(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard")
logger.info(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard")
tb_context.add_scalar(f"{prefix}/{name}/{metric}", sum(values) / len(values), global_step=global_step)

tb_context.add_text("eval_config", obj_to_markdown(results), global_step=global_step)
Expand All @@ -571,7 +573,7 @@ def push_to_tensorboard( # noqa: C901

# Now we can push to the hub
tb_context.scheduler.trigger()
hlog(
logger.info(
f"Pushed to tensorboard at https://huggingface.co/{self.tensorboard_repo}/{output_dir_tb}/tensorboard"
f" at global_step {global_step}"
)
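
The push_to_tensorboard hunks keep the aggregation logic unchanged and only swap the logging calls. A reduced sketch of the per-suite averaging they wrap, written as a standalone helper (the function name is illustrative; it assumes `results` maps task names like "MMLU:abstract_algebra" to metric dicts, as in the loop above):

```python
import logging

logger = logging.getLogger(__name__)


def average_bench_metrics(results: dict[str, dict[str, float]]) -> dict[str, dict[str, float]]:
    # Reduced sketch of the averaging loop in push_to_tensorboard: metrics are
    # grouped by benchmark suite (the prefix before ":", e.g. "MMLU") and
    # averaged, skipping stderr entries; progress goes through logger.info.
    bench_averages: dict[str, dict[str, list[float]]] = {}
    for task_name, values in results.items():
        if ":" not in task_name:
            continue
        bench_suite = task_name.split(":")[0]
        for metric, value in values.items():
            if "stderr" in metric:
                continue
            bench_averages.setdefault(bench_suite, {}).setdefault(metric, []).append(float(value))
        logger.info(f"Pushing {task_name} {values} to tensorboard")
    return {
        suite: {metric: sum(vals) / len(vals) for metric, vals in per_metric.items()}
        for suite, per_metric in bench_averages.items()
    }
```
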