
Support for nanotron #11

Merged: 16 commits, Feb 7, 2024
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -37,4 +37,5 @@ repos:
rev: 'v0.1.6'
hooks:
- id: ruff
args: ['--fix']
clefourrier marked this conversation as resolved.
- id: ruff-format
18 changes: 12 additions & 6 deletions README.md
@@ -8,10 +8,10 @@ LightEval is an evaluation suite which gathers a selection of features from wide

It is still an early, internal version - it should be nice to use but don't expect 100% stability!

In case of problems or questions, feel free to open an issue!

## How to install and use
### Requirements
### Installation
0) Create your virtual environment using virtualenv or conda, depending on your preference. We require Python 3.10.

1) Clone the repository using `git clone`, then `cd lighteval-harness` and run `pip install -e .`. Once the dependencies are installed, `cd src`.
@@ -22,6 +22,12 @@ Optional:

2) Add your user token to the environment variable `HUGGING_FACE_HUB_TOKEN` if you want to push your results to the hub
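
If you prefer to set the token from a Python launcher script rather than from your shell, here is a minimal sketch; the token value is a placeholder, and the `huggingface_hub.login` call is optional:

```python
import os

from huggingface_hub import login

# Placeholder token: replace with your own Hugging Face access token.
os.environ["HUGGING_FACE_HUB_TOKEN"] = "hf_xxx"

# Optionally also authenticate the huggingface_hub client directly.
login(token=os.environ["HUGGING_FACE_HUB_TOKEN"])
```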

For linting:
```bash
pre-commit install
pre-commit run --config .pre-commit-config.yaml --all-files
```


### Usage
- Launching on CPU
@@ -50,11 +56,11 @@ Lastly, create a **line summary** of your evaluation, in `metadata_table.json` (a sketch of such an entry follows the field list below).
- `suite` (list), the suite(s) to which your evaluation should belong. This field allows us to compare different task implementations and is used as a task selector to differentiate the versions to launch. At the moment, you'll find the keywords ["helm", "bigbench", "original", "lighteval"]; you can also add new ones (for tests, we recommend using "custom").
- `prompt_function` (str), the name of the prompt function you defined in the step above
- `hf_repo` (str), the path to your evaluation dataset on the hub
- `hf_subset` (str), the specific subset you want to use for your evaluation (note: when the dataset has no subset, fill this field with `"default"`, not with `None` or `""`)
- `hf_avail_splits` (list), all the splits available for your dataset (train, valid or validation, test, other...)
- `evaluation_splits` (list), the splits you want to use for evaluation
- `few_shots_split` (str, can be `null`), the specific split from which you want to select samples for your few-shot examples. It should be different from the sets included in `evaluation_splits`
- `few_shots_select` (str, can be `null`), the method that you will use to select items for your few-shot examples. Can be `null`, or one of:
- `balanced` selects examples from the `few_shots_split` with balanced labels, to avoid skewing the few shot examples (hence the model generations) towards one specific label
- `random` selects examples at random from the `few_shots_split`
- `random_sampling` selects new examples at random from the `few_shots_split` for every new item, but if a sampled item is equal to the current one, it is removed from the available samples
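
A minimal sketch of what one such line summary might look like, written as a Python dict for readability. The task and dataset names are invented, the `"name"` field is an assumption, and the exact JSON layout of `metadata_table.json` should be checked against the existing entries:

```python
# Hypothetical line summary for a custom task (illustrative values only).
my_task_summary = {
    "name": "mytask",                          # assumed field; check existing entries
    "suite": ["custom"],                       # use "custom" while testing
    "prompt_function": "mytask_prompt",        # the prompt function defined in the previous step
    "hf_repo": "my-org/my-eval-dataset",       # dataset path on the hub
    "hf_subset": "default",                    # "default" when the dataset has no subset
    "hf_avail_splits": ["train", "validation", "test"],
    "evaluation_splits": ["test"],
    "few_shots_split": "validation",           # must differ from evaluation_splits
    "few_shots_select": "balanced",            # or "random", "random_sampling", ...
}
```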
@@ -102,7 +108,7 @@ These metrics need the model to generate an output. They are therefore slower.
- `exact_match_indicator`: Exact match with some preceding context (before an indicator) removed
- `f1_score_quasi` (HELM): Average F1 score in terms of word overlap between the model output and gold, with both being normalized first (see the word-overlap sketch after this list excerpt)
- `f1_score`: Average F1 score in terms of word overlap between the model output and gold without normalisation
- `f1_score_macro`: Corpus level macro F1 score
- `f1_score_micro`: Corpus level micro F1 score
- Summarization:
- `rouge` (Harness): Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/)
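
As an illustration of the word-overlap F1 metrics above, here is a small self-contained sketch. It is not lighteval's implementation: the normalization is just lowercasing and punctuation stripping, and the overlap is computed over unique words, standing in for whatever the quasi variant actually applies.

```python
import string


def normalize(text: str) -> str:
    """Toy normalization: lowercase and strip punctuation (stand-in for the quasi variant)."""
    return text.lower().translate(str.maketrans("", "", string.punctuation))


def word_overlap_f1(prediction: str, gold: str) -> float:
    """F1 over the unique-word overlap between prediction and gold (real metrics may count duplicates)."""
    pred_tokens, gold_tokens = set(prediction.split()), set(gold.split())
    common = pred_tokens & gold_tokens
    if not common:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)


# f1_score-style (no normalization) vs f1_score_quasi-style (normalized first)
print(word_overlap_f1("The cat sat.", "the cat sat"))                        # ~0.33
print(word_overlap_f1(normalize("The cat sat."), normalize("the cat sat")))  # 1.0
```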
@@ -141,7 +147,7 @@ These metrics need both the generation and its logprob. They are not working at
- `prediction_perplexity` (HELM): Measure of the logprob of a given input.

## Adding a new metric
If you want to add a new metric, first check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `metrics_sample`. If not, add it to either of these files depending on the level at which it is applied. Then, follow the example in `src.lighteval.metrics.metrics` to register your metric.
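
As a sketch only: a sample-level metric is essentially a function from a prediction and its gold reference(s) to a float. The names below are hypothetical, and the exact signature and registration step should be copied from an existing metric in `metrics_sample` and `src.lighteval.metrics.metrics` rather than from this snippet.

```python
# Hypothetical sample-level metric: fraction of gold keywords present in the output.
def keyword_recall(prediction: str, golds: list[str]) -> float:
    keywords = {word.lower() for gold in golds for word in gold.split()}
    if not keywords:
        return 0.0
    predicted = {word.lower() for word in prediction.split()}
    return len(keywords & predicted) / len(keywords)


# Registration (pseudocode): add an entry next to the existing ones in
# src.lighteval.metrics.metrics, pointing it at keyword_recall and choosing a
# corpus-level aggregation (e.g. mean over samples).
```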

## Examples of scripts to launch lighteval on the cluster
### Evaluate a whole suite on one node, 8 GPUs
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -82,8 +82,7 @@ optimum = ["optimum==1.12.0"]
quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]
adapters = ["peft==0.3.0"]
nanotron = [
"nanotron@git+https://github.com/huggingface/nanotron@8c1a49588d0745a6404644a86547c2dd6a63640e",
"brrr@git+https://github.com/huggingface/brrr@e8a503e2ec08b34eed7522d331aec3bee8cdd29b",
"nanotron@git+https://github.com/huggingface/nanotron",
clefourrier marked this conversation as resolved.
"tensorboardX"
]

36 changes: 35 additions & 1 deletion src/lighteval/data.py
@@ -189,7 +189,41 @@ def _sorting_criteria(self, x) -> int:
Returns:
int: The negative total sequence length (prompt plus generation), used as the sorting key.
"""
toks, (stop_tokens, gen_length) = x
toks = x[0]
meta_data = x[1]
_, gen_length = meta_data[0], meta_data[1]
return -(len(toks) + gen_length)


class GenerativeTaskDatasetNanotron(DynamicBatchDataset):
def __getitem__(self, index) -> Request:
> **Member:** Why do you need your own class? (Is it only to return the index with the item?)
>
> **Member Author:** Nathan's requirement.
>
> **Member:** base_model does not use the index for each sample, which means we need to accommodate the dataset to nanotron.
>
> **Member:** Yes, but I'm unsure why we need to grab the index for brrr.

"""
Get an item from the dataset depending on the split we are currently in.
For instance, if we are in split 0, we will get the item at index 0, if
we are in split 1, we will get the item at index self.split_size, etc.
Used for dynamic batching.

Args:
index (int): The index of the item.

Returns:
tuple: The original index and the item at that index (the index is needed downstream by the nanotron pipeline).
"""
return index, self.sorted_data[index + self.split_start]

def _sorting_criteria(self, x) -> int:
"""
Sorting key used for dynamic batching.

Args:
x (Any): A (tokens, metadata) pair, where the metadata holds the stop tokens and the generation length.

Returns:
int: The negative total sequence length (prompt tokens plus generation length), so the longest requests sort first.
"""
toks = x[0]
meta_data = x[1]
_, gen_length = meta_data[0], meta_data[1]
return -(len(toks) + gen_length)
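
A standalone sketch of what this sorting criterion does, using toy tuples rather than lighteval request objects: requests are ordered by descending total length (prompt tokens plus requested generation length), so that dynamic batches group requests of similar size and the longest ones are processed first.

```python
# Toy requests shaped like (prompt_token_ids, (stop_tokens, generation_length)).
requests = [
    ([1, 2, 3], (["\n"], 10)),    # total length 13
    ([1] * 50, (["\n"], 100)),    # total length 150
    ([1] * 20, (["\n"], 5)),      # total length 25
]


def sorting_criteria(x):
    toks, (_stop_tokens, gen_length) = x
    # Negative so that Python's ascending sort puts the longest requests first.
    return -(len(toks) + gen_length)


for toks, (_, gen_length) in sorted(requests, key=sorting_criteria):
    print(f"prompt={len(toks):>3} tokens, generation={gen_length:>3} tokens")
# Prints the 150-, 25- and 13-token requests, in that order.
```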


150 changes: 74 additions & 76 deletions src/lighteval/logging/evaluation_tracker.py
@@ -18,13 +18,11 @@
TaskConfigLogger,
VersionsLogger,
)
from lighteval.utils import is_nanotron_available
from lighteval.utils import is_nanotron_available, obj_to_markdown


if is_nanotron_available():
from brrr.config import BrrrConfig
from brrr.experiment_loggers import obj_to_markdown
from nanotron.config import get_config_from_dict
from nanotron.config import Config, get_config_from_dict


class EnhancedJSONEncoder(json.JSONEncoder):
@@ -104,81 +102,81 @@ def save(
"""
hlog("Saving experiment tracker")
try:
date_id = datetime.now().isoformat().replace(":", "-")

output_dir_results = Path(output_dir) / "results" / self.general_config_logger.model_name
output_dir_details = Path(output_dir) / "details" / self.general_config_logger.model_name
output_dir_details_sub_folder = output_dir_details / date_id
output_dir_results.mkdir(parents=True, exist_ok=True)
output_dir_details_sub_folder.mkdir(parents=True, exist_ok=True)

output_results_file = output_dir_results / f"results_{date_id}.json"
output_results_in_details_file = output_dir_details / f"results_{date_id}.json"

hlog(f"Saving results to {output_results_file} and {output_results_in_details_file}")

to_dump = {
"config_general": asdict(self.general_config_logger),
"results": self.metrics_logger.metric_aggregated,
"versions": self.versions_logger.versions,
"config_tasks": self.task_config_logger.tasks_configs,
"summary_tasks": self.details_logger.compiled_details,
"summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
}
dumped = json.dumps(to_dump, cls=EnhancedJSONEncoder, indent=2)

with open(output_results_file, "w") as f:
f.write(dumped)

with open(output_results_in_details_file, "w") as f:
f.write(dumped)

for task_name, task_details in self.details_logger.details.items():
output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet"
# Create a dataset from the dictionary
try:
dataset = Dataset.from_list([asdict(detail) for detail in task_details])
except Exception:
# We force cast to str to avoid formatting problems for nested objects
dataset = Dataset.from_list(
[{k: str(v) for k, v in asdict(detail).items()} for detail in task_details]
)
# try:
> **Member:** If you remove the high-level try/catch, please add other try/catches to prevent the other possible failures.
>
> **Member Author:** Are we sure we want to silently catch mistakes, or should we rather let the run fail?
>
> **Member:** No, because we still want the results to be saved locally. That way we can upload them by hand instead of having to redo the whole eval.

date_id = datetime.now().isoformat().replace(":", "-")

# We don't keep 'id' around if it's there
column_names = dataset.column_names
if "id" in dataset.column_names:
column_names = [t for t in dataset.column_names if t != "id"]

# Sort column names to make it easier later
dataset = dataset.select_columns(sorted(column_names))
# Save the dataset to a Parquet file
dataset.to_parquet(output_file_details.as_posix())

if push_results_to_hub:
self.api.upload_folder(
repo_id=self.hub_results_repo if public else self.hub_private_results_repo,
folder_path=output_dir_results,
path_in_repo=self.general_config_logger.model_name,
repo_type="dataset",
commit_message=f"Updating model {self.general_config_logger.model_name}",
)
output_dir_results = Path(output_dir) / "results" / self.general_config_logger.model_name
output_dir_details = Path(output_dir) / "details" / self.general_config_logger.model_name
output_dir_details_sub_folder = output_dir_details / date_id
output_dir_results.mkdir(parents=True, exist_ok=True)
output_dir_details_sub_folder.mkdir(parents=True, exist_ok=True)

if push_details_to_hub:
self.details_to_hub(
model_name=self.general_config_logger.model_name,
results_file_path=output_results_in_details_file,
details_folder_path=output_dir_details_sub_folder,
push_as_public=public,
)
output_results_file = output_dir_results / f"results_{date_id}.json"
output_results_in_details_file = output_dir_details / f"results_{date_id}.json"

hlog(f"Saving results to {output_results_file} and {output_results_in_details_file}")

if push_results_to_tensorboard:
self.push_results_to_tensorboard(
results=self.metrics_logger.metric_aggregated, details=self.details_logger.details
to_dump = {
"config_general": asdict(self.general_config_logger),
"results": self.metrics_logger.metric_aggregated,
"versions": self.versions_logger.versions,
"config_tasks": self.task_config_logger.tasks_configs,
"summary_tasks": self.details_logger.compiled_details,
"summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
}
dumped = json.dumps(to_dump, cls=EnhancedJSONEncoder, indent=2)

with open(output_results_file, "w") as f:
f.write(dumped)

with open(output_results_in_details_file, "w") as f:
f.write(dumped)

for task_name, task_details in self.details_logger.details.items():
output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet"
# Create a dataset from the dictionary
try:
dataset = Dataset.from_list([asdict(detail) for detail in task_details])
except Exception:
# We force cast to str to avoid formatting problems for nested objects
dataset = Dataset.from_list(
[{k: str(v) for k, v in asdict(detail).items()} for detail in task_details]
)
except Exception as e:
hlog("WARNING: Could not save results")
hlog(repr(e))

# We don't keep 'id' around if it's there
column_names = dataset.column_names
if "id" in dataset.column_names:
column_names = [t for t in dataset.column_names if t != "id"]

# Sort column names to make it easier later
dataset = dataset.select_columns(sorted(column_names))
# Save the dataset to a Parquet file
dataset.to_parquet(output_file_details.as_posix())

if push_results_to_hub:
self.api.upload_folder(
repo_id=self.hub_results_repo if public else self.hub_private_results_repo,
folder_path=output_dir_results,
path_in_repo=self.general_config_logger.model_name,
repo_type="dataset",
commit_message=f"Updating model {self.general_config_logger.model_name}",
)

if push_details_to_hub:
self.details_to_hub(
model_name=self.general_config_logger.model_name,
results_file_path=output_results_in_details_file,
details_folder_path=output_dir_details_sub_folder,
push_as_public=public,
)

if push_results_to_tensorboard:
self.push_results_to_tensorboard(
results=self.metrics_logger.metric_aggregated, details=self.details_logger.details
)
# except Exception as e:
# hlog("WARNING: Could not save results")
# hlog(repr(e))
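
A minimal sketch of one way to honor the concern raised in the review thread above, with hypothetical helper names rather than the PR's actual code: write the results to disk unguarded so a failing upload never loses them, and wrap only the push steps in their own try/except.

```python
import json
from pathlib import Path


def save_results(results: dict, output_dir: str, push_to_hub: bool = False) -> None:
    # Local saving stays outside any try/except: if it fails, the run should fail.
    output_file = Path(output_dir) / "results.json"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(json.dumps(results, indent=2))

    if push_to_hub:
        try:
            upload_results(output_file)  # hypothetical upload helper
        except Exception as e:
            # The local file survives, so the upload can be retried by hand later.
            print(f"WARNING: could not push results to the hub: {e!r}")


def upload_results(path: Path) -> None:
    """Hypothetical stand-in for the hub upload step (e.g. an HfApi folder upload)."""
    raise NotImplementedError
```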

def generate_final_dict(self) -> dict:
"""Aggregates and returns all the logger's experiment information in a dictionary.
Expand Down Expand Up @@ -487,7 +485,7 @@ def push_results_to_tensorboard( # noqa: C901
if not is_nanotron_available():
hlog_warn("You cannot push results to tensorboard with having nanotron installed. Skipping")
return
config: BrrrConfig = get_config_from_dict(self.general_config_logger.config, config_class=BrrrConfig)
config: Config = get_config_from_dict(self.general_config_logger.config, config_class=Config)
lighteval_config = config.lighteval
try:
global_step = config.general.step
10 changes: 5 additions & 5 deletions src/lighteval/logging/info_loggers.py
@@ -19,7 +19,7 @@


if is_nanotron_available():
from brrr.config import BrrrConfig
from nanotron.config import Config


@dataclass(init=False)
@@ -64,8 +64,8 @@ class GeneralConfigLogger:
model_dtype: str = None
model_size: str = None

# Nanotron/Brrr config
config: "BrrrConfig" = None
# Nanotron config
config: "Config" = None

def __init__(self) -> None:
"""Stores the current lighteval commit for reproducibility, and starts the evaluation timer."""
@@ -79,7 +79,7 @@ def log_args_info(
override_batch_size: Union[None, int],
max_samples: Union[None, int],
job_id: str,
config: "BrrrConfig" = None,
config: "Config" = None,
) -> None:
"""
Logs the information about the arguments passed to the method.
@@ -91,7 +91,7 @@ def log_args_info(
Else, the batch size is automatically inferred depending on what fits in memory.
max_samples (Union[None, int]): maximum number of samples, if None, use all the samples available.
job_id (str): job ID, used to retrieve logs.
config (optional): BrrrConfig
config (optional): Nanotron Config

Returns:
None
File renamed without changes.