From c5d9090ce4aeb44431a5ab3d2b4677d1bda3b7a9 Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Tue, 30 Jan 2024 13:14:08 +0000
Subject: [PATCH 01/13] add doc to utils.py

---
 src/lighteval/utils.py | 54 +++++++++++++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py
index f8ec0665..246510fe 100644
--- a/src/lighteval/utils.py
+++ b/src/lighteval/utils.py
@@ -12,12 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import importlib
-from typing import Union
+from typing import Any, Union

 import numpy as np


-def sanitize_numpy(example_dict):
+def sanitize_numpy(example_dict: dict) -> dict:
+    """
    Sanitizes a dictionary by converting any numpy generic types to their corresponding Python types.

    Args:
        example_dict (dict): The dictionary to be sanitized.

    Returns:
        dict: The sanitized dictionary with numpy generic types converted to Python types.
    """
    output_dict = {}
    for k, v in example_dict.items():
        if isinstance(v, np.generic):
@@ -27,7 +36,21 @@
    return output_dict


-def as_list(item):
+def as_list(item: Union[list, tuple, Any]) -> list:
+    """
    Convert the given item into a list.

    If the item is already a list, it is returned as is.
    If the item is a tuple, it is converted into a list.
    Otherwise, the item is wrapped in a list.

    Args:
        item (Union[list, tuple, Any]): The item to be converted.

    Returns:
        list: The converted list.

    """
    if isinstance(item, list):
        return item
    elif isinstance(item, tuple):
@@ -35,53 +58,62 @@
    return [item]


-def flatten(item: list[Union[list, str]]):
+def flatten(item: list[Union[list, str]]) -> list[str]:
+    """
    Flattens a nested list of strings into a single flat list.

    Args:
        item (list[Union[list, str]]): The nested list to be flattened.

    Returns:
        list[str]: The flattened list of strings.
    """
    flat_item = []
    for sub_item in item:
        flat_item.extend(sub_item) if isinstance(sub_item, list) else flat_item.append(sub_item)
    return flat_item


-def is_accelerate_available():
+def is_accelerate_available() -> bool:
    return importlib.util.find_spec("accelerate") is not None


NO_ACCELERATE_ERROR_MSG = "You requested the use of accelerate for this evaluation, but it is not available in your current environment. Please install it using pip."


-def is_tgi_available():
+def is_tgi_available() -> bool:
    return importlib.util.find_spec("text-generation") is not None


NO_TGI_ERROR_MSG = "You are trying to start a text generation inference endpoint, but text-generation is not present in your local environment. Please install it using pip."


-def is_nanotron_available():
+def is_nanotron_available() -> bool:
    return importlib.util.find_spec("nanotron") is not None


NO_NANOTRON_ERROR_MSG = "You requested the use of nanotron for this evaluation, but it is not available in your current environment. Please install it using pip."


-def is_optimum_available():
+def is_optimum_available() -> bool:
    return importlib.util.find_spec("optimum") is not None


-def is_bnb_available():
+def is_bnb_available() -> bool:
    return importlib.util.find_spec("bitsandbytes") is not None


NO_BNB_ERROR_MSG = "You are trying to load a model quantized with `bitsandbytes`, which is not available in your local environment. Please install it using pip."
-def is_autogptq_available():
+def is_autogptq_available() -> bool:
    return importlib.util.find_spec("auto-gptq") is not None


NO_AUTOGPTQ_ERROR_MSG = "You are trying to load a model quantized with `auto-gptq`, which is not available in your local environment. Please install it using pip."


-def is_peft_available():
+def is_peft_available() -> bool:
    return importlib.util.find_spec("peft") is not None

From 9b47974b920a4946133fe7cbe0598b459e3a9470 Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Tue, 30 Jan 2024 13:59:47 +0000
Subject: [PATCH 02/13] add doc and typing

---
 src/lighteval/logging/evaluation_tracker.py | 27 +++++++---
 src/lighteval/logging/info_loggers.py       | 23 +++++++-
 src/lighteval/models/model_config.py        | 60 ++++++++++++++++++---
 3 files changed, 95 insertions(+), 15 deletions(-)

diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 3d36d76c..a14767c6 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -28,7 +28,8 @@
 class EnhancedJSONEncoder(json.JSONEncoder):
-    """Provides a proper json encoding for the loggers and trackers json dumps.
+    """
+    Provides a proper json encoding for the loggers and trackers json dumps.

    Notably manages the json encoding of dataclasses.
    """

@@ -39,10 +40,16 @@ def default(self, o):

 class EvaluationTracker:
-    """Keeps track of the overall evaluation process and relevant informations.
-
-    The [`EvaluationTracker`] contains specific loggers for experiments details ([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions ([`VersionsLogger`]) as well as for the general configurations of both the specific task ([`TaskConfigLogger`]) and overall evaluation run ([`GeneralConfigLogger`]).
-    It compiles the data from these loggers and writes it to files, which can be published to the Hugging Face hub if requested.
+    """
+    Keeps track of the overall evaluation process and relevant information.
+
+    The [`EvaluationTracker`] contains specific loggers for experiments details
+    ([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions
+    ([`VersionsLogger`]) as well as for the general configurations of both the
+    specific task ([`TaskConfigLogger`]) and overall evaluation run
+    ([`GeneralConfigLogger`]). It compiles the data from these loggers and
+    writes it to files, which can be published to the Hugging Face hub if
+    requested.
    """

    details_logger: DetailsLogger
    metrics_logger: MetricsLogger
    versions_logger: VersionsLogger
    general_config_logger: GeneralConfigLogger
    task_config_logger: TaskConfigLogger
    hub_results_org: str

    def __init__(self, hub_results_org: str = "", token: str = "") -> None:
-        """Creates all the necessary loggers for evaluation tracking.
+        """
+        Creates all the necessary loggers for evaluation tracking.

        Args:
-            hub_results_org (str): The organisation to push the results to. See more details about the datasets organisation in [`EvaluationTracker.save`]
-            token (str): Token to use when pushing to the hub. This token should have write access to `hub_results_org`.
+            hub_results_org (str): The organisation to push the results to. See
+                more details about the datasets organisation in
+                [`EvaluationTracker.save`]
+            token (str): Token to use when pushing to the hub. This token should
+                have write access to `hub_results_org`.
""" self.details_logger = DetailsLogger() self.metrics_logger = MetricsLogger() diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 9fb4249e..36276fc1 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -2,6 +2,7 @@ import os import time from dataclasses import asdict, dataclass, field +from typing import Union import git import numpy as np @@ -72,7 +73,27 @@ def __init__(self) -> None: self.lighteval_sha = repo.git.rev_parse("HEAD") self.start_time = time.perf_counter() - def log_args_info(self, num_fewshot_seeds, override_batch_size, max_samples, job_id, config=None) -> None: + def log_args_info( + self, + num_fewshot_seeds: int, + override_batch_size: Union[None, int], + max_samples: Union[None, int], + job_id: str, + config: "BrrrConfig" = None, + ) -> None: + """ + Logs the information about the arguments passed to the method. + + Args: + num_fewshot_seeds (int): The number of few-shot seeds. + override_batch_size (Union[None, int]): The overridden batch size. + max_samples (Union[None, int]): The maximum number of samples. + job_id (str): The job ID. + config (optional): BrrrConfig + + Returns: + None + """ self.num_fewshot_seeds = num_fewshot_seeds self.override_batch_size = override_batch_size self.max_samples = max_samples diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index abe95bc2..a5e58e03 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -2,7 +2,7 @@ from typing import Optional, Union import torch -from transformers import AutoConfig, BitsAndBytesConfig, GPTQConfig +from transformers import AutoConfig, BitsAndBytesConfig, GPTQConfig, PretrainedConfig from lighteval.logging.hierarchical_logger import hlog from lighteval.models.utils import _get_model_sha @@ -23,12 +23,16 @@ @dataclass class EnvConfig: + """ + Configuration class for environment settings. + + Attributes: + cache_dir (str): The directory for caching data. + token (str): The authentication token used for accessing the HuggingFace Hub. + """ cache_dir: str = None token: str = None - -@dataclass -class BaseModelConfig: """Args: pretrained (str): The HuggingFace Hub model ID name or the path to a pre-trained @@ -50,6 +54,50 @@ class BaseModelConfig: Use `dtype="auto"` to derive the type from the model’s weights. """ + + +@dataclass +class BaseModelConfig: + """ + Base configuration class for models. + + Attributes: + pretrained (str): The HuggingFace Hub model ID name or the path to a + pre-trained model to load. This is effectively the + `pretrained_model_name_or_path` argument of `from_pretrained` in the + HuggingFace `transformers` API. + accelerator (Accelerator): The accelerator to use for model training. + tokenizer (Optional[str]): The HuggingFace Hub tokenizer ID that will be + used for tokenization. + multichoice_continuations_start_space (Optional[bool]): Whether to add a + space at the start of each continuation in multichoice generation. + subfolder (Optional[str]): The subfolder within the model repository. + revision (str): The revision of the model. + batch_size (int): The batch size for model training. + max_gen_toks (Optional[int]): The maximum number of tokens to generate. + max_length (Optional[int]): The maximum length of the generated output. + add_special_tokens (bool, optional, defaults to True): + Whether to add special tokens to the input sequences. 
If `None`, the + default value will be set to `True` for seq2seq models (e.g. T5) and + `False` for causal models. + model_parallel (Optional[bool]): Whether to use model parallelism. + dtype (Optional[Union[str, torch.dtype]]): The data type of the model. + device (Union[int, str]): The device to use for model training. + quantization_config (Optional[BitsAndBytesConfig]): The quantization + configuration for the model. + load_in_8bit (bool): Whether to load the model in 8-bit precision. + load_in_4bit (bool): Whether to load the model in 4-bit precision. + trust_remote_code (bool): Whether to trust remote code during model + loading. + + Methods: + __post_init__(): Performs post-initialization checks on the configuration. + _init_configs(model_name, env_config): Initializes the model configuration. + init_configs(env_config): Initializes the model configuration using the environment configuration. + get_model_sha(): Retrieves the SHA of the model. + + """ + pretrained: str accelerator: "Accelerator" = None tokenizer: Optional[str] = None @@ -77,7 +125,7 @@ def __post_init__(self): if not isinstance(self.device, str): raise ValueError("Current device must be passed as string.") - def _init_configs(self, model_name, env_config: EnvConfig): + def _init_configs(self, model_name: str, env_config: EnvConfig) -> PretrainedConfig: revision = self.revision if self.subfolder: revision = f"{self.revision}/{self.subfolder}" @@ -98,7 +146,7 @@ def _init_configs(self, model_name, env_config: EnvConfig): return auto_config - def init_configs(self, env_config: EnvConfig): + def init_configs(self, env_config: EnvConfig) -> PretrainedConfig: return self._init_configs(self.pretrained, env_config=env_config) def get_model_sha(self): From ce024b09c8bcef8980d783a7a22338f50ff5c94b Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 30 Jan 2024 14:14:17 +0000 Subject: [PATCH 03/13] add doc and typing --- src/lighteval/logging/info_loggers.py | 14 ++++++++++---- src/lighteval/models/model_config.py | 22 +++++++++++++++++++--- src/lighteval/tasks/registry.py | 8 ++++---- src/lighteval/utils_parallelism.py | 10 ++++++++++ 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 36276fc1..1b2d2523 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -101,10 +101,16 @@ def log_args_info( self.config = config def log_model_info(self, model_info: ModelInfo) -> None: - self.model_name = model_info.model_name - self.model_sha = model_info.model_sha - self.model_dtype = model_info.model_dtype - self.model_size = model_info.model_size + """ + Logs the model information. + + Args: + model_info (ModelInfo): The model information to be logged. + """ + self.model_name = model_info.model_name + self.model_sha = model_info.model_sha + self.model_dtype = model_info.model_dtype + self.model_size = model_info.model_size def log_end_time(self) -> None: self.end_time = time.perf_counter() diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index a5e58e03..cbc87655 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -1,3 +1,4 @@ +from argparse import Namespace from dataclasses import dataclass from typing import Optional, Union @@ -30,6 +31,7 @@ class EnvConfig: cache_dir (str): The directory for caching data. token (str): The authentication token used for accessing the HuggingFace Hub. 
""" + cache_dir: str = None token: str = None @@ -55,7 +57,6 @@ class EnvConfig: """ - @dataclass class BaseModelConfig: """ @@ -194,8 +195,23 @@ class TGIModelConfig: inference_server_auth: str -def create_model_config(args, accelerator: Accelerator): # noqa C901 - # Tests +def create_model_config(args: Namespace, accelerator: Union[Accelerator, None]) -> BaseModelConfig: # noqa: C901 + """ + Create a model configuration based on the provided arguments. + + Args: + args (Namespace): The command-line arguments. + accelerator (Union[Accelerator, None]): The accelerator to use for model training. + + Returns: + BaseModelConfig: The model configuration. + + Raises: + ValueError: If both an inference server address and model arguments are provided. + ValueError: If both multichoice continuations start with a space and do not start with a space. + ValueError: If a base model is not specified when using delta weights or adapter weights. + ValueError: If a base model is specified when not using delta weights or adapter weights. + """ if args.inference_server_address is not None and args.model_args is not None: raise ValueError("You cannot both use an inference server and load a model from its checkpoint.") diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 1989584a..d3cb6615 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -71,8 +71,8 @@ def get_custom_tasks(custom_tasks_file: str) -> Tuple[ModuleType, str]: def taskinfo_selector( - tasks: str, few_shot_default: int = 0 -) -> tuple[list[str], dict[str, list[tuple[int, bool]]], dict[str, str]]: + tasks: str, +) -> tuple[list[str], dict[str, list[tuple[int, bool]]]]: """ Selects task information based on the given tasks and description dictionary path. @@ -80,12 +80,12 @@ def taskinfo_selector( tasks (str): A string containing a comma-separated list of tasks in the format "suite|task|few_shot|truncate_few_shots" or a path to a file containing a list of tasks. + few_shot_default (int, optional): The default few_shot value to use if not provided. Returns: - tuple[list[str], dict[str, list[tuple[int, bool]]], dict[str, str]]: A tuple containing: + tuple[list[str], dict[str, list[tuple[int, bool]]]]: A tuple containing: - A sorted list of unique task names in the format "suite|task". - A dictionary mapping each task name to a list of tuples representing the few_shot and truncate_few_shots values. - - A dictionary containing the description dictionary loaded from the given path, or an empty dictionary if no path is provided. """ few_shot_dict = collections.defaultdict(list) diff --git a/src/lighteval/utils_parallelism.py b/src/lighteval/utils_parallelism.py index a009eae9..7c38df46 100644 --- a/src/lighteval/utils_parallelism.py +++ b/src/lighteval/utils_parallelism.py @@ -92,6 +92,16 @@ def decorator(*args, **kwargs): def test_all_gather(accelerator=None, parallel_context=None): + """ + Test the gather operation in a parallel setup. + + Args: + accelerator (Optional): The accelerator object used for parallelism. + parallel_context (Optional): The parallel context object used for parallelism. + + Raises: + ImportError: If the required accelerator or parallel context is not available. 
+ """ if accelerator: if not is_accelerate_available(): raise ImportError(NO_ACCELERATE_ERROR_MSG) From a02306899742f48817980db0a1cba91f5784be99 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 30 Jan 2024 14:32:32 +0000 Subject: [PATCH 04/13] add doc and typing to registry.py --- src/lighteval/tasks/registry.py | 70 +++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index d3cb6615..8b74c00a 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -22,11 +22,36 @@ class Registry: - def __init__(self, cache_dir): - self.cache_dir = cache_dir - self.TASK_REGISTRY = {**create_config_tasks(cache_dir=cache_dir)} - - def get_task_class(self, task_name, custom_tasks_registry=None): + def __init__(self, cache_dir: str): + """ + Initialize the Registry class. + + Args: + cache_dir (str): The directory path for caching. + + Attributes: + cache_dir (str): The directory path for caching. + TASK_REGISTRY (dict[str, LightevalTask]): A dictionary containing the registered tasks. + """ + self.cache_dir: str = cache_dir + self.TASK_REGISTRY: dict[str, LightevalTask] = {**create_config_tasks(cache_dir=cache_dir)} + + def get_task_class( + self, task_name: str, custom_tasks_registry: Optional[dict[str, LightevalTask]] = None + ) -> LightevalTask: + """ + Get the task class based on the task name. + + Args: + task_name (str): The name of the task. + custom_tasks_registry (Optional[dict[str, LightevalTask]]): A dictionary containing custom tasks. + + Returns: + LightevalTask: The task class. + + Raises: + ValueError: If the task is not found in the task registry or custom task registry. + """ if task_name in self.TASK_REGISTRY: return self.TASK_REGISTRY[task_name] elif custom_tasks_registry is not None and task_name in custom_tasks_registry: @@ -41,14 +66,27 @@ def get_task_class(self, task_name, custom_tasks_registry=None): def get_task_dict( self, task_name_list: List[str], custom_tasks_file: Optional[str] = None ) -> Dict[str, LightevalTask]: - ## todo: make clearer + """ + Get a dictionary of tasks based on the task name list. + + Args: + task_name_list (List[str]): A list of task names. + custom_tasks_file (Optional[str]): The path to the custom tasks file. + + Returns: + Dict[str, LightevalTask]: A dictionary containing the tasks. + + Notes: + - If custom_tasks_file is provided, it will import the custom tasks module and create a custom tasks registry. + - Each task in the task_name_list will be instantiated with the corresponding task class. + """ if custom_tasks_file is not None: dataset_module = dataset_module_factory(str(custom_tasks_file)) custom_tasks_module = importlib.import_module(dataset_module.module_path) custom_tasks_registry = create_config_tasks( meta_table=custom_tasks_module.TASKS_TABLE, cache_dir=self.cache_dir ) - print(custom_tasks_registry) + hlog(custom_tasks_registry) else: custom_tasks_module = None custom_tasks_registry = None @@ -80,7 +118,6 @@ def taskinfo_selector( tasks (str): A string containing a comma-separated list of tasks in the format "suite|task|few_shot|truncate_few_shots" or a path to a file containing a list of tasks. - few_shot_default (int, optional): The default few_shot value to use if not provided. 
Returns: tuple[list[str], dict[str, list[tuple[int, bool]]]]: A tuple containing: @@ -117,9 +154,20 @@ def taskinfo_selector( return sorted(few_shot_dict.keys()), {k: list(set(v)) for k, v in few_shot_dict.items()} -def create_config_tasks(meta_table=None, cache_dir: str = None) -> Dict[str, LightevalTask]: - """Creates a dictionary of tasks from a list of subjects - :return: {task_name: task} +def create_config_tasks( + meta_table: Optional[Dataset] = None, cache_dir: Optional[str] = None +) -> Dict[str, LightevalTask]: + """ + Create configuration tasks based on the provided meta_table. + + Args: + meta_table (Optional[Dataset]): The meta_table containing task + configurations. If not provided, it will be loaded from TABLE_PATH. + cache_dir (Optional[str]): The directory to store cached data. If not + provided, the default cache directory will be used. + + Returns: + Dict[str, LightevalTask]: A dictionary of task names mapped to their corresponding LightevalTask classes. """ def create_task(name, cfg, cache_dir): From 561ec9501e50379e67022830a9cdefe1f9f35eac Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 30 Jan 2024 14:33:48 +0000 Subject: [PATCH 05/13] change doc in lighteval_tasks.py --- src/lighteval/tasks/lighteval_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index ff7197fe..39ff3e58 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -324,7 +324,7 @@ def aggregation(self): @staticmethod def load_datasets(tasks: list["LightevalTask"], dataset_loading_processes: int = 1) -> None: """ - Load datasets for the given tasks. + Load datasets from the HuggingFace Hub for the given tasks. Args: tasks (list): A list of tasks. From d34ae2f293b2fd0f9c78a28f06cfb1dc6f09078f Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 30 Jan 2024 14:35:32 +0000 Subject: [PATCH 06/13] make style --- src/lighteval/logging/info_loggers.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 1b2d2523..b59614cf 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -101,16 +101,16 @@ def log_args_info( self.config = config def log_model_info(self, model_info: ModelInfo) -> None: - """ - Logs the model information. - - Args: - model_info (ModelInfo): The model information to be logged. - """ - self.model_name = model_info.model_name - self.model_sha = model_info.model_sha - self.model_dtype = model_info.model_dtype - self.model_size = model_info.model_size + """ + Logs the model information. + + Args: + model_info (ModelInfo): The model information to be logged. 
+        """
+        self.model_name = model_info.model_name
+        self.model_sha = model_info.model_sha
+        self.model_dtype = model_info.model_dtype
+        self.model_size = model_info.model_size

     def log_end_time(self) -> None:
         self.end_time = time.perf_counter()

From 8b36fe0130040d648eecad3f0f0f0dc40d42bbaf Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Tue, 30 Jan 2024 14:51:14 +0000
Subject: [PATCH 07/13] add doc and typing to registry.py

---
 src/lighteval/tasks/registry.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index 8b74c00a..ff3caa46 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -22,6 +22,10 @@

 class Registry:
+    """
+    The Registry class is used to manage the task registry and get task classes.
+    """
+
     def __init__(self, cache_dir: str):

From 4176f1e4da0d6b2449108ee1d62021f1b2d71948 Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Tue, 30 Jan 2024 15:30:43 +0000
Subject: [PATCH 08/13] add doc to lighteval_tasks

---
 src/lighteval/metrics/metrics.py      |   2 +-
 src/lighteval/tasks/lighteval_task.py | 145 ++++++++++++++++++++++----
 2 files changed, 128 insertions(+), 19 deletions(-)

diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index df9af332..318a4599 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -501,7 +501,7 @@ def higher_is_better():
         return res

     @staticmethod
-    def corpus_level_fns():
+    def corpus_level_fns() -> dict[str, callable]:
         res = {}
         for metric in Metrics:
             if metric.value.category == MetricCategory.IGNORED:

diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 39ff3e58..17dbc024 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -40,7 +40,19 @@

 class LightevalTask:
-    def __init__(self, name: str, cfg: dict, cache_dir: str = None, custom_tasks_module=None):
+    def __init__(self, name: str, cfg: dict, cache_dir: Optional[str] = None, custom_tasks_module=None):
+        """
+        Initialize a LightEval task.
+
+        Args:
+            name (str): The name of the task.
+            cfg (dict): The configuration dictionary containing
+                task-specific settings (from the task_table.json file).
+            cache_dir (Optional[str], optional): The directory to cache the
+                dataset. Defaults to None.
+            custom_tasks_module (ModuleType, optional): A custom module
+                containing task-specific functions. Defaults to None.
+        """
         self.name = name
         self.VERSION = 0
         self.is_main_process = False
@@ -108,6 +120,16 @@ def cfg(self):
         return self._cfg

     def doc_to_text_without_instructions(self, doc: Doc) -> str:
+        """
+        Returns the query of the document without the instructions. If the
+        document has instructions, it removes them from the query.
+
+        Args:
+            doc (Doc): The document.
+
+        Returns:
+            str: The query of the document without the instructions.
+        """
         if doc.instruction is not None:
             if not doc.query.startswith(doc.instruction):
                 raise ValueError(f"Prompt query {doc.query} is not starting with instruction {doc.instruction}")
@@ -115,6 +137,18 @@
         return doc.query

     def doc_to_text_and_instructions(self, doc: Doc) -> Tuple[str, str]:
+        """
+        Returns a tuple with the query of the document and the instructions.
+        If the document has no instructions, the second element of the tuple is
+        an empty string.
+
+        Args:
+            doc (Doc): The document.
+ + Returns: + Tuple[str, str]: A tuple with the query of the document and the + instructions. + """ if doc.instruction is not None: if not doc.query.startswith(doc.instruction): raise ValueError(f"Prompt query {doc.query} is not starting with instruction {doc.instruction}") @@ -122,10 +156,17 @@ def doc_to_text_and_instructions(self, doc: Doc) -> Tuple[str, str]: return (doc.query, "") def get_first_possible_fewshot_splits(self, number_of_splits: int = 1) -> list[str]: - """Parses the possible fewshot split keys in order: - train, then validation keys - and matches them with the available keys. - Returns the first available. + """ + Parses the possible fewshot split keys in order: train, then validation + keys and matches them with the available keys. Returns the first + available. + + Args: + number_of_splits (int, optional): The number of splits to return. + Defaults to 1. + + Returns: + list[str]: The list of the first available fewshot splits. """ # Possible few shot splits are the available splits not used for evaluation possible_fewshot_splits = [k for k in self.all_available_splits if k not in self.evaluation_split] @@ -145,6 +186,17 @@ def get_first_possible_fewshot_splits(self, number_of_splits: int = 1) -> list[s return None def _get_docs_from_split(self, keys, few_shots=False) -> list[Doc]: + """ + Get the documents from the dataset for the given keys (splits). + + Args: + keys (list): The list of keys (splits). + few_shots (bool, optional): Whether the documents are used for few + shot examples. Defaults to False. + + Returns: + list[Doc]: The list of documents. + """ if self.dataset is None: self.dataset = download_dataset_worker((self.dataset_path, self.dataset_config_name)) @@ -159,6 +211,13 @@ def _get_docs_from_split(self, keys, few_shots=False) -> list[Doc]: return docs def fewshot_docs(self) -> list[Doc]: + """ + Returns the few shot documents. If the few shot documents are not + available, it gets them from the few shot split or the evaluation split. + + Returns: + list[Doc]: The few shot documents. + """ if self._fewshot_docs is None: self._fewshot_docs = [] @@ -170,11 +229,28 @@ def fewshot_docs(self) -> list[Doc]: return self._fewshot_docs def eval_docs(self) -> list[Doc]: + """ + Returns the evaluation documents. + + Returns: + list[Doc]: The evaluation documents. + """ if self._docs is None: self._docs = self._get_docs_from_split(self.evaluation_split) return self._docs - def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False): + def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str: + """ + Returns the target of the given document. + + Args: + formatted_doc (Doc): The formatted document. + few_shot (bool, optional): Whether the document is used for few + shot examples. Defaults to False. + + Returns: + str: The target of the document. + """ if few_shot: if formatted_doc.target_for_fewshot_sorting is not None: return formatted_doc.target_for_fewshot_sorting @@ -184,6 +260,16 @@ def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False): # Requests def get_request_type(self) -> list[RequestType]: + """ + Returns the request types for the task. + + Returns: + list[RequestType]: The request types for the task. + + Raises: + NotImplementedError: If the request type is not implemented for the + task. 
+ """ request_types = [] if self.has_metric_category[MetricCategory.TARGET_PERPLEXITY]: request_types.append(RequestType.LOGLIKELIHOOD) @@ -207,7 +293,7 @@ def construct_requests( self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str ) -> List[Request]: """ - Constructs a list of requests based on the given parameters. + Constructs a list of requests from the task based on the given parameters. Args: formatted_doc (Doc): The formatted document almost straight from the dataset. @@ -282,7 +368,17 @@ def construct_requests( return requests - def process_results(self, formatted_doc: Doc, results: list[ModelReturn]): + def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dict[str, float]: + """ + Processes the results of the task. and stores them in the output dict. + + Args: + formatted_doc (Doc): The formatted document of the task. + results (list[ModelReturn]): The results of the task, returned by the model class after evaluation. + + Returns: + dict[str, float]: The output dictionary containing the results of the task. + """ # Metrics management is done in metrics.__init__ outputs = {} if self.has_metric_category[MetricCategory.TARGET_PERPLEXITY]: @@ -319,6 +415,10 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]): return outputs def aggregation(self): + """ + Return a dict with metric name and its aggregation function for all + metrics + """ return Metrics.corpus_level_fns() @staticmethod @@ -349,6 +449,10 @@ def load_datasets(tasks: list["LightevalTask"], dataset_loading_processes: int = def download_dataset_worker(args): + """ + Worker function to download a dataset from the HuggingFace Hub. + Used for parallel dataset loading. + """ dataset_path, dataset_config_name = args dataset = load_dataset( path=dataset_path, @@ -370,22 +474,27 @@ def create_requests_from_tasks( # noqa: C901 use_chat_template: bool, ) -> Tuple[dict[RequestType, list[Request]], dict[TaskExampleId, Doc]]: """ - Takes a task dict and a fewshot dict and returns a dict of requests, a dict of docs, and a dict of requests origins. - The construction of prompts and thus the managing of few shots is done here. + Takes a task dict and a fewshot dict and returns a dict of requests, a dict + of docs, and a dict of requests origins. The construction of prompts and + thus the managing of few shots is done here. Args: - task_dict (_type_): _description_ - fewshot_dict (_type_): _description_ - num_fewshot_seeds (_type_): _description_ - lm (_type_): _description_ - max_samples (_type_): _description_ - evaluation_tracker (_type_): _description_ + task_dict (dict[str, LightevalTask]): A dictionary of tasks. + fewshot_dict (dict[str, list[Tuple[int, bool]]]): A dictionary of few + shot examples. + num_fewshot_seeds (int): The number of few shot seeds. + lm (BaseModel): The language model. + max_samples (int): The maximum number of samples. + evaluation_tracker (EvaluationTracker): The evaluation tracker. + use_chat_template (bool): Whether to use the chat template. Raises: - RuntimeError: _description_ + NotImplementedError: If the request type is not implemented for the + task. Returns: - _type_: _description_ + Tuple[dict[RequestType, list[Request]], dict[TaskExampleId, Doc]]: A + tuple containing the requests and the documents. 
""" docs: dict[TaskExampleId, Doc] = {} requests: dict[RequestType, list[Request]] = collections.defaultdict(list) From ef4a70224f0476454dcaea3e4c06bd9553c1bf13 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Tue, 6 Feb 2024 12:54:25 +0100 Subject: [PATCH 09/13] Update src/lighteval/models/model_config.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- src/lighteval/models/model_config.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index cbc87655..44869a40 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -77,9 +77,8 @@ class BaseModelConfig: batch_size (int): The batch size for model training. max_gen_toks (Optional[int]): The maximum number of tokens to generate. max_length (Optional[int]): The maximum length of the generated output. - add_special_tokens (bool, optional, defaults to True): - Whether to add special tokens to the input sequences. If `None`, the - default value will be set to `True` for seq2seq models (e.g. T5) and + add_special_tokens (bool, optional, defaults to True): Whether to add special tokens to the input sequences. + If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and `False` for causal models. model_parallel (Optional[bool]): Whether to use model parallelism. dtype (Optional[Union[str, torch.dtype]]): The data type of the model. From ba186e66526406d90268c88a48f123464ab31bd4 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Tue, 6 Feb 2024 12:58:03 +0100 Subject: [PATCH 10/13] Update src/lighteval/models/model_config.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- src/lighteval/models/model_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index 44869a40..c69ca6c2 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -207,7 +207,7 @@ def create_model_config(args: Namespace, accelerator: Union[Accelerator, None]) Raises: ValueError: If both an inference server address and model arguments are provided. - ValueError: If both multichoice continuations start with a space and do not start with a space. + ValueError: If multichoice continuations both should start with a space and should not start with a space. ValueError: If a base model is not specified when using delta weights or adapter weights. ValueError: If a base model is specified when not using delta weights or adapter weights. 
""" From 78442251b5b6a13af34aa84266c901e26defdfc6 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 6 Feb 2024 12:09:36 +0000 Subject: [PATCH 11/13] update from review --- src/lighteval/logging/info_loggers.py | 14 +++--- src/lighteval/models/model_config.py | 38 ++++++++------- src/lighteval/tasks/lighteval_task.py | 68 ++++++++++++++------------- src/lighteval/tasks/registry.py | 14 +++--- 4 files changed, 70 insertions(+), 64 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index b59614cf..db58c11a 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -85,10 +85,10 @@ def log_args_info( Logs the information about the arguments passed to the method. Args: - num_fewshot_seeds (int): The number of few-shot seeds. - override_batch_size (Union[None, int]): The overridden batch size. - max_samples (Union[None, int]): The maximum number of samples. - job_id (str): The job ID. + num_fewshot_seeds (int): number of few-shot seeds. + override_batch_size (Union[None, int]): overridden batch size. + max_samples (Union[None, int]): maximum number of samples, if None, use all the samples available. + job_id (str): job ID. config (optional): BrrrConfig Returns: @@ -105,7 +105,7 @@ def log_model_info(self, model_info: ModelInfo) -> None: Logs the model information. Args: - model_info (ModelInfo): The model information to be logged. + model_info (ModelInfo): model information to be logged. """ self.model_name = model_info.model_name self.model_sha = model_info.model_sha @@ -187,7 +187,7 @@ class CompiledDetail: padded (int): Total umber of samples which needed padding during the batching step for the current task. non_padded (int): Total number of samples which did not need padding during the batching step for the current task. effective_few_shots (float): Average effective few shots across all samples for the current task. - The effective few shot is the number of few shots actually used to fit the prompt in the model context + effective few shot is the number of few shots actually used to fit the prompt in the model context length while allowing model generation of the expected size. num_truncated_few_shots (int): Total number of samples which required truncated prompts to fit the model size for the current task. """ @@ -213,7 +213,7 @@ class CompiledDetailOverAllTasks: padded (int): Number of samples which needed padding during the batching step across all tasks. non_padded (int): Number of samples which did not need padding during the batching step across all tasks. effective_few_shots (float): Average effective few shots across all samples across all tasks. - The effective few shot is the number of few shots actually used to fit the prompt in the model context + effective few shot is the number of few shots actually used to fit the prompt in the model context length while allowing model generation of the expected size. num_truncated_few_shots (int): Number of samples which required truncated prompts to fit the model size across all tasks. """ diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index cbc87655..f9838706 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -28,8 +28,8 @@ class EnvConfig: Configuration class for environment settings. Attributes: - cache_dir (str): The directory for caching data. - token (str): The authentication token used for accessing the HuggingFace Hub. 
+ cache_dir (str): directory for caching data. + token (str): authentication token used for accessing the HuggingFace Hub. """ cache_dir: str = None @@ -37,7 +37,7 @@ class EnvConfig: """Args: pretrained (str): - The HuggingFace Hub model ID name or the path to a pre-trained + HuggingFace Hub model ID name or the path to a pre-trained model to load. This is effectively the `pretrained_model_name_or_path` argument of `from_pretrained` in the HuggingFace `transformers` API. add_special_tokens (bool, optional, defaults to True): @@ -63,29 +63,31 @@ class BaseModelConfig: Base configuration class for models. Attributes: - pretrained (str): The HuggingFace Hub model ID name or the path to a + pretrained (str): HuggingFace Hub model ID name or the path to a pre-trained model to load. This is effectively the `pretrained_model_name_or_path` argument of `from_pretrained` in the HuggingFace `transformers` API. - accelerator (Accelerator): The accelerator to use for model training. - tokenizer (Optional[str]): The HuggingFace Hub tokenizer ID that will be + accelerator (Accelerator): accelerator to use for model training. + tokenizer (Optional[str]): HuggingFace Hub tokenizer ID that will be used for tokenization. multichoice_continuations_start_space (Optional[bool]): Whether to add a space at the start of each continuation in multichoice generation. - subfolder (Optional[str]): The subfolder within the model repository. - revision (str): The revision of the model. - batch_size (int): The batch size for model training. - max_gen_toks (Optional[int]): The maximum number of tokens to generate. - max_length (Optional[int]): The maximum length of the generated output. + For example, context: "What is the capital of France?" and choices: "Paris", "London". + Will be tokenized as: "What is the capital of France? Paris" and "What is the capital of France? London". + subfolder (Optional[str]): Tsubfolder within the model repository. + revision (str): revision of the model. + batch_size (int): batch size for model training. + max_gen_toks (Optional[int]): maximum number of tokens to generate. + max_length (Optional[int]): maximum length of the generated output. add_special_tokens (bool, optional, defaults to True): Whether to add special tokens to the input sequences. If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and `False` for causal models. model_parallel (Optional[bool]): Whether to use model parallelism. - dtype (Optional[Union[str, torch.dtype]]): The data type of the model. - device (Union[int, str]): The device to use for model training. - quantization_config (Optional[BitsAndBytesConfig]): The quantization - configuration for the model. + dtype (Optional[Union[str, torch.dtype]]): data type of the model. + device (Union[int, str]): device to use for model training. + quantization_config (Optional[BitsAndBytesConfig]): quantization + configuration for the model. Needed for 4-bit and 8-bit precision. load_in_8bit (bool): Whether to load the model in 8-bit precision. load_in_4bit (bool): Whether to load the model in 4-bit precision. trust_remote_code (bool): Whether to trust remote code during model @@ -200,11 +202,11 @@ def create_model_config(args: Namespace, accelerator: Union[Accelerator, None]) Create a model configuration based on the provided arguments. Args: - args (Namespace): The command-line arguments. - accelerator (Union[Accelerator, None]): The accelerator to use for model training. + args (Namespace): command-line arguments. 
+        accelerator (Union[Accelerator, None]): accelerator to use for model training.

     Returns:
         BaseModelConfig: model configuration.

     Raises:
         ValueError: If both an inference server address and model arguments are provided.
         ValueError: If multichoice continuations both should start with a space and should not start with a space.
         ValueError: If a base model is not specified when using delta weights or adapter weights.
         ValueError: If a base model is specified when not using delta weights or adapter weights.
     """

diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 17dbc024..e16963a9 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -45,10 +45,10 @@
         Initialize a LightEval task.

         Args:
-            name (str): The name of the task.
-            cfg (dict): The configuration dictionary containing
+            name (str): name of the task.
+            cfg (dict): configuration dictionary containing
                 task-specific settings (from the task_table.json file).
-            cache_dir (Optional[str], optional): The directory to cache the
+            cache_dir (Optional[str], optional): directory to cache the
                 dataset. Defaults to None.
             custom_tasks_module (ModuleType, optional): A custom module
                 containing task-specific functions. Defaults to None.
@@ -125,10 +125,11 @@ def doc_to_text_without_instructions(self, doc: Doc) -> str:
         document has instructions, it removes them from the query.

         Args:
-            doc (Doc): The document.
+            doc (Doc): document class, containing the query and the
+                instructions.

         Returns:
-            str: The query of the document without the instructions.
+            str: Query of the document without the instructions.
         """
         if doc.instruction is not None:
@@ -143,7 +144,7 @@ def doc_to_text_and_instructions(self, doc: Doc) -> Tuple[str, str]:
         an empty string.

         Args:
-            doc (Doc): The document.
+            doc (Doc): document, containing the query and the instructions.

         Returns:
             Tuple[str, str]: A tuple with the query of the document and the
@@ -162,11 +163,11 @@ def get_first_possible_fewshot_splits(self, number_of_splits: int = 1) -> list[str]:
         available.

         Args:
-            number_of_splits (int, optional): The number of splits to return.
+            number_of_splits (int, optional): Number of splits to return.
                 Defaults to 1.

         Returns:
-            list[str]: The list of the first available fewshot splits.
+            list[str]: List of the first available fewshot splits.
         """
         # Possible few shot splits are the available splits not used for evaluation
         possible_fewshot_splits = [k for k in self.all_available_splits if k not in self.evaluation_split]
@@ -185,24 +186,24 @@
         hlog_warn(f"Careful, the task {self.name} is using evaluation data to build the few shot examples.")
         return None

-    def _get_docs_from_split(self, keys, few_shots=False) -> list[Doc]:
+    def _get_docs_from_split(self, splits: list[str], few_shots=False) -> list[Doc]:
         """
         Get the documents from the dataset for the given keys (splits).

         Args:
-            keys (list): The list of keys (splits).
+            splits (list[str]): List of splits (e.g. ["train", "dev"]).
             few_shots (bool, optional): Whether the documents are used for few
                 shot examples. Defaults to False.

         Returns:
-            list[Doc]: The list of documents.
+            list[Doc]: List of documents.
         """
         if self.dataset is None:
             self.dataset = download_dataset_worker((self.dataset_path, self.dataset_config_name))

         docs = []
-        for key in keys:
-            for item in self.dataset[key]:
+        for split in splits:
+            for item in self.dataset[split]:
                 # Some tasks formatting is applied differently when the document is used for fewshot examples
                 # vs when it's used for the actual prompt. That's why we store whether we are currently using the
                 # doc for a fewshot sample (few_shots=True) or not, which then leads to the creation of a different Doc.
@@ -216,7 +217,8 @@ def fewshot_docs(self) -> list[Doc]:
         available, it gets them from the few shot split or the evaluation split.

         Returns:
-            list[Doc]: The few shot documents.
+            list[Doc]: Documents that will be used for few shot examples. One
+                document = one few shot example.
         """
         if self._fewshot_docs is None:
             self._fewshot_docs = []
@@ -233,7 +235,7 @@ def eval_docs(self) -> list[Doc]:
         Returns the evaluation documents.

         Returns:
-            list[Doc]: The evaluation documents.
+            list[Doc]: Evaluation documents.
         """
         if self._docs is None:
             self._docs = self._get_docs_from_split(self.evaluation_split)
@@ -244,12 +246,12 @@ def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str:
         Returns the target of the given document.

         Args:
-            formatted_doc (Doc): The formatted document.
+            formatted_doc (Doc): Formatted document.
             few_shot (bool, optional): Whether the document is used for few
                 shot examples. Defaults to False.

         Returns:
-            str: The target of the document.
+            str: Target of the document, which is the correct answer for the document.
         """
         if few_shot:
             if formatted_doc.target_for_fewshot_sorting is not None:
                 return formatted_doc.target_for_fewshot_sorting
@@ -264,7 +266,7 @@ def get_request_type(self) -> list[RequestType]:
         Returns the request types for the task.

         Returns:
-            list[RequestType]: The request types for the task.
+            list[RequestType]: Request types for the task.

         Raises:
             NotImplementedError: If the request type is not implemented for the
@@ -296,13 +298,13 @@ def construct_requests(
         Constructs a list of requests from the task based on the given parameters.

         Args:
-            formatted_doc (Doc): The formatted document almost straight from the dataset.
-            ctx (str): The context, which is the few shot examples + the query.
-            document_id_seed (str): The index of the document in the task appended with the seed used for the few shot sampling.
-            current_task_name (str): The name of the current task.
+            formatted_doc (Doc): Formatted document almost straight from the dataset.
+            context (str): Context, which is the few shot examples + the query.
+            document_id_seed (str): Index of the document in the task appended with the seed used for the few shot sampling.
+            current_task_name (str): Name of the current task.

         Returns:
-            dict[RequestType, List[Request]]: The list of requests.
+            dict[RequestType, List[Request]]: List of requests.
         """
         requests = {type: [] for type in RequestType}
@@ -370,14 +372,14 @@
     def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dict[str, float]:
         """
-        Processes the results of the task. and stores them in the output dict.
+        Processes the results of the task and stores them in the output dict.

         Args:
-            formatted_doc (Doc): The formatted document of the task.
-            results (list[ModelReturn]): The results of the task, returned by the model class after evaluation.
+            formatted_doc (Doc): formatted document of the task.
+            results (list[ModelReturn]): results of the task, returned by the model class after evaluation.

         Returns:
-            dict[str, float]: The output dictionary containing the results of the task.
+            dict[str, float]: output dictionary containing the results of the task.
         """
         # Metrics management is done in metrics.__init__
         outputs = {}
@@ -428,7 +430,7 @@ def load_datasets(tasks: list["LightevalTask"], dataset_loading_processes: int = 1) -> None:

         Args:
             tasks (list): A list of tasks.
-            dataset_loading_processes (int, optional): The number of processes to use for dataset loading. Defaults to 1.
+            dataset_loading_processes (int, optional): number of processes to use for dataset loading. Defaults to 1.

         Returns:
             None
@@ -482,10 +484,12 @@ def create_requests_from_tasks(  # noqa: C901
         task_dict (dict[str, LightevalTask]): A dictionary of tasks.
         fewshot_dict (dict[str, list[Tuple[int, bool]]]): A dictionary of few
             shot examples.
-        num_fewshot_seeds (int): The number of few shot seeds.
-        lm (BaseModel): The language model.
-        max_samples (int): The maximum number of samples.
-        evaluation_tracker (EvaluationTracker): The evaluation tracker.
+        num_fewshot_seeds (int): number of few shot seeds.
+        lm (BaseModel): language model class that will be used to eventually
+            truncate the few shot examples (we need the maximum input size of the
+            model).
+        max_samples (int): maximum number of samples.
+        evaluation_tracker (EvaluationTracker): evaluation tracker.
         use_chat_template (bool): Whether to use the chat template.

     Raises:

diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index ff3caa46..f662bf5a 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -31,10 +31,10 @@ def __init__(self, cache_dir: str):
         Initialize the Registry class.

         Args:
-            cache_dir (str): The directory path for caching.
+            cache_dir (str): Directory path for caching.

         Attributes:
-            cache_dir (str): The directory path for caching.
+            cache_dir (str): Directory path for caching.
             TASK_REGISTRY (dict[str, LightevalTask]): A dictionary containing the registered tasks.
         """
         self.cache_dir: str = cache_dir
@@ -47,11 +47,11 @@ def get_task_class(
         Get the task class based on the task name.

         Args:
-            task_name (str): The name of the task.
+            task_name (str): Name of the task.
             custom_tasks_registry (Optional[dict[str, LightevalTask]]): A dictionary containing custom tasks.

         Returns:
-            LightevalTask: The task class.
+            LightevalTask: Task class.

         Raises:
             ValueError: If the task is not found in the task registry or custom task registry.
@@ -75,7 +75,7 @@ def get_task_dict(
         Args:
             task_name_list (List[str]): A list of task names.
-            custom_tasks_file (Optional[str]): The path to the custom tasks file.
+            custom_tasks_file (Optional[str]): Path to the custom tasks file.

         Returns:
             Dict[str, LightevalTask]: A dictionary containing the tasks.
@@ -165,9 +165,9 @@ def create_config_tasks(
     Create configuration tasks based on the provided meta_table.

     Args:
-        meta_table (Optional[Dataset]): The meta_table containing task
+        meta_table (Optional[Dataset]): meta_table containing task
            configurations. If not provided, it will be loaded from TABLE_PATH.
-        cache_dir (Optional[str]): The directory to store cached data. If not
+        cache_dir (Optional[str]): Directory to store cached data. If not
            provided, the default cache directory will be used.

     Returns:

From 2975ee237621c5be851f6d14458325a1fda3ef9a Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Tue, 6 Feb 2024 12:10:41 +0000
Subject: [PATCH 12/13] make style

---
 src/lighteval/models/model_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py
index 54cf74a4..75f13f94 100644
--- a/src/lighteval/models/model_config.py
+++ b/src/lighteval/models/model_config.py
@@ -53,7 +53,7 @@ class EnvConfig:
     dtype (Union[str, torch.dtype], optional, defaults to None):):
         Converts the model weights to `dtype`, if specified.
Strings get converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`). - Use `dtype="auto"` to derive the type from the model’s weights. + Use `dtype="auto"` to derive the type from the model's weights. """ @@ -79,7 +79,7 @@ class BaseModelConfig: batch_size (int): The batch size for model training. max_gen_toks (Optional[int]): The maximum number of tokens to generate. max_length (Optional[int]): The maximum length of the generated output. - add_special_tokens (bool, optional, defaults to True): Whether to add special tokens to the input sequences. + add_special_tokens (bool, optional, defaults to True): Whether to add special tokens to the input sequences. If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and `False` for causal models. model_parallel (Optional[bool]): Whether to use model parallelism. From c94b8b5f1ceaa279731b663475f9a0121b3a2a89 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 6 Feb 2024 12:40:14 +0000 Subject: [PATCH 13/13] fix doc to match google style --- src/lighteval/logging/evaluation_tracker.py | 5 ++++- src/lighteval/logging/info_loggers.py | 20 ++++++++++++++++---- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index a14767c6..0515af46 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -90,7 +90,8 @@ def save( ) -> None: """Saves the experiment information and results to files, and to the hub if requested. - Note: In case of save failure, this function will only print a warning, with the error message. + Note: + In case of save failure, this function will only print a warning, with the error message. Args: output_dir (str): Local folder path where you want results to be saved @@ -215,6 +216,7 @@ def details_to_hub( details_folder_path (str or Path): Local path of the current's experiment details folder. The details folder (created by [`EvaluationTracker.save`]) should contain one parquet file per task used during the evaluation run of the current model. push_as_public (bool, optional): If True, the results will be pushed publicly, else the datasets will be private. + """ results_file_path = str(results_file_path) details_folder_path = str(details_folder_path) @@ -266,6 +268,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: Args: repo_id (str): Details dataset repository path on the hub (`org/dataset`) model_name (str): Name of the currently evaluated model. + """ # Add a nice dataset card and the configuration YAML files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset") diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index db58c11a..38d4d7ab 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -39,7 +39,7 @@ class GeneralConfigLogger: job_id (int): If the evaluation suite is launched as a slurm job, stores the current job id. Purely informative parameter used to retrieve scheduler logs. start_time (float): Start time of the experiment. Logged at class init. - end_time (float): Start time of the experiment. Logged when calling [`GeneralConfigLogger.log_end_time`] + end_time (float): End time of the experiment. Logged when calling [`GeneralConfigLogger.log_end_time`] total_evaluation_time_secondes (str): Inferred total evaluation time in seconds (from the start and end times). 
model_name (str): Name of the currently evaluated model. model_sha (str): Commit hash of the currently evaluated model on the hub if available. @@ -87,12 +87,15 @@ def log_args_info( Args: num_fewshot_seeds (int): number of few-shot seeds. override_batch_size (Union[None, int]): overridden batch size. + If strictly positive, its value is used as the batch size for all experiments. + Else, the batch size is automatically inferred depending on what fits in memory. max_samples (Union[None, int]): maximum number of samples, if None, use all the samples available. - job_id (str): job ID. + job_id (str): job ID, used to retrieve logs. config (optional): BrrrConfig Returns: None + """ self.num_fewshot_seeds = num_fewshot_seeds self.override_batch_size = override_batch_size @@ -105,7 +108,8 @@ def log_model_info(self, model_info: ModelInfo) -> None: Logs the model information. Args: - model_info (ModelInfo): model information to be logged. + model_info (ModelInfo): Model information to be logged. + """ self.model_name = model_info.model_name self.model_sha = model_info.model_sha @@ -129,6 +133,7 @@ class DetailsLogger: Example: winogrande: [sample1_details, sample2_details, ...] compiled_details (dict[str, `CompiledDetail`]): : Maps each task name to the list of its samples' compiled details. compiled_details_over_all_tasks (CompiledDetailOverAllTasks): Aggregated details over all the tasks. + """ @dataclass() @@ -156,6 +161,7 @@ class Detail: choices (list): List of the possible choices (for multichoice/loglikelihood evaluations) gold_index (list): Indices of the gold targets among the [`choices`] metrics (dict): Metric name to current example score + """ example: str = "" @@ -190,6 +196,7 @@ class CompiledDetail: effective few shot is the number of few shots actually used to fit the prompt in the model context length while allowing model generation of the expected size. num_truncated_few_shots (int): Total number of samples which required truncated prompts to fit the model size for the current task. + """ hashes: dict = field(default_factory=dict) @@ -216,6 +223,7 @@ class CompiledDetailOverAllTasks: effective few shot is the number of few shots actually used to fit the prompt in the model context length while allowing model generation of the expected size. num_truncated_few_shots (int): Number of samples which required truncated prompts to fit the model size across all tasks. + """ hashes: dict = field(default_factory=dict) @@ -415,7 +423,8 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = Args: task_dict (dict[str, LightevalTask]): used to determine what aggregation function to use for each metric - bootstrap_iters (int, optional): _description_. Defaults to 1000. + bootstrap_iters (int, optional): Number of runs used to run the statistical bootstrap. Defaults to 1000. + """ for task_name, metrics in self.metrics_values.items(): @@ -467,6 +476,7 @@ class VersionsLogger: Attributes: version (dict[str, int]): Maps the task names with the task versions. + """ # the versions dict will be a dict of task_name: task_version @@ -482,6 +492,7 @@ class TaskConfigLogger: Attributes: tasks_config (dict[str, TaskConfig]): Maps each task to its associated [`TaskConfig`] + """ @dataclass @@ -506,6 +517,7 @@ class TaskConfig: truncated_num_docs (bool): Whether less than the total number of documents were used output_regex (str) frozen (bool) + """ name: str
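
Note for readers skimming the series: the helpers documented in PATCH 01 are small enough to try standalone. The sketch below mirrors the three utils.py functions as they appear in the diff above; the `v.item()` numpy conversion and the demo calls under `__main__` are assumptions on my part, since that part of the hunk is truncated in the patch.

from typing import Any, Union

import numpy as np


def sanitize_numpy(example_dict: dict) -> dict:
    # Convert numpy generic values (e.g. np.float32) to plain Python scalars.
    output_dict = {}
    for k, v in example_dict.items():
        if isinstance(v, np.generic):
            output_dict[k] = v.item()  # assumed conversion; not shown in the hunk
        else:
            output_dict[k] = v
    return output_dict


def as_list(item: Union[list, tuple, Any]) -> list:
    # Lists pass through, tuples are converted, anything else is wrapped.
    if isinstance(item, list):
        return item
    elif isinstance(item, tuple):
        return list(item)
    return [item]


def flatten(item: list[Union[list, str]]) -> list[str]:
    # Flatten one level of nesting: sub-lists are spliced in, strings kept as-is.
    flat_item = []
    for sub_item in item:
        flat_item.extend(sub_item) if isinstance(sub_item, list) else flat_item.append(sub_item)
    return flat_item


if __name__ == "__main__":
    print(sanitize_numpy({"acc": np.float32(0.5)}))  # {'acc': 0.5}
    print(as_list(("a", "b")))                       # ['a', 'b']
    print(flatten(["a", ["b", "c"], "d"]))           # ['a', 'b', 'c', 'd']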