From c5d9090ce4aeb44431a5ab3d2b4677d1bda3b7a9 Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Tue, 30 Jan 2024 13:14:08 +0000
Subject: [PATCH 01/13] add doc to utils.py

---
 src/lighteval/utils.py | 54 +++++++++++++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py
index f8ec0665..246510fe 100644
--- a/src/lighteval/utils.py
+++ b/src/lighteval/utils.py
@@ -12,12 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import importlib
-from typing import Union
+from typing import Any, Union

 import numpy as np


-def sanitize_numpy(example_dict):
+def sanitize_numpy(example_dict: dict) -> dict:
+    """
    Sanitizes a dictionary by converting any numpy generic types to their corresponding Python types.

    Args:
        example_dict (dict): The dictionary to be sanitized.

    Returns:
        dict: The sanitized dictionary with numpy generic types converted to Python types.
    """
    output_dict = {}
    for k, v in example_dict.items():
        if isinstance(v, np.generic):
@@ -27,7 +36,21 @@
    return output_dict


-def as_list(item):
+def as_list(item: Union[list, tuple, Any]) -> list:
+    """
    Convert the given item into a list.

    If the item is already a list, it is returned as is.
    If the item is a tuple, it is converted into a list.
    Otherwise, the item is wrapped in a list.

    Args:
        item (Union[list, tuple, Any]): The item to be converted.

    Returns:
        list: The converted list.

    """
    if isinstance(item, list):
        return item
    elif isinstance(item, tuple):
@@ -35,53 +58,62 @@
    return [item]


-def flatten(item: list[Union[list, str]]):
+def flatten(item: list[Union[list, str]]) -> list[str]:
+    """
    Flattens a nested list of strings into a single flat list.

    Args:
        item (list[Union[list, str]]): The nested list to be flattened.

    Returns:
        list[str]: The flattened list of strings.
    """
    flat_item = []
    for sub_item in item:
        flat_item.extend(sub_item) if isinstance(sub_item, list) else flat_item.append(sub_item)
    return flat_item


-def is_accelerate_available():
+def is_accelerate_available() -> bool:
    return importlib.util.find_spec("accelerate") is not None


NO_ACCELERATE_ERROR_MSG = "You requested the use of accelerate for this evaluation, but it is not available in your current environment. Please install it using pip."


-def is_tgi_available():
+def is_tgi_available() -> bool:
    return importlib.util.find_spec("text-generation") is not None


NO_TGI_ERROR_MSG = "You are trying to start a text generation inference endpoint, but text-generation is not present in your local environment. Please install it using pip."


-def is_nanotron_available():
+def is_nanotron_available() -> bool:
    return importlib.util.find_spec("nanotron") is not None


NO_NANOTRON_ERROR_MSG = "You requested the use of nanotron for this evaluation, but it is not available in your current environment. Please install it using pip."


-def is_optimum_available():
+def is_optimum_available() -> bool:
    return importlib.util.find_spec("optimum") is not None


-def is_bnb_available():
+def is_bnb_available() -> bool:
    return importlib.util.find_spec("bitsandbytes") is not None


NO_BNB_ERROR_MSG = "You are trying to load a model quantized with `bitsandbytes`, which is not available in your local environment. Please install it using pip."
-def is_autogptq_available():
+def is_autogptq_available() -> bool:
    return importlib.util.find_spec("auto-gptq") is not None


NO_AUTOGPTQ_ERROR_MSG = "You are trying to load a model quantized with `auto-gptq`, which is not available in your local environment. Please install it using pip."


-def is_peft_available():
+def is_peft_available() -> bool:
    return importlib.util.find_spec("peft") is not None

From 9b47974b920a4946133fe7cbe0598b459e3a9470 Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Tue, 30 Jan 2024 13:59:47 +0000
Subject: [PATCH 02/13] add doc and typing

---
 src/lighteval/logging/evaluation_tracker.py | 27 +++++++---
 src/lighteval/logging/info_loggers.py       | 23 +++++++-
 src/lighteval/models/model_config.py        | 60 ++++++++++++++++++---
 3 files changed, 95 insertions(+), 15 deletions(-)

diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 3d36d76c..a14767c6 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -28,7 +28,8 @@
 class EnhancedJSONEncoder(json.JSONEncoder):
-    """Provides a proper json encoding for the loggers and trackers json dumps.
+    """
+    Provides a proper json encoding for the loggers and trackers json dumps.

    Notably manages the json encoding of dataclasses.
    """

@@ -39,10 +40,16 @@ def default(self, o):

 class EvaluationTracker:
-    """Keeps track of the overall evaluation process and relevant informations.
-
-    The [`EvaluationTracker`] contains specific loggers for experiments details ([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions ([`VersionsLogger`]) as well as for the general configurations of both the specific task ([`TaskConfigLogger`]) and overall evaluation run ([`GeneralConfigLogger`]).
-    It compiles the data from these loggers and writes it to files, which can be published to the Hugging Face hub if requested.
+    """
+    Keeps track of the overall evaluation process and relevant information.
+
+    The [`EvaluationTracker`] contains specific loggers for experiments details
+    ([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions
+    ([`VersionsLogger`]) as well as for the general configurations of both the
+    specific task ([`TaskConfigLogger`]) and overall evaluation run
+    ([`GeneralConfigLogger`]). It compiles the data from these loggers and
+    writes it to files, which can be published to the Hugging Face hub if
+    requested.
    """

    details_logger: DetailsLogger
    metrics_logger: MetricsLogger
    versions_logger: VersionsLogger
    general_config_logger: GeneralConfigLogger
    task_config_logger: TaskConfigLogger
    hub_results_org: str

    def __init__(self, hub_results_org: str = "", token: str = "") -> None:
-        """Creates all the necessary loggers for evaluation tracking.
+        """
+        Creates all the necessary loggers for evaluation tracking.

        Args:
-            hub_results_org (str): The organisation to push the results to. See more details about the datasets organisation in [`EvaluationTracker.save`]
-            token (str): Token to use when pushing to the hub. This token should have write access to `hub_results_org`.
+            hub_results_org (str): The organisation to push the results to. See
+                more details about the datasets organisation in
+                [`EvaluationTracker.save`]
+            token (str): Token to use when pushing to the hub. This token should
+                have write access to `hub_results_org`.
""" self.details_logger = DetailsLogger() self.metrics_logger = MetricsLogger() diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 9fb4249e..36276fc1 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -2,6 +2,7 @@ import os import time from dataclasses import asdict, dataclass, field +from typing import Union import git import numpy as np @@ -72,7 +73,27 @@ def __init__(self) -> None: self.lighteval_sha = repo.git.rev_parse("HEAD") self.start_time = time.perf_counter() - def log_args_info(self, num_fewshot_seeds, override_batch_size, max_samples, job_id, config=None) -> None: + def log_args_info( + self, + num_fewshot_seeds: int, + override_batch_size: Union[None, int], + max_samples: Union[None, int], + job_id: str, + config: "BrrrConfig" = None, + ) -> None: + """ + Logs the information about the arguments passed to the method. + + Args: + num_fewshot_seeds (int): The number of few-shot seeds. + override_batch_size (Union[None, int]): The overridden batch size. + max_samples (Union[None, int]): The maximum number of samples. + job_id (str): The job ID. + config (optional): BrrrConfig + + Returns: + None + """ self.num_fewshot_seeds = num_fewshot_seeds self.override_batch_size = override_batch_size self.max_samples = max_samples diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index abe95bc2..a5e58e03 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -2,7 +2,7 @@ from typing import Optional, Union import torch -from transformers import AutoConfig, BitsAndBytesConfig, GPTQConfig +from transformers import AutoConfig, BitsAndBytesConfig, GPTQConfig, PretrainedConfig from lighteval.logging.hierarchical_logger import hlog from lighteval.models.utils import _get_model_sha @@ -23,12 +23,16 @@ @dataclass class EnvConfig: + """ + Configuration class for environment settings. + + Attributes: + cache_dir (str): The directory for caching data. + token (str): The authentication token used for accessing the HuggingFace Hub. + """ cache_dir: str = None token: str = None - -@dataclass -class BaseModelConfig: """Args: pretrained (str): The HuggingFace Hub model ID name or the path to a pre-trained @@ -50,6 +54,50 @@ class BaseModelConfig: Use `dtype="auto"` to derive the type from the model’s weights. """ + + +@dataclass +class BaseModelConfig: + """ + Base configuration class for models. + + Attributes: + pretrained (str): The HuggingFace Hub model ID name or the path to a + pre-trained model to load. This is effectively the + `pretrained_model_name_or_path` argument of `from_pretrained` in the + HuggingFace `transformers` API. + accelerator (Accelerator): The accelerator to use for model training. + tokenizer (Optional[str]): The HuggingFace Hub tokenizer ID that will be + used for tokenization. + multichoice_continuations_start_space (Optional[bool]): Whether to add a + space at the start of each continuation in multichoice generation. + subfolder (Optional[str]): The subfolder within the model repository. + revision (str): The revision of the model. + batch_size (int): The batch size for model training. + max_gen_toks (Optional[int]): The maximum number of tokens to generate. + max_length (Optional[int]): The maximum length of the generated output. + add_special_tokens (bool, optional, defaults to True): + Whether to add special tokens to the input sequences. 
If `None`, the + default value will be set to `True` for seq2seq models (e.g. T5) and + `False` for causal models. + model_parallel (Optional[bool]): Whether to use model parallelism. + dtype (Optional[Union[str, torch.dtype]]): The data type of the model. + device (Union[int, str]): The device to use for model training. + quantization_config (Optional[BitsAndBytesConfig]): The quantization + configuration for the model. + load_in_8bit (bool): Whether to load the model in 8-bit precision. + load_in_4bit (bool): Whether to load the model in 4-bit precision. + trust_remote_code (bool): Whether to trust remote code during model + loading. + + Methods: + __post_init__(): Performs post-initialization checks on the configuration. + _init_configs(model_name, env_config): Initializes the model configuration. + init_configs(env_config): Initializes the model configuration using the environment configuration. + get_model_sha(): Retrieves the SHA of the model. + + """ + pretrained: str accelerator: "Accelerator" = None tokenizer: Optional[str] = None @@ -77,7 +125,7 @@ def __post_init__(self): if not isinstance(self.device, str): raise ValueError("Current device must be passed as string.") - def _init_configs(self, model_name, env_config: EnvConfig): + def _init_configs(self, model_name: str, env_config: EnvConfig) -> PretrainedConfig: revision = self.revision if self.subfolder: revision = f"{self.revision}/{self.subfolder}" @@ -98,7 +146,7 @@ def _init_configs(self, model_name, env_config: EnvConfig): return auto_config - def init_configs(self, env_config: EnvConfig): + def init_configs(self, env_config: EnvConfig) -> PretrainedConfig: return self._init_configs(self.pretrained, env_config=env_config) def get_model_sha(self): From ce024b09c8bcef8980d783a7a22338f50ff5c94b Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 30 Jan 2024 14:14:17 +0000 Subject: [PATCH 03/13] add doc and typing --- src/lighteval/logging/info_loggers.py | 14 ++++++++++---- src/lighteval/models/model_config.py | 22 +++++++++++++++++++--- src/lighteval/tasks/registry.py | 8 ++++---- src/lighteval/utils_parallelism.py | 10 ++++++++++ 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 36276fc1..1b2d2523 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -101,10 +101,16 @@ def log_args_info( self.config = config def log_model_info(self, model_info: ModelInfo) -> None: - self.model_name = model_info.model_name - self.model_sha = model_info.model_sha - self.model_dtype = model_info.model_dtype - self.model_size = model_info.model_size + """ + Logs the model information. + + Args: + model_info (ModelInfo): The model information to be logged. + """ + self.model_name = model_info.model_name + self.model_sha = model_info.model_sha + self.model_dtype = model_info.model_dtype + self.model_size = model_info.model_size def log_end_time(self) -> None: self.end_time = time.perf_counter() diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index a5e58e03..cbc87655 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -1,3 +1,4 @@ +from argparse import Namespace from dataclasses import dataclass from typing import Optional, Union @@ -30,6 +31,7 @@ class EnvConfig: cache_dir (str): The directory for caching data. token (str): The authentication token used for accessing the HuggingFace Hub. 
""" + cache_dir: str = None token: str = None @@ -55,7 +57,6 @@ class EnvConfig: """ - @dataclass class BaseModelConfig: """ @@ -194,8 +195,23 @@ class TGIModelConfig: inference_server_auth: str -def create_model_config(args, accelerator: Accelerator): # noqa C901 - # Tests +def create_model_config(args: Namespace, accelerator: Union[Accelerator, None]) -> BaseModelConfig: # noqa: C901 + """ + Create a model configuration based on the provided arguments. + + Args: + args (Namespace): The command-line arguments. + accelerator (Union[Accelerator, None]): The accelerator to use for model training. + + Returns: + BaseModelConfig: The model configuration. + + Raises: + ValueError: If both an inference server address and model arguments are provided. + ValueError: If both multichoice continuations start with a space and do not start with a space. + ValueError: If a base model is not specified when using delta weights or adapter weights. + ValueError: If a base model is specified when not using delta weights or adapter weights. + """ if args.inference_server_address is not None and args.model_args is not None: raise ValueError("You cannot both use an inference server and load a model from its checkpoint.") diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 1989584a..d3cb6615 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -71,8 +71,8 @@ def get_custom_tasks(custom_tasks_file: str) -> Tuple[ModuleType, str]: def taskinfo_selector( - tasks: str, few_shot_default: int = 0 -) -> tuple[list[str], dict[str, list[tuple[int, bool]]], dict[str, str]]: + tasks: str, +) -> tuple[list[str], dict[str, list[tuple[int, bool]]]]: """ Selects task information based on the given tasks and description dictionary path. @@ -80,12 +80,12 @@ def taskinfo_selector( tasks (str): A string containing a comma-separated list of tasks in the format "suite|task|few_shot|truncate_few_shots" or a path to a file containing a list of tasks. + few_shot_default (int, optional): The default few_shot value to use if not provided. Returns: - tuple[list[str], dict[str, list[tuple[int, bool]]], dict[str, str]]: A tuple containing: + tuple[list[str], dict[str, list[tuple[int, bool]]]]: A tuple containing: - A sorted list of unique task names in the format "suite|task". - A dictionary mapping each task name to a list of tuples representing the few_shot and truncate_few_shots values. - - A dictionary containing the description dictionary loaded from the given path, or an empty dictionary if no path is provided. """ few_shot_dict = collections.defaultdict(list) diff --git a/src/lighteval/utils_parallelism.py b/src/lighteval/utils_parallelism.py index a009eae9..7c38df46 100644 --- a/src/lighteval/utils_parallelism.py +++ b/src/lighteval/utils_parallelism.py @@ -92,6 +92,16 @@ def decorator(*args, **kwargs): def test_all_gather(accelerator=None, parallel_context=None): + """ + Test the gather operation in a parallel setup. + + Args: + accelerator (Optional): The accelerator object used for parallelism. + parallel_context (Optional): The parallel context object used for parallelism. + + Raises: + ImportError: If the required accelerator or parallel context is not available. 
+ """ if accelerator: if not is_accelerate_available(): raise ImportError(NO_ACCELERATE_ERROR_MSG) From a02306899742f48817980db0a1cba91f5784be99 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 30 Jan 2024 14:32:32 +0000 Subject: [PATCH 04/13] add doc and typing to registry.py --- src/lighteval/tasks/registry.py | 70 +++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index d3cb6615..8b74c00a 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -22,11 +22,36 @@ class Registry: - def __init__(self, cache_dir): - self.cache_dir = cache_dir - self.TASK_REGISTRY = {**create_config_tasks(cache_dir=cache_dir)} - - def get_task_class(self, task_name, custom_tasks_registry=None): + def __init__(self, cache_dir: str): + """ + Initialize the Registry class. + + Args: + cache_dir (str): The directory path for caching. + + Attributes: + cache_dir (str): The directory path for caching. + TASK_REGISTRY (dict[str, LightevalTask]): A dictionary containing the registered tasks. + """ + self.cache_dir: str = cache_dir + self.TASK_REGISTRY: dict[str, LightevalTask] = {**create_config_tasks(cache_dir=cache_dir)} + + def get_task_class( + self, task_name: str, custom_tasks_registry: Optional[dict[str, LightevalTask]] = None + ) -> LightevalTask: + """ + Get the task class based on the task name. + + Args: + task_name (str): The name of the task. + custom_tasks_registry (Optional[dict[str, LightevalTask]]): A dictionary containing custom tasks. + + Returns: + LightevalTask: The task class. + + Raises: + ValueError: If the task is not found in the task registry or custom task registry. + """ if task_name in self.TASK_REGISTRY: return self.TASK_REGISTRY[task_name] elif custom_tasks_registry is not None and task_name in custom_tasks_registry: @@ -41,14 +66,27 @@ def get_task_class(self, task_name, custom_tasks_registry=None): def get_task_dict( self, task_name_list: List[str], custom_tasks_file: Optional[str] = None ) -> Dict[str, LightevalTask]: - ## todo: make clearer + """ + Get a dictionary of tasks based on the task name list. + + Args: + task_name_list (List[str]): A list of task names. + custom_tasks_file (Optional[str]): The path to the custom tasks file. + + Returns: + Dict[str, LightevalTask]: A dictionary containing the tasks. + + Notes: + - If custom_tasks_file is provided, it will import the custom tasks module and create a custom tasks registry. + - Each task in the task_name_list will be instantiated with the corresponding task class. + """ if custom_tasks_file is not None: dataset_module = dataset_module_factory(str(custom_tasks_file)) custom_tasks_module = importlib.import_module(dataset_module.module_path) custom_tasks_registry = create_config_tasks( meta_table=custom_tasks_module.TASKS_TABLE, cache_dir=self.cache_dir ) - print(custom_tasks_registry) + hlog(custom_tasks_registry) else: custom_tasks_module = None custom_tasks_registry = None @@ -80,7 +118,6 @@ def taskinfo_selector( tasks (str): A string containing a comma-separated list of tasks in the format "suite|task|few_shot|truncate_few_shots" or a path to a file containing a list of tasks. - few_shot_default (int, optional): The default few_shot value to use if not provided. 
Returns: tuple[list[str], dict[str, list[tuple[int, bool]]]]: A tuple containing: @@ -117,9 +154,20 @@ def taskinfo_selector( return sorted(few_shot_dict.keys()), {k: list(set(v)) for k, v in few_shot_dict.items()} -def create_config_tasks(meta_table=None, cache_dir: str = None) -> Dict[str, LightevalTask]: - """Creates a dictionary of tasks from a list of subjects - :return: {task_name: task} +def create_config_tasks( + meta_table: Optional[Dataset] = None, cache_dir: Optional[str] = None +) -> Dict[str, LightevalTask]: + """ + Create configuration tasks based on the provided meta_table. + + Args: + meta_table (Optional[Dataset]): The meta_table containing task + configurations. If not provided, it will be loaded from TABLE_PATH. + cache_dir (Optional[str]): The directory to store cached data. If not + provided, the default cache directory will be used. + + Returns: + Dict[str, LightevalTask]: A dictionary of task names mapped to their corresponding LightevalTask classes. """ def create_task(name, cfg, cache_dir): From 561ec9501e50379e67022830a9cdefe1f9f35eac Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 30 Jan 2024 14:33:48 +0000 Subject: [PATCH 05/13] change doc in lighteval_tasks.py --- src/lighteval/tasks/lighteval_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index ff7197fe..39ff3e58 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -324,7 +324,7 @@ def aggregation(self): @staticmethod def load_datasets(tasks: list["LightevalTask"], dataset_loading_processes: int = 1) -> None: """ - Load datasets for the given tasks. + Load datasets from the HuggingFace Hub for the given tasks. Args: tasks (list): A list of tasks. From d34ae2f293b2fd0f9c78a28f06cfb1dc6f09078f Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 30 Jan 2024 14:35:32 +0000 Subject: [PATCH 06/13] make style --- src/lighteval/logging/info_loggers.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 1b2d2523..b59614cf 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -101,16 +101,16 @@ def log_args_info( self.config = config def log_model_info(self, model_info: ModelInfo) -> None: - """ - Logs the model information. - - Args: - model_info (ModelInfo): The model information to be logged. - """ - self.model_name = model_info.model_name - self.model_sha = model_info.model_sha - self.model_dtype = model_info.model_dtype - self.model_size = model_info.model_size + """ + Logs the model information. + + Args: + model_info (ModelInfo): The model information to be logged. 
+        """
+        self.model_name = model_info.model_name
+        self.model_sha = model_info.model_sha
+        self.model_dtype = model_info.model_dtype
+        self.model_size = model_info.model_size

     def log_end_time(self) -> None:
         self.end_time = time.perf_counter()

From 8b36fe0130040d648eecad3f0f0f0dc40d42bbaf Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Tue, 30 Jan 2024 14:51:14 +0000
Subject: [PATCH 07/13] add doc and typing to registry.py

---
 src/lighteval/tasks/registry.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index 8b74c00a..ff3caa46 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -22,6 +22,10 @@

 class Registry:
+    """
+    The Registry class is used to manage the task registry and get task classes.
+    """
+
     def __init__(self, cache_dir: str):

From 4176f1e4da0d6b2449108ee1d62021f1b2d71948 Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Tue, 30 Jan 2024 15:30:43 +0000
Subject: [PATCH 08/13] add doc to lighteval_tasks

---
 src/lighteval/metrics/metrics.py      |   2 +-
 src/lighteval/tasks/lighteval_task.py | 145 ++++++++++++++++++++++----
 2 files changed, 128 insertions(+), 19 deletions(-)

diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index df9af332..318a4599 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -501,7 +501,7 @@ def higher_is_better():
         return res

     @staticmethod
-    def corpus_level_fns():
+    def corpus_level_fns() -> dict[str, callable]:
         res = {}
         for metric in Metrics:
             if metric.value.category == MetricCategory.IGNORED:

diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 39ff3e58..17dbc024 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -40,7 +40,19 @@

 class LightevalTask:
-    def __init__(self, name: str, cfg: dict, cache_dir: str = None, custom_tasks_module=None):
+    def __init__(self, name: str, cfg: dict, cache_dir: Optional[str] = None, custom_tasks_module=None):
+        """
+        Initialize a LightEval task.
+
+        Args:
+            name (str): The name of the task.
+            cfg (dict): The configuration dictionary containing
+                task-specific settings (from the task_table.json file).
+            cache_dir (Optional[str], optional): The directory to cache the
+                dataset. Defaults to None.
+            custom_tasks_module (ModuleType, optional): A custom module
+                containing task-specific functions. Defaults to None.
+        """
         self.name = name
         self.VERSION = 0
         self.is_main_process = False
@@ -108,6 +120,16 @@ def cfg(self):
         return self._cfg

     def doc_to_text_without_instructions(self, doc: Doc) -> str:
+        """
+        Returns the query of the document without the instructions. If the
+        document has instructions, it removes them from the query.
+
+        Args:
+            doc (Doc): The document.
+
+        Returns:
+            str: The query of the document without the instructions.
+        """
         if doc.instruction is not None:
             if not doc.query.startswith(doc.instruction):
                 raise ValueError(f"Prompt query {doc.query} is not starting with instruction {doc.instruction}")
@@ -115,6 +137,18 @@
         return doc.query

     def doc_to_text_and_instructions(self, doc: Doc) -> Tuple[str, str]:
+        """
+        Returns a tuple with the query of the document and the instructions.
+        If the document has no instructions, the second element of the tuple is
+        an empty string.
+
+        Args:
+            doc (Doc): The document.
+ + Returns: + Tuple[str, str]: A tuple with the query of the document and the + instructions. + """ if doc.instruction is not None: if not doc.query.startswith(doc.instruction): raise ValueError(f"Prompt query {doc.query} is not starting with instruction {doc.instruction}") @@ -122,10 +156,17 @@ def doc_to_text_and_instructions(self, doc: Doc) -> Tuple[str, str]: return (doc.query, "") def get_first_possible_fewshot_splits(self, number_of_splits: int = 1) -> list[str]: - """Parses the possible fewshot split keys in order: - train, then validation keys - and matches them with the available keys. - Returns the first available. + """ + Parses the possible fewshot split keys in order: train, then validation + keys and matches them with the available keys. Returns the first + available. + + Args: + number_of_splits (int, optional): The number of splits to return. + Defaults to 1. + + Returns: + list[str]: The list of the first available fewshot splits. """ # Possible few shot splits are the available splits not used for evaluation possible_fewshot_splits = [k for k in self.all_available_splits if k not in self.evaluation_split] @@ -145,6 +186,17 @@ def get_first_possible_fewshot_splits(self, number_of_splits: int = 1) -> list[s return None def _get_docs_from_split(self, keys, few_shots=False) -> list[Doc]: + """ + Get the documents from the dataset for the given keys (splits). + + Args: + keys (list): The list of keys (splits). + few_shots (bool, optional): Whether the documents are used for few + shot examples. Defaults to False. + + Returns: + list[Doc]: The list of documents. + """ if self.dataset is None: self.dataset = download_dataset_worker((self.dataset_path, self.dataset_config_name)) @@ -159,6 +211,13 @@ def _get_docs_from_split(self, keys, few_shots=False) -> list[Doc]: return docs def fewshot_docs(self) -> list[Doc]: + """ + Returns the few shot documents. If the few shot documents are not + available, it gets them from the few shot split or the evaluation split. + + Returns: + list[Doc]: The few shot documents. + """ if self._fewshot_docs is None: self._fewshot_docs = [] @@ -170,11 +229,28 @@ def fewshot_docs(self) -> list[Doc]: return self._fewshot_docs def eval_docs(self) -> list[Doc]: + """ + Returns the evaluation documents. + + Returns: + list[Doc]: The evaluation documents. + """ if self._docs is None: self._docs = self._get_docs_from_split(self.evaluation_split) return self._docs - def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False): + def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str: + """ + Returns the target of the given document. + + Args: + formatted_doc (Doc): The formatted document. + few_shot (bool, optional): Whether the document is used for few + shot examples. Defaults to False. + + Returns: + str: The target of the document. + """ if few_shot: if formatted_doc.target_for_fewshot_sorting is not None: return formatted_doc.target_for_fewshot_sorting @@ -184,6 +260,16 @@ def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False): # Requests def get_request_type(self) -> list[RequestType]: + """ + Returns the request types for the task. + + Returns: + list[RequestType]: The request types for the task. + + Raises: + NotImplementedError: If the request type is not implemented for the + task. 
+ """ request_types = [] if self.has_metric_category[MetricCategory.TARGET_PERPLEXITY]: request_types.append(RequestType.LOGLIKELIHOOD) @@ -207,7 +293,7 @@ def construct_requests( self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str ) -> List[Request]: """ - Constructs a list of requests based on the given parameters. + Constructs a list of requests from the task based on the given parameters. Args: formatted_doc (Doc): The formatted document almost straight from the dataset. @@ -282,7 +368,17 @@ def construct_requests( return requests - def process_results(self, formatted_doc: Doc, results: list[ModelReturn]): + def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dict[str, float]: + """ + Processes the results of the task. and stores them in the output dict. + + Args: + formatted_doc (Doc): The formatted document of the task. + results (list[ModelReturn]): The results of the task, returned by the model class after evaluation. + + Returns: + dict[str, float]: The output dictionary containing the results of the task. + """ # Metrics management is done in metrics.__init__ outputs = {} if self.has_metric_category[MetricCategory.TARGET_PERPLEXITY]: @@ -319,6 +415,10 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]): return outputs def aggregation(self): + """ + Return a dict with metric name and its aggregation function for all + metrics + """ return Metrics.corpus_level_fns() @staticmethod @@ -349,6 +449,10 @@ def load_datasets(tasks: list["LightevalTask"], dataset_loading_processes: int = def download_dataset_worker(args): + """ + Worker function to download a dataset from the HuggingFace Hub. + Used for parallel dataset loading. + """ dataset_path, dataset_config_name = args dataset = load_dataset( path=dataset_path, @@ -370,22 +474,27 @@ def create_requests_from_tasks( # noqa: C901 use_chat_template: bool, ) -> Tuple[dict[RequestType, list[Request]], dict[TaskExampleId, Doc]]: """ - Takes a task dict and a fewshot dict and returns a dict of requests, a dict of docs, and a dict of requests origins. - The construction of prompts and thus the managing of few shots is done here. + Takes a task dict and a fewshot dict and returns a dict of requests, a dict + of docs, and a dict of requests origins. The construction of prompts and + thus the managing of few shots is done here. Args: - task_dict (_type_): _description_ - fewshot_dict (_type_): _description_ - num_fewshot_seeds (_type_): _description_ - lm (_type_): _description_ - max_samples (_type_): _description_ - evaluation_tracker (_type_): _description_ + task_dict (dict[str, LightevalTask]): A dictionary of tasks. + fewshot_dict (dict[str, list[Tuple[int, bool]]]): A dictionary of few + shot examples. + num_fewshot_seeds (int): The number of few shot seeds. + lm (BaseModel): The language model. + max_samples (int): The maximum number of samples. + evaluation_tracker (EvaluationTracker): The evaluation tracker. + use_chat_template (bool): Whether to use the chat template. Raises: - RuntimeError: _description_ + NotImplementedError: If the request type is not implemented for the + task. Returns: - _type_: _description_ + Tuple[dict[RequestType, list[Request]], dict[TaskExampleId, Doc]]: A + tuple containing the requests and the documents. 
""" docs: dict[TaskExampleId, Doc] = {} requests: dict[RequestType, list[Request]] = collections.defaultdict(list) From ef4a70224f0476454dcaea3e4c06bd9553c1bf13 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Tue, 6 Feb 2024 12:54:25 +0100 Subject: [PATCH 09/13] Update src/lighteval/models/model_config.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- src/lighteval/models/model_config.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index cbc87655..44869a40 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -77,9 +77,8 @@ class BaseModelConfig: batch_size (int): The batch size for model training. max_gen_toks (Optional[int]): The maximum number of tokens to generate. max_length (Optional[int]): The maximum length of the generated output. - add_special_tokens (bool, optional, defaults to True): - Whether to add special tokens to the input sequences. If `None`, the - default value will be set to `True` for seq2seq models (e.g. T5) and + add_special_tokens (bool, optional, defaults to True): Whether to add special tokens to the input sequences. + If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and `False` for causal models. model_parallel (Optional[bool]): Whether to use model parallelism. dtype (Optional[Union[str, torch.dtype]]): The data type of the model. From ba186e66526406d90268c88a48f123464ab31bd4 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Tue, 6 Feb 2024 12:58:03 +0100 Subject: [PATCH 10/13] Update src/lighteval/models/model_config.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> --- src/lighteval/models/model_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index 44869a40..c69ca6c2 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -207,7 +207,7 @@ def create_model_config(args: Namespace, accelerator: Union[Accelerator, None]) Raises: ValueError: If both an inference server address and model arguments are provided. - ValueError: If both multichoice continuations start with a space and do not start with a space. + ValueError: If multichoice continuations both should start with a space and should not start with a space. ValueError: If a base model is not specified when using delta weights or adapter weights. ValueError: If a base model is specified when not using delta weights or adapter weights. 
""" From 78442251b5b6a13af34aa84266c901e26defdfc6 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 6 Feb 2024 12:09:36 +0000 Subject: [PATCH 11/13] update from review --- src/lighteval/logging/info_loggers.py | 14 +++--- src/lighteval/models/model_config.py | 38 ++++++++------- src/lighteval/tasks/lighteval_task.py | 68 ++++++++++++++------------- src/lighteval/tasks/registry.py | 14 +++--- 4 files changed, 70 insertions(+), 64 deletions(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index b59614cf..db58c11a 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -85,10 +85,10 @@ def log_args_info( Logs the information about the arguments passed to the method. Args: - num_fewshot_seeds (int): The number of few-shot seeds. - override_batch_size (Union[None, int]): The overridden batch size. - max_samples (Union[None, int]): The maximum number of samples. - job_id (str): The job ID. + num_fewshot_seeds (int): number of few-shot seeds. + override_batch_size (Union[None, int]): overridden batch size. + max_samples (Union[None, int]): maximum number of samples, if None, use all the samples available. + job_id (str): job ID. config (optional): BrrrConfig Returns: @@ -105,7 +105,7 @@ def log_model_info(self, model_info: ModelInfo) -> None: Logs the model information. Args: - model_info (ModelInfo): The model information to be logged. + model_info (ModelInfo): model information to be logged. """ self.model_name = model_info.model_name self.model_sha = model_info.model_sha @@ -187,7 +187,7 @@ class CompiledDetail: padded (int): Total umber of samples which needed padding during the batching step for the current task. non_padded (int): Total number of samples which did not need padding during the batching step for the current task. effective_few_shots (float): Average effective few shots across all samples for the current task. - The effective few shot is the number of few shots actually used to fit the prompt in the model context + effective few shot is the number of few shots actually used to fit the prompt in the model context length while allowing model generation of the expected size. num_truncated_few_shots (int): Total number of samples which required truncated prompts to fit the model size for the current task. """ @@ -213,7 +213,7 @@ class CompiledDetailOverAllTasks: padded (int): Number of samples which needed padding during the batching step across all tasks. non_padded (int): Number of samples which did not need padding during the batching step across all tasks. effective_few_shots (float): Average effective few shots across all samples across all tasks. - The effective few shot is the number of few shots actually used to fit the prompt in the model context + effective few shot is the number of few shots actually used to fit the prompt in the model context length while allowing model generation of the expected size. num_truncated_few_shots (int): Number of samples which required truncated prompts to fit the model size across all tasks. """ diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index cbc87655..f9838706 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -28,8 +28,8 @@ class EnvConfig: Configuration class for environment settings. Attributes: - cache_dir (str): The directory for caching data. - token (str): The authentication token used for accessing the HuggingFace Hub. 
+ cache_dir (str): directory for caching data. + token (str): authentication token used for accessing the HuggingFace Hub. """ cache_dir: str = None @@ -37,7 +37,7 @@ class EnvConfig: """Args: pretrained (str): - The HuggingFace Hub model ID name or the path to a pre-trained + HuggingFace Hub model ID name or the path to a pre-trained model to load. This is effectively the `pretrained_model_name_or_path` argument of `from_pretrained` in the HuggingFace `transformers` API. add_special_tokens (bool, optional, defaults to True): @@ -63,29 +63,31 @@ class BaseModelConfig: Base configuration class for models. Attributes: - pretrained (str): The HuggingFace Hub model ID name or the path to a + pretrained (str): HuggingFace Hub model ID name or the path to a pre-trained model to load. This is effectively the `pretrained_model_name_or_path` argument of `from_pretrained` in the HuggingFace `transformers` API. - accelerator (Accelerator): The accelerator to use for model training. - tokenizer (Optional[str]): The HuggingFace Hub tokenizer ID that will be + accelerator (Accelerator): accelerator to use for model training. + tokenizer (Optional[str]): HuggingFace Hub tokenizer ID that will be used for tokenization. multichoice_continuations_start_space (Optional[bool]): Whether to add a space at the start of each continuation in multichoice generation. - subfolder (Optional[str]): The subfolder within the model repository. - revision (str): The revision of the model. - batch_size (int): The batch size for model training. - max_gen_toks (Optional[int]): The maximum number of tokens to generate. - max_length (Optional[int]): The maximum length of the generated output. + For example, context: "What is the capital of France?" and choices: "Paris", "London". + Will be tokenized as: "What is the capital of France? Paris" and "What is the capital of France? London". + subfolder (Optional[str]): Tsubfolder within the model repository. + revision (str): revision of the model. + batch_size (int): batch size for model training. + max_gen_toks (Optional[int]): maximum number of tokens to generate. + max_length (Optional[int]): maximum length of the generated output. add_special_tokens (bool, optional, defaults to True): Whether to add special tokens to the input sequences. If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and `False` for causal models. model_parallel (Optional[bool]): Whether to use model parallelism. - dtype (Optional[Union[str, torch.dtype]]): The data type of the model. - device (Union[int, str]): The device to use for model training. - quantization_config (Optional[BitsAndBytesConfig]): The quantization - configuration for the model. + dtype (Optional[Union[str, torch.dtype]]): data type of the model. + device (Union[int, str]): device to use for model training. + quantization_config (Optional[BitsAndBytesConfig]): quantization + configuration for the model. Needed for 4-bit and 8-bit precision. load_in_8bit (bool): Whether to load the model in 8-bit precision. load_in_4bit (bool): Whether to load the model in 4-bit precision. trust_remote_code (bool): Whether to trust remote code during model @@ -200,11 +202,11 @@ def create_model_config(args: Namespace, accelerator: Union[Accelerator, None]) Create a model configuration based on the provided arguments. Args: - args (Namespace): The command-line arguments. - accelerator (Union[Accelerator, None]): The accelerator to use for model training. + args (Namespace): command-line arguments. 
+        accelerator (Union[Accelerator, None]): accelerator to use for model training.

     Returns:
         BaseModelConfig: model configuration.

     Raises:
         ValueError: If both an inference server address and model arguments are provided.
         ValueError: If multichoice continuations both should start with a space and should not start with a space.
         ValueError: If a base model is not specified when using delta weights or adapter weights.
         ValueError: If a base model is specified when not using delta weights or adapter weights.
     """

diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 17dbc024..e16963a9 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -45,10 +45,10 @@
         Initialize a LightEval task.

         Args:
-            name (str): The name of the task.
-            cfg (dict): The configuration dictionary containing
+            name (str): name of the task.
+            cfg (dict): configuration dictionary containing
                 task-specific settings (from the task_table.json file).
-            cache_dir (Optional[str], optional): The directory to cache the
+            cache_dir (Optional[str], optional): directory to cache the
                 dataset. Defaults to None.
             custom_tasks_module (ModuleType, optional): A custom module
                 containing task-specific functions. Defaults to None.
@@ -125,10 +125,11 @@ def doc_to_text_without_instructions(self, doc: Doc) -> str:
         document has instructions, it removes them from the query.

         Args:
-            doc (Doc): The document.
+            doc (Doc): document class, containing the query and the
+                instructions.

         Returns:
-            str: The query of the document without the instructions.
+            str: Query of the document without the instructions.
         """
         if doc.instruction is not None:
@@ -143,7 +144,7 @@ def doc_to_text_and_instructions(self, doc: Doc) -> Tuple[str, str]:
         an empty string.

         Args:
-            doc (Doc): The document.
+            doc (Doc): document, containing the query and the instructions.

         Returns:
             Tuple[str, str]: A tuple with the query of the document and the
@@ -162,11 +163,11 @@ def get_first_possible_fewshot_splits(self, number_of_splits: int = 1) -> list[str]:
         available.

         Args:
-            number_of_splits (int, optional): The number of splits to return.
+            number_of_splits (int, optional): Number of splits to return.
                 Defaults to 1.

         Returns:
-            list[str]: The list of the first available fewshot splits.
+            list[str]: List of the first available fewshot splits.
         """
         # Possible few shot splits are the available splits not used for evaluation
         possible_fewshot_splits = [k for k in self.all_available_splits if k not in self.evaluation_split]
@@ -185,24 +186,24 @@
         hlog_warn(f"Careful, the task {self.name} is using evaluation data to build the few shot examples.")
         return None

-    def _get_docs_from_split(self, keys, few_shots=False) -> list[Doc]:
+    def _get_docs_from_split(self, splits: list[str], few_shots=False) -> list[Doc]:
         """
         Get the documents from the dataset for the given keys (splits).

         Args:
-            keys (list): The list of keys (splits).
+            splits (list[str]): List of splits (e.g. ["train", "dev"]).
             few_shots (bool, optional): Whether the documents are used for few
                 shot examples. Defaults to False.

         Returns:
-            list[Doc]: The list of documents.
+            list[Doc]: List of documents.
         """
         if self.dataset is None:
             self.dataset = download_dataset_worker((self.dataset_path, self.dataset_config_name))

         docs = []
-        for key in keys:
-            for item in self.dataset[key]:
+        for split in splits:
+            for item in self.dataset[split]:
                 # Some tasks formatting is applied differently when the document is used for fewshot examples
                 # vs when it's used for the actual prompt. That's why we store whether we are currently using the
                 # doc for a fewshot sample (few_shots=True) or not, which then leads to the creation of a different Doc.
@@ -216,7 +217,8 @@ def fewshot_docs(self) -> list[Doc]:
         available, it gets them from the few shot split or the evaluation split.

         Returns:
-            list[Doc]: The few shot documents.
+            list[Doc]: Documents that will be used for few shot examples. One
+                document = one few shot example.
         """
         if self._fewshot_docs is None:
             self._fewshot_docs = []
@@ -233,7 +235,7 @@ def eval_docs(self) -> list[Doc]:
         Returns the evaluation documents.

         Returns:
-            list[Doc]: The evaluation documents.
+            list[Doc]: Evaluation documents.
         """
         if self._docs is None:
             self._docs = self._get_docs_from_split(self.evaluation_split)
@@ -244,12 +246,12 @@ def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str:
         Returns the target of the given document.

         Args:
-            formatted_doc (Doc): The formatted document.
+            formatted_doc (Doc): Formatted document.
             few_shot (bool, optional): Whether the document is used for few
                 shot examples. Defaults to False.

         Returns:
-            str: The target of the document.
+            str: Target of the document, which is the correct answer for the document.
         """
         if few_shot:
             if formatted_doc.target_for_fewshot_sorting is not None:
                 return formatted_doc.target_for_fewshot_sorting
@@ -264,7 +266,7 @@ def get_request_type(self) -> list[RequestType]:
         Returns the request types for the task.

         Returns:
-            list[RequestType]: The request types for the task.
+            list[RequestType]: Request types for the task.

         Raises:
             NotImplementedError: If the request type is not implemented for the
@@ -296,13 +298,13 @@ def construct_requests(
         Constructs a list of requests from the task based on the given parameters.

         Args:
-            formatted_doc (Doc): The formatted document almost straight from the dataset.
-            ctx (str): The context, which is the few shot examples + the query.
-            document_id_seed (str): The index of the document in the task appended with the seed used for the few shot sampling.
-            current_task_name (str): The name of the current task.
+            formatted_doc (Doc): Formatted document almost straight from the dataset.
+            context (str): Context, which is the few shot examples + the query.
+            document_id_seed (str): Index of the document in the task appended with the seed used for the few shot sampling.
+            current_task_name (str): Name of the current task.

         Returns:
-            dict[RequestType, List[Request]]: The list of requests.
+            dict[RequestType, List[Request]]: List of requests.
         """
         requests = {type: [] for type in RequestType}
@@ -370,14 +372,14 @@
     def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dict[str, float]:
         """
-        Processes the results of the task. and stores them in the output dict.
+        Processes the results of the task and stores them in the output dict.

         Args:
-            formatted_doc (Doc): The formatted document of the task.
-            results (list[ModelReturn]): The results of the task, returned by the model class after evaluation.
+            formatted_doc (Doc): formatted document of the task.
+            results (list[ModelReturn]): results of the task, returned by the model class after evaluation.

         Returns:
-            dict[str, float]: The output dictionary containing the results of the task.
+            dict[str, float]: output dictionary containing the results of the task.
         """
         # Metrics management is done in metrics.__init__
         outputs = {}
@@ -428,7 +430,7 @@ def load_datasets(tasks: list["LightevalTask"], dataset_loading_processes: int = 1) -> None:

         Args:
             tasks (list): A list of tasks.
-            dataset_loading_processes (int, optional): The number of processes to use for dataset loading. Defaults to 1.
+            dataset_loading_processes (int, optional): number of processes to use for dataset loading. Defaults to 1.

         Returns:
             None
@@ -482,10 +484,12 @@ def create_requests_from_tasks(  # noqa: C901
         task_dict (dict[str, LightevalTask]): A dictionary of tasks.
         fewshot_dict (dict[str, list[Tuple[int, bool]]]): A dictionary of few
             shot examples.
-        num_fewshot_seeds (int): The number of few shot seeds.
-        lm (BaseModel): The language model.
-        max_samples (int): The maximum number of samples.
-        evaluation_tracker (EvaluationTracker): The evaluation tracker.
+        num_fewshot_seeds (int): number of few shot seeds.
+        lm (BaseModel): language model class that will be used to eventually
+            truncate the few shot examples (we need the maximum input size of the
+            model).
+        max_samples (int): maximum number of samples.
+        evaluation_tracker (EvaluationTracker): evaluation tracker.
         use_chat_template (bool): Whether to use the chat template.

     Raises:

diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index ff3caa46..f662bf5a 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -31,10 +31,10 @@ def __init__(self, cache_dir: str):
         Initialize the Registry class.

         Args:
-            cache_dir (str): The directory path for caching.
+            cache_dir (str): Directory path for caching.

         Attributes:
-            cache_dir (str): The directory path for caching.
+            cache_dir (str): Directory path for caching.
             TASK_REGISTRY (dict[str, LightevalTask]): A dictionary containing the registered tasks.
         """
         self.cache_dir: str = cache_dir
@@ -47,11 +47,11 @@ def get_task_class(
         Get the task class based on the task name.

         Args:
-            task_name (str): The name of the task.
+            task_name (str): Name of the task.
             custom_tasks_registry (Optional[dict[str, LightevalTask]]): A dictionary containing custom tasks.

         Returns:
-            LightevalTask: The task class.
+            LightevalTask: Task class.

         Raises:
             ValueError: If the task is not found in the task registry or custom task registry.
@@ -75,7 +75,7 @@ def get_task_dict(
         Args:
             task_name_list (List[str]): A list of task names.
-            custom_tasks_file (Optional[str]): The path to the custom tasks file.
+            custom_tasks_file (Optional[str]): Path to the custom tasks file.

         Returns:
             Dict[str, LightevalTask]: A dictionary containing the tasks.
@@ -165,9 +165,9 @@ def create_config_tasks(
     Create configuration tasks based on the provided meta_table.

     Args:
-        meta_table (Optional[Dataset]): The meta_table containing task
+        meta_table (Optional[Dataset]): meta_table containing task
            configurations. If not provided, it will be loaded from TABLE_PATH.
-        cache_dir (Optional[str]): The directory to store cached data. If not
+        cache_dir (Optional[str]): Directory to store cached data. If not
            provided, the default cache directory will be used.

     Returns:

From 2975ee237621c5be851f6d14458325a1fda3ef9a Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Tue, 6 Feb 2024 12:10:41 +0000
Subject: [PATCH 12/13] make style

---
 src/lighteval/models/model_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py
index 54cf74a4..75f13f94 100644
--- a/src/lighteval/models/model_config.py
+++ b/src/lighteval/models/model_config.py
@@ -53,7 +53,7 @@ class EnvConfig:
     dtype (Union[str, torch.dtype], optional, defaults to None):):
         Converts the model weights to `dtype`, if specified.
Strings get converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`). - Use `dtype="auto"` to derive the type from the model’s weights. + Use `dtype="auto"` to derive the type from the model's weights. """ @@ -79,7 +79,7 @@ class BaseModelConfig: batch_size (int): The batch size for model training. max_gen_toks (Optional[int]): The maximum number of tokens to generate. max_length (Optional[int]): The maximum length of the generated output. - add_special_tokens (bool, optional, defaults to True): Whether to add special tokens to the input sequences. + add_special_tokens (bool, optional, defaults to True): Whether to add special tokens to the input sequences. If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and `False` for causal models. model_parallel (Optional[bool]): Whether to use model parallelism. From c94b8b5f1ceaa279731b663475f9a0121b3a2a89 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 6 Feb 2024 12:40:14 +0000 Subject: [PATCH 13/13] fix doc to match google style --- src/lighteval/logging/evaluation_tracker.py | 5 ++++- src/lighteval/logging/info_loggers.py | 20 ++++++++++++++++---- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index a14767c6..0515af46 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -90,7 +90,8 @@ def save( ) -> None: """Saves the experiment information and results to files, and to the hub if requested. - Note: In case of save failure, this function will only print a warning, with the error message. + Note: + In case of save failure, this function will only print a warning, with the error message. Args: output_dir (str): Local folder path where you want results to be saved @@ -215,6 +216,7 @@ def details_to_hub( details_folder_path (str or Path): Local path of the current's experiment details folder. The details folder (created by [`EvaluationTracker.save`]) should contain one parquet file per task used during the evaluation run of the current model. push_as_public (bool, optional): If True, the results will be pushed publicly, else the datasets will be private. + """ results_file_path = str(results_file_path) details_folder_path = str(details_folder_path) @@ -266,6 +268,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: Args: repo_id (str): Details dataset repository path on the hub (`org/dataset`) model_name (str): Name of the currently evaluated model. + """ # Add a nice dataset card and the configuration YAML files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset") diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index db58c11a..38d4d7ab 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -39,7 +39,7 @@ class GeneralConfigLogger: job_id (int): If the evaluation suite is launched as a slurm job, stores the current job id. Purely informative parameter used to retrieve scheduler logs. start_time (float): Start time of the experiment. Logged at class init. - end_time (float): Start time of the experiment. Logged when calling [`GeneralConfigLogger.log_end_time`] + end_time (float): End time of the experiment. Logged when calling [`GeneralConfigLogger.log_end_time`] total_evaluation_time_secondes (str): Inferred total evaluation time in seconds (from the start and end times). 
model_name (str): Name of the currently evaluated model. model_sha (str): Commit hash of the currently evaluated model on the hub if available. @@ -87,12 +87,15 @@ def log_args_info( Args: num_fewshot_seeds (int): number of few-shot seeds. override_batch_size (Union[None, int]): overridden batch size. + If strictly positive, its value is used as the batch size for all experiments. + Else, the batch size is automatically inferred depending on what fits in memory. max_samples (Union[None, int]): maximum number of samples, if None, use all the samples available. - job_id (str): job ID. + job_id (str): job ID, used to retrieve logs. config (optional): BrrrConfig Returns: None + """ self.num_fewshot_seeds = num_fewshot_seeds self.override_batch_size = override_batch_size @@ -105,7 +108,8 @@ def log_model_info(self, model_info: ModelInfo) -> None: Logs the model information. Args: - model_info (ModelInfo): model information to be logged. + model_info (ModelInfo): Model information to be logged. + """ self.model_name = model_info.model_name self.model_sha = model_info.model_sha @@ -129,6 +133,7 @@ class DetailsLogger: Example: winogrande: [sample1_details, sample2_details, ...] compiled_details (dict[str, `CompiledDetail`]): : Maps each task name to the list of its samples' compiled details. compiled_details_over_all_tasks (CompiledDetailOverAllTasks): Aggregated details over all the tasks. + """ @dataclass() @@ -156,6 +161,7 @@ class Detail: choices (list): List of the possible choices (for multichoice/loglikelihood evaluations) gold_index (list): Indices of the gold targets among the [`choices`] metrics (dict): Metric name to current example score + """ example: str = "" @@ -190,6 +196,7 @@ class CompiledDetail: effective few shot is the number of few shots actually used to fit the prompt in the model context length while allowing model generation of the expected size. num_truncated_few_shots (int): Total number of samples which required truncated prompts to fit the model size for the current task. + """ hashes: dict = field(default_factory=dict) @@ -216,6 +223,7 @@ class CompiledDetailOverAllTasks: effective few shot is the number of few shots actually used to fit the prompt in the model context length while allowing model generation of the expected size. num_truncated_few_shots (int): Number of samples which required truncated prompts to fit the model size across all tasks. + """ hashes: dict = field(default_factory=dict) @@ -415,7 +423,8 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = Args: task_dict (dict[str, LightevalTask]): used to determine what aggregation function to use for each metric - bootstrap_iters (int, optional): _description_. Defaults to 1000. + bootstrap_iters (int, optional): Number of runs used to run the statistical bootstrap. Defaults to 1000. + """ for task_name, metrics in self.metrics_values.items(): @@ -467,6 +476,7 @@ class VersionsLogger: Attributes: version (dict[str, int]): Maps the task names with the task versions. + """ # the versions dict will be a dict of task_name: task_version @@ -482,6 +492,7 @@ class TaskConfigLogger: Attributes: tasks_config (dict[str, TaskConfig]): Maps each task to its associated [`TaskConfig`] + """ @dataclass @@ -506,6 +517,7 @@ class TaskConfig: truncated_num_docs (bool): Whether less than the total number of documents were used output_regex (str) frozen (bool) + """ name: str
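
Note for readers skimming the series: the helpers documented in PATCH 01 are small enough to try standalone. The sketch below mirrors the three utils.py functions as they appear in the diff above; the `v.item()` numpy conversion and the demo calls under `__main__` are assumptions on my part, since that part of the hunk is truncated in the patch.

from typing import Any, Union

import numpy as np


def sanitize_numpy(example_dict: dict) -> dict:
    # Convert numpy generic values (e.g. np.float32) to plain Python scalars.
    output_dict = {}
    for k, v in example_dict.items():
        if isinstance(v, np.generic):
            output_dict[k] = v.item()  # assumed conversion; not shown in the hunk
        else:
            output_dict[k] = v
    return output_dict


def as_list(item: Union[list, tuple, Any]) -> list:
    # Lists pass through, tuples are converted, anything else is wrapped.
    if isinstance(item, list):
        return item
    elif isinstance(item, tuple):
        return list(item)
    return [item]


def flatten(item: list[Union[list, str]]) -> list[str]:
    # Flatten one level of nesting: sub-lists are spliced in, strings kept as-is.
    flat_item = []
    for sub_item in item:
        flat_item.extend(sub_item) if isinstance(sub_item, list) else flat_item.append(sub_item)
    return flat_item


if __name__ == "__main__":
    print(sanitize_numpy({"acc": np.float32(0.5)}))  # {'acc': 0.5}
    print(as_list(("a", "b")))                       # ['a', 'b']
    print(flatten(["a", ["b", "c"], "d"]))           # ['a', 'b', 'c', 'd']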