diff --git a/README.md b/README.md index c04a66118..2e2f35e6d 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ LightEval is an evaluation suite which gathers a selection of features from wide It is still an early, internal version - it should be nice to use but don't expect 100% stability! -In case of problems or question, feel free to open an issue! +In case of problems or question, feel free to open an issue! ## How to install and use ### Requirements @@ -50,11 +50,11 @@ Lastly, create a **line summary** of your evaluation, in `metadata_table.json`. - `suite` (list), the suite(s) to which your evaluation should belong. This field allows us to compare different tasks implementation, and is used a task selection to differentiate the versions to launch. At the moment, you'll find the keywords ["helm", "bigbench", "original", "lighteval"]; you can add also add new ones (for test, we recommend using "custom"). - `prompt_function` (str), the name of the prompt function you defined in the step above - `hf_repo` (str), the path to your evaluation dataset on the hub -- `hf_subset` (str), the specific subset you want to use for your evaluation (note: when the dataset has no subset, fill this field with `"default"`, not with `None` or `""`) +- `hf_subset` (str), the specific subset you want to use for your evaluation (note: when the dataset has no subset, fill this field with `"default"`, not with `None` or `""`) - `hf_avail_splits` (list), all the splits available for your dataset (train, valid or validation, test, other...) - `evaluation_splits` (list), the splits you want to use for evaluation - `few_shots_split` (str, can be `null`), the specific split from which you want to select samples for your few-shot examples. It should be different from the sets included in `evaluation_splits` -- `few_shots_select` (str, can be `null`), the method that you will use to select items for your few-shot examples. 
Can be `null`, or one of: +- `few_shots_select` (str, can be `null`), the method that you will use to select items for your few-shot examples. Can be `null`, or one of: - `balanced` selects examples from the `few_shots_split` with balanced labels, to avoid skewing the few shot examples (hence the model generations) towards one specific label - `random` selects examples at random from the `few_shots_split` - `random_sampling` selects new examples at random from the `few_shots_split` for every new item, but if a sampled item is equal to the current one, it is removed from the available samples @@ -102,7 +102,7 @@ These metrics need the model to generate an output. They are therefore slower. - `exact_match_indicator`: Exact match with some preceding context (before an indicator) removed - `f1_score_quasi` (HELM): Average F1 score in terms of word overlap between the model output and gold, with both being normalized first - `f1_score`: Average F1 score in terms of word overlap between the model output and gold without normalisation - - `f1_score_macro`: Corpus level macro F1 score + - `f1_score_macro`: Corpus level macro F1 score - `f1_score_macro`: Corpus level micro F1 score - Summarization: - `rouge` (Harness): Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) @@ -141,7 +141,7 @@ These metrics need both the generation and its logprob. They are not working at - `prediction_perplexity` (HELM): Measure of the logprob of a given input. ## Adding a new metric -If you want to add a new metric, first check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `metrics_sample`. If not, add it to either of these files depending on the level at which it is applied. Then, follow the example in `src.lighteval.metrics.metrics` to register your metric. +If you want to add a new metric, first check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `metrics_sample`. 
If not, add it to either of these files depending on the level at which it is applied. Then, follow the example in `src.lighteval.metrics.metrics` to register your metric. ## Examples of scripts to launch lighteval on the cluster ### Evaluate a whole suite on one node, 8 GPUs diff --git a/pyproject.toml b/pyproject.toml index 3186ebd72..edec95af1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,8 +82,8 @@ optimum = ["optimum==1.12.0"] quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"] adapters = ["peft==0.3.0"] nanotron = [ - "nanotron@git+https://github.com/huggingface/nanotron@main", - "brrr@git+https://github.com/huggingface/brrr@fix-lighteval", + "nanotron@git+https://github.com/huggingface/nanotron@8c1a49588d0745a6404644a86547c2dd6a63640e", + "brrr@git+https://github.com/huggingface/brrr@e8a503e2ec08b34eed7522d331aec3bee8cdd29b", "tensorboardX" ] diff --git a/src/lighteval/data.py b/src/lighteval/data.py index d671aee1c..9c1360aca 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -72,7 +72,7 @@ def get_original_order(self, new_arr: list) -> list: return original_order - def get_set_split_start_end(self, split_id: int) -> tuple[int, int]: + def get_split_start_end(self, split_id: int) -> tuple[int, int]: """ Get the start and end indices of a dataset split. @@ -96,7 +96,7 @@ def splits_start_end_iterator(self) -> tuple[int, int]: tuple: A tuple containing the start and end indices of a split. """ for split_id in range(self.dataset_splits): - yield self.get_set_split_start_end(split_id) + yield self.get_split_start_end(split_id) def __getitem__(self, index) -> Request: """ @@ -189,9 +189,7 @@ def _sorting_criteria(self, x) -> int: Returns: Any: The collated data. 
""" - toks = x[0] - meta_data = x[1] - stop_tokens, gen_length = meta_data[0], meta_data[1] + toks, (stop_tokens, gen_length) = x return -(len(toks) + gen_length) diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py index e78257e8a..3b17854e7 100644 --- a/src/lighteval/metrics/__init__.py +++ b/src/lighteval/metrics/__init__.py @@ -7,21 +7,12 @@ def apply_target_perplexity_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]): - if len(formatted_doc.get_golds()) != 1: - raise ValueError("Target perplexity metric can only be used with one gold reference") outputs = {} - reference_text = formatted_doc.get_golds()[0] - current_result = results.pop(0) - target_logprob = current_result.result[0] - target_acc = current_result.result[1] + current_results = [results.pop(0) for _ in range(len(formatted_doc.get_golds()))] for metric in metrics: - if Metrics[metric].value.category == MetricCategory.TARGET_PERPLEXITY: - outputs.update( - Metrics[metric].value.compute( - logprobs=target_logprob, target_acc=target_acc, reference_text=reference_text - ) - ) + if Metrics[metric].value.category == MetricCategory.PERPLEXITY: + outputs.update(Metrics[metric].value.compute(results=current_results)) return results, outputs @@ -39,9 +30,7 @@ def apply_perplexity_metric(results: list[ModelReturn], formatted_doc: Doc, metr for metric in metrics: if Metrics[metric].value.category == MetricCategory.PERPLEXITY: - outputs.update( - Metrics[metric].value.compute(logprobs=current_result.result, reference_text=reference_text) - ) + outputs.update(Metrics[metric].value.compute(results=current_result, reference_text=reference_text)) return results, outputs diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 15070d341..ec123741b 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -275,16 +275,17 @@ def compute(self, choices_logprob: list[float], 
gold_ixs: list[float], formatted return 1.0 / (min(ranked_choices) + 1) -def acc_golds_likelihood(target_acc: list[int] | int, **kwargs) -> int: +def acc_golds_likelihood(results: list[tuple[float, int]], **kwargs) -> int: """Tests if at least one of predicted gold targets' log-likelihood is above 0.5. Args: - target_acc (list[int]): List of scores indicating whether the predictions log-probabilities are above 0.5 aggregated. + results (list[tuple[float, int]]): One (log-probability, score) tuple per gold; only the second element, indicating whether the prediction's log-probability is above 0.5, is used. + formatted_doc (Doc): Not a named parameter of this function; if passed, it is absorbed by `**kwargs` and ignored. Returns: int: 1 if at least one of the possible golds had a log-likelihood above 0.5. """ - return max([int(acc_ppl) for acc_ppl in as_list(target_acc)]) + return max([int(acc_ppl) for _, acc_ppl in results]) class ROUGE: diff --git a/src/lighteval/metrics/sample_preparator.py b/src/lighteval/metrics/sample_preparator.py index c28ed2470..659022920 100644 --- a/src/lighteval/metrics/sample_preparator.py +++ b/src/lighteval/metrics/sample_preparator.py @@ -106,14 +106,14 @@ def count_units(self, text: str) -> int: if self.units_type == "bytes": return len(text.encode("utf-8")) - def prepare(self, logprobs: list[float] | float, reference_text: str, **kwargs): + def prepare(self, results, reference_text, **kwargs): """Prepares an individual perplexity example to the format expected by metrics computed at the corpus level (aggregated). Args: - logprobs (list[float]): List of the logprobabilities computed for each item of the sequence or single aggregated logprob over the sequence + results: Model output for the current item; its `result` attribute holds the logprobabilities that are stored as `logprobs` reference_text (str): Current reference text for which to compute the length in self.units_type Returns: PerplexityCorpusMetricInput: Stores the measured logprobs and associated text lengths, counted in the reference unit. 
""" - return PerplexityCorpusMetricInput(logprobs=logprobs, weights=self.count_units(reference_text)) + return PerplexityCorpusMetricInput(logprobs=results.result, weights=self.count_units(reference_text)) diff --git a/src/lighteval/models/brrr_models.py b/src/lighteval/models/brrr_models.py index ac46aaa47..5e82bf1ef 100644 --- a/src/lighteval/models/brrr_models.py +++ b/src/lighteval/models/brrr_models.py @@ -1,8 +1,7 @@ -# flake8: noqa: C901,E1120 +# flake8: noqa: C901 import os import time -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union, Type +from typing import List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -29,22 +28,9 @@ from tqdm import tqdm from transformers import AutoTokenizer, BatchEncoding -from lighteval.tasks.requests import ( - GreedyUntilRequest, - LoglikelihoodRequest, - LoglikelihoodRollingRequest, - LoglikelihoodSingleTokenRequest, -) -from lighteval.data import ( - GenDistributedSampler, - GenerativeTaskDataset, - LoglikelihoodDataset, - LoglikelihoodSingleTokenDataset, -) +from lighteval.data import GenDataset, GenDistributedSampler, LoglikelihoodDataset, LoglikelihoodSingleTokenDataset from lighteval.models.model_output import Batch, GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn -from lighteval.tasks.requests import GreedyUntilRequest -from lighteval.utils import as_list -from lighteval.utils_parallelism import find_executable_batch_size +from lighteval.utils import as_list, find_executable_batch_size # from .brrr_generation import GenerationConfig, GenerationInputs, SamplerType, greedy_search_tokenized @@ -55,7 +41,8 @@ TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding] -STARTING_BATCH_SIZE = 512 +# _DeviceMapping = NewType("DeviceMapping", Mapping[str, Union[int, str, torch.device]]) + class BRRRModel: # Default max sequence length setting for when no `max_length` is provided @@ -81,7 +68,6 @@ def __init__( s5cmd_numworkers: 
int = 64, s5cmd_concurrency: int = 10, s5cmd_path: str = "/admin/home/thomwolf/miniconda/envs/b4r/bin/s5cmd", - model_class: Optional[Type] = None, ): """Initializes a brrr model for evaluation. Args: @@ -134,9 +120,6 @@ def __init__( self.tokenizer.model_max_length = self.max_length model_config_cls = self.model_config.__class__.__name__ - if model_class is not None: - CONFIG_TO_MODEL_CLASS[self.model_config.__class__.__name__] = model_class - if model_config_cls not in CONFIG_TO_MODEL_CLASS: raise ValueError( f"Unsupported model config {model_config_cls}. Only {CONFIG_TO_MODEL_CLASS.keys()} are supported" @@ -411,7 +394,7 @@ def _encode_pair(self, context, continuation): continuation_enc = whole_enc[context_enc_len:] return context_enc, continuation_enc - def homogeneize_ending_conditions(self, ending_condition: Union[tuple, dict, list, str]) -> tuple[list, int]: + def homogeneize_ending_conditions(self, ending_condition: tuple | dict | list | str) -> tuple[list, int]: """Ending conditions are submitted in several possible formats. By default in lighteval we pass them as tuples (stop sequence, max number of items). In the harness they sometimes are passed as dicts {"until": .., "max_length": ...} or @@ -506,7 +489,7 @@ def loglikelihood_single_token( disable_tqdm=bool(dist.get_rank(self.parallel_context.world_pg) != 0), ) - def loglikelihood(self, requests: List[LoglikelihoodRequest], override_bs=None) -> List[LoglikelihoodReturn]: + def loglikelihood(self, requests: List[Tuple[str, str]], override_bs=None) -> List[LoglikelihoodReturn]: """Tokenize the context and continuation and compute the log likelihood of those tokenized sequences. 
@@ -535,7 +518,7 @@ def loglikelihood(self, requests: List[LoglikelihoodRequest], override_bs=None) disable_tqdm=bool(dist.get_rank(self.parallel_context.world_pg) != 0), ) - def loglikelihood_rolling(self, requests: List[LoglikelihoodRollingRequest], override_bs=None) -> List[LoglikelihoodReturn]: + def loglikelihood_rolling(self, requests: List[Tuple[str, str]], override_bs=None) -> List[LoglikelihoodReturn]: """This function is used to compute the log likelihood of the context for perplexity metrics.""" tokenized_reqs = [] @@ -625,7 +608,7 @@ def prepare_batch( # when too long to fit in context, truncate from the left inp = torch.tensor( - tokens[-max_context:], # [:-1], + (tokens)[-max_context:], # [:-1], dtype=torch.long, ) @@ -716,7 +699,7 @@ def _get_subsets(self, dataset, dataset_splits): @torch.inference_mode() def _loglikelihood_single_token( - self, requests: List[LoglikelihoodSingleTokenRequest], disable_tqdm: bool = False, override_bs: int = -1, dataset_splits: int = 1 + self, requests, disable_tqdm: bool = False, override_bs: int = -1, dataset_splits: int = 1 ) -> List[LoglikelihoodSingleTokenReturn]: dataset = LoglikelihoodSingleTokenDataset(requests=requests) res = [] @@ -938,7 +921,7 @@ def _loglikelihood_single_token( # We are in a process which return no output (beginning/middle of the PP group) return [] - return dataset.get_original_order(res) + return dataset.ordered.get_original(res) @torch.inference_mode() def _loglikelihood_tokens( @@ -949,14 +932,26 @@ def _loglikelihood_tokens( dataset_splits: int = 1, return_bool_score: bool = True, ) -> List[LoglikelihoodReturn]: - dataset = LoglikelihoodDataset(requests=requests, dataset_splits=dataset_splits) + dataset = LoglikelihoodDataset(requests=requests) res = [] # Dataset is sorted in descending size. 
# every 20-25% of the dataset we try to double the batch size for speed up - starting_batch_size = STARTING_BATCH_SIZE + starting_batch_size = 512 + + total_length, subset_length = self._get_subsets(dataset, dataset_splits) + + for s, subset_start in enumerate( + tqdm( + range(0, total_length, subset_length), + disable=disable_tqdm, + position=0, + desc=f"loglikelihood -- Node {dist.get_rank(self.parallel_context.world_pg)}", + ) + ): + dataset.split_start = subset_start + dataset.split_end = min(subset_start + subset_length, total_length) - for s, (split_start, split_end) in tqdm(enumerate(dataset.splits_start_end_iterator())): # automatic (variable) batch size detection for vectorization # pull longest context sample from request _, context_enc, continuation_enc = dataset[0] @@ -1160,18 +1155,18 @@ def _loglikelihood_tokens( # print(f"i {i} padded: {r.padded}") if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank: - assert len(res) == (split_end-split_start), "we didn't cover all the data" + assert len(res) == total_length, "we didn't cover all the data" if len(res) == 0: # We are in a process which return no output (beginning/middle of the PP group) return [] - return dataset.get_original_order(res) + return dataset.ordered.get_original(res) @torch.inference_mode() def greedy_until( self, - requests: List[GreedyUntilRequest], + requests: List[Tuple[str, dict]], task_names: Optional[List[str]] = None, returns_logits=False, disable_tqdm: bool = False, @@ -1183,24 +1178,15 @@ def greedy_until( # pull longest context sample from request if task_names: enc_inputs = [ - (index, ( - self.tok_encode(req.context), - self.homogeneize_ending_conditions((req.stop_sequence, req.generation_size)), - task_name, - )) - for index, (req, task_name) in enumerate(zip(requests, task_names)) + (self.tok_encode(req[0]), self.homogeneize_ending_conditions(req[1]), task_name) + for req, task_name in zip(requests, task_names) ] else: enc_inputs = [ - (index, ( - 
self.tok_encode(req.context), - self.homogeneize_ending_conditions((req.stop_sequence, req.generation_size)), - None, - )) - for index, req in enumerate(requests) + (self.tok_encode(req[0]), self.homogeneize_ending_conditions(req[1]), None) for req in requests ] - dataset = GenerativeTaskDataset(requests=enc_inputs, dataset_splits=dataset_splits) + dataset = GenDataset(requests=enc_inputs) res = [] # Dataset is sorted in descending size. @@ -1209,20 +1195,20 @@ def greedy_until( total_length, subset_length = self._get_subsets(dataset, dataset_splits) - for s, _ in enumerate( + for s, subset_start in enumerate( tqdm( - dataset.splits_start_end_iterator(), - total=dataset_splits, - desc=f"greedy -- Node {dist.get_rank(self.parallel_context.world_pg)}", - position=0, + range(0, total_length, subset_length), disable=disable_tqdm, + position=0, + desc=f"greedy -- Node {dist.get_rank(self.parallel_context.world_pg)}", ) ): - # print(dataset[0]) + dataset.split_start = subset_start + dataset.split_end = min(subset_start + subset_length, total_length) + _, (context_enc, _, _) = dataset[0] max_gen = max(d[1][1][1] for d in dataset) max_input_length = min(len(context_enc) + max_gen, self.max_length) - # max_input_length = len(context_enc) batch_size = self._get_batch_size( override_bs=override_bs, max_input_length=max_input_length, starting_batch_size=starting_batch_size ) @@ -1374,7 +1360,7 @@ def greedy_until( # We are in a process which return no output (beginning/middle of the PP group) return [] - return dataset.get_original_order(res) + return dataset.ordered.get_original(res) class MultiTokenEOSCriteria(transformers.StoppingCriteria): diff --git a/src/main_brrr.py b/src/main_brrr.py index ea03f7e41..bd257eac2 100644 --- a/src/main_brrr.py +++ b/src/main_brrr.py @@ -195,7 +195,6 @@ def main(args): lm=model, max_samples=lighteval_config.tasks.max_samples, evaluation_tracker=evaluation_tracker, - use_chat_template=False, ) with htrack_block("Setting seeds and waiting for 
all processes"): diff --git a/tasks_examples/open_llm_leaderboard_tasks.txt b/tasks_examples/open_llm_leaderboard_tasks.txt index 5736e9537..41c0ff35a 100644 --- a/tasks_examples/open_llm_leaderboard_tasks.txt +++ b/tasks_examples/open_llm_leaderboard_tasks.txt @@ -57,4 +57,4 @@ lighteval|mmlu:security_studies|5|0 lighteval|mmlu:sociology|5|0 lighteval|mmlu:us_foreign_policy|5|0 lighteval|mmlu:virology|5|0 -lighteval|mmlu:world_religions|5|0 +lighteval|mmlu:world_religions|5|0 \ No newline at end of file diff --git a/tests/reference_scores/harness_metrics.json b/tests/reference_scores/harness_metrics.json index 1c8c5b91d..a6c506f34 100644 --- a/tests/reference_scores/harness_metrics.json +++ b/tests/reference_scores/harness_metrics.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:408956938a6b7a18b03658bb9772b471efcea4aa04afb0b35d76cecfca6a706e -size 48376580 +oid sha256:a1965f0b9c66cfe1b1f3cc380a80949e32eab92ae8eac079c0339506ce827093 +size 48373142