Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix a few typos and do a tiny refactor #187

Merged
merged 15 commits into from
Jul 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions run_evals_accelerate.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,12 @@ def get_parser():
parser.add_argument(
"--public_run", default=False, action="store_true", help="Push results and details to a public repo"
)
parser.add_argument("--cache_dir", type=str, default=CACHE_DIR)
parser.add_argument(
"--cache_dir",
type=str,
default=CACHE_DIR,
help="Cache directory for downloaded datasets & model, defaults to `HF_HOME` environment variable",
)
parser.add_argument(
"--results_org",
type=str,
Expand All @@ -65,13 +70,13 @@ def get_parser():
"--custom_tasks",
type=str,
default=None,
help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)",
help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formatting functions)",
)
group.add_argument(
"--tasks",
type=str,
default=None,
help="Id of a task, e.g. 'original|mmlu:abstract_algebra|5|0' or path to a texte file with a list of tasks",
help="Comma-separated ids of tasks, e.g. 'original|mmlu:abstract_algebra|5' or path to a text file with a list of tasks",
)
parser.add_argument("--num_fewshot_seeds", type=int, default=1, help="Number of trials the few shots")
return parser
Expand Down
2 changes: 1 addition & 1 deletion src/lighteval/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def evaluate( # noqa: C901
# A request output tuple is a Tuple where the first element is the index of
# the request for one document of one task i.e.
# task: "arc_easy", doc: "0"# request: "0" -> request_index = 0,
# We can have multiple request per doc for multi choice tasks for example.
# We can have multiple requests per doc for multi choice tasks for example.

# all responses for each (task, doc)
RequestIndexModelResponseTuple = collections.namedtuple(
Expand Down
2 changes: 1 addition & 1 deletion src/lighteval/logging/evaluation_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ def push_results_to_tensorboard( # noqa: C901
self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail]
):
if not is_nanotron_available():
hlog_warn("You cannot push results to tensorboard with having nanotron installed. Skipping")
hlog_warn("You cannot push results to tensorboard without having nanotron installed. Skipping")
return
config: Config = self.general_config_logger.config
lighteval_config = config.lighteval
Expand Down
2 changes: 1 addition & 1 deletion src/lighteval/metrics/imports/bert_scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def greedy_cos_idf(
- :param: `ref_masks` (torch.LongTensor): BxKxK, BERT attention mask for
reference sentences.
- :param: `ref_idf` (torch.Tensor): BxK, idf score of each word
piece in the reference setence
piece in the reference sentence
- :param: `hyp_embedding` (torch.Tensor):
embeddings of candidate sentences, BxKxd,
B: batch size, K: longest length, d: bert dimension
Expand Down
4 changes: 2 additions & 2 deletions src/lighteval/metrics/judge_prompts.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
{"name": "pair-math-v1-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"}
{"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
{"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"}
{"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"}
{"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Your evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Your evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"}
2 changes: 1 addition & 1 deletion src/lighteval/metrics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ class Metrics(Enum):
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
judge_model_name="gpt-3.5-turbo",
template_path=os.path.join(os.path.dirname(__file__), "", "judge_prompts.jsonl"),
template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
multi_turn=False,
).compute,
corpus_level_fn={
Expand Down
2 changes: 1 addition & 1 deletion src/lighteval/metrics/metrics_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,7 +644,7 @@ def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[
"""
Compute the score of a generative task using a llm as a judge.
The generative task can be multiturn with 2 turns max, in that case, we
return scores for turn 1 and 2. Also returns user_prompt and judgment
return scores for turn 1 and 2. Also returns user_prompt and judgement
which are ignored later by the aggregator.
"""

Expand Down
8 changes: 4 additions & 4 deletions src/lighteval/models/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def __init__(
self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False
self._tokenizer = self._create_auto_tokenizer(config, env_config)

# If model_parallel is not set we compare the number of process with the number of GPUs
# If model_parallel is not set we compare the number of processes with the number of GPUs
self.model = self._create_auto_model(config, env_config)
self.model.eval()
torch.set_grad_enabled(False)
Expand Down Expand Up @@ -819,7 +819,7 @@ def _loglikelihood_tokens(
)
res.append(answer)

# Clean up GPUS
# Clean up GPUs
del model_output
del logits
del batched_inputs
Expand Down Expand Up @@ -852,7 +852,7 @@ def prepare_batch_logprob(
hlog_warn("max_context is None, using max_length")
max_context = self.max_length

# Each sample is concatenated and cut to lenght or padded to max_length
# Each sample is concatenated and cut to length or padded to max_length
for orig_tokens in inputs:
truncated.append(max(len(orig_tokens) - max_context, 0))

Expand Down Expand Up @@ -1030,7 +1030,7 @@ def _loglikelihood_single_token(
)
res.append(answer)

# Clean up GPUS
# Clean up GPUs
del out
del batch_probs
del batched_inputs
Expand Down
11 changes: 4 additions & 7 deletions src/lighteval/models/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,9 @@ class BaseModelConfig:
If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and
`False` for causal models.
model_parallel (bool, optional, defaults to False):
True/False: force to uses or not the `accelerate` library to load a large
True/False: force to use or not the `accelerate` library to load a large
model across multiple devices.
Default: None which correspond to comparing the number of process with
Default: None which corresponds to comparing the number of processes with
the number of GPUs. If it's smaller => model-parallelism, else not.
dtype (Union[str, torch.dtype], optional, defaults to None):
Converts the model weights to `dtype`, if specified. Strings get
Expand Down Expand Up @@ -277,11 +277,8 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]

return BaseModelConfig(**args_dict)

if hasattr(args, "model_config") and args.model_config:
config = args.model_config["model"]
else:
with open(args.model_config_path, "r") as f:
config = yaml.safe_load(f)["model"]
with open(args.model_config_path, "r") as f:
config = yaml.safe_load(f)["model"]

if config["type"] == "tgi":
return TGIModelConfig(
Expand Down
4 changes: 2 additions & 2 deletions src/lighteval/models/model_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ def load_model( # noqa: C901
config: Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig],
env_config: EnvConfig,
) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel, ModelClient], ModelInfo]:
"""Will load either a model from an inference server or a model from a checkpoint. depending
on the arguments passed to the program.
"""Will load either a model from an inference server or a model from a checkpoint, depending
on the config type.

Args:
args (Namespace): arguments passed to the program
Expand Down
4 changes: 2 additions & 2 deletions src/lighteval/models/nanotron_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,7 +846,7 @@ def _loglikelihood_single_token(

tq.desc = f"loglikelihood_single_token Subset {s} Node {dist.get_rank(self.parallel_context.world_pg)} - {human_format(tokens_per_sec)} tokens/s"

# Clean up GPUS
# Clean up GPUs
del out
del batch_probs
del batched_inputs
Expand Down Expand Up @@ -1083,7 +1083,7 @@ def _loglikelihood_tokens(
tokens_per_sec = batched_inputs.numel() / (elapsed_time_per_iteration_ms / 1000)
tq.desc = f"loglikelihood Subset {s} Node {dist.get_rank(self.parallel_context.world_pg)} - {human_format(tokens_per_sec)} tokens/s"

# Clean up GPUS
# Clean up GPUs
del out
del logits
del batched_inputs
Expand Down
4 changes: 2 additions & 2 deletions src/lighteval/tasks/lighteval_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from dataclasses import dataclass
from multiprocessing import Pool
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

from datasets import load_dataset

Expand Down Expand Up @@ -454,7 +454,7 @@ def get_request_type(self) -> list[RequestType]: # noqa C901

def construct_requests(
self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str
) -> List[Request]:
) -> Dict[RequestType, List[Request]]:
"""
Constructs the requests for the task, keyed by request type, based on the given parameters.

Expand Down
6 changes: 3 additions & 3 deletions src/lighteval/tasks/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def get_task_dict(

Args:
task_name_list (List[str]): A list of task names.
custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self
custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself
extended_tasks (Optional[str]): The path to the extended tasks group of submodules

Returns:
Expand Down Expand Up @@ -159,7 +159,7 @@ def create_custom_tasks_module(custom_tasks: Union[str, ModuleType]) -> ModuleTy
"""Creates a custom task module to load tasks defined by the user in their own file.

Args:
custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self
custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself

Returns:
ModuleType: The newly imported/created custom tasks modules
Expand All @@ -178,7 +178,7 @@ def get_custom_tasks(custom_tasks: Union[str, ModuleType]) -> Tuple[ModuleType,
"""Get all the custom tasks available from the given custom tasks file or module.

Args:
custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self
custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself
"""
custom_tasks_module = create_custom_tasks_module(custom_tasks=custom_tasks)
tasks_string = ""
Expand Down
6 changes: 2 additions & 4 deletions src/lighteval/tasks/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ class TaskExampleId(NamedTuple):
Represents the identifier for an example in a task.

Attributes:
task_name (str): The name of the task.
task_name (str): The name of the task in `name|num_fewshot` format.
doc_id_seed (str): The document id with the seed used for few_shot appended at the end.
"""

Expand Down Expand Up @@ -187,9 +187,7 @@ def get_golds(self, few_shot: bool = False):
choices = self.choices
golds = []
for gold_ix in gold_indices:
local_golds = as_list(choices[gold_ix])
for local_gold in local_golds:
golds.append(local_gold)
golds.extend(as_list(choices[gold_ix]))
return golds

def __repr__(self):
Expand Down
Loading