Skip to content

Commit

Permalink
Merge branch 'main' into oz-eval
Browse files Browse the repository at this point in the history
  • Loading branch information
clefourrier authored Oct 15, 2024
2 parents cd859c1 + 9134ca8 commit 70b926b
Show file tree
Hide file tree
Showing 37 changed files with 2,552 additions and 328 deletions.
796 changes: 796 additions & 0 deletions community_tasks/serbian_eval.py

Large diffs are not rendered by default.

75 changes: 75 additions & 0 deletions examples/tasks/serbian_task_group/sr_all_exclusive.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Serbian Evaluations - ARC (AI2 Reasoning Challenge)
community|serbian_evals:arc_easy|0|0
community|serbian_evals:arc_challenge|0|0
# Commonsense Reasoning
community|serbian_evals:hellaswag|0|0
community|serbian_evals:piqa|0|0
community|serbian_evals:winogrande|0|0
# Serbian Evaluations - Custom/Other Task
community|serbian_evals:oz_eval|0|0
# MMLU (Miscellaneous)
community|serbian_evals:mmlu_anatomija|0|0
community|serbian_evals:mmlu_astronomija|0|0
community|serbian_evals:mmlu_poslovna_etika|0|0
community|serbian_evals:mmlu_kliničko_znanje|0|0
community|serbian_evals:mmlu_razno|0|0
community|serbian_evals:mmlu_elektrotehnika|0|0
# Serbian Evaluations - ARC (AI2 Reasoning Challenge)
community|serbian_evals:arc_easy|0|0
community|serbian_evals:arc_challenge|0|0
# Commonsense Reasoning
community|serbian_evals:hellaswag|0|0
community|serbian_evals:piqa|0|0
community|serbian_evals:winogrande|0|0
# Serbian Evaluations - Custom/Other Task
community|serbian_evals:oz_eval|0|0
# MMLU (Miscellaneous)
community|serbian_evals:mmlu_anatomija|0|0
community|serbian_evals:mmlu_astronomija|0|0
community|serbian_evals:mmlu_poslovna_etika|0|0
community|serbian_evals:mmlu_kliničko_znanje|0|0
community|serbian_evals:mmlu_razno|0|0
community|serbian_evals:mmlu_elektrotehnika|0|0
# MMLU (Business Professional)
community|serbian_evals:mmlu_marketing|0|0
community|serbian_evals:mmlu_manadzment|0|0
# MMLU (College Level Tasks)
community|serbian_evals:mmlu_fakultet_biologija|0|0
community|serbian_evals:mmlu_fakultet_hemija|0|0
community|serbian_evals:mmlu_fakultet_racunari|0|0
community|serbian_evals:mmlu_fakultet_matematika|0|0
community|serbian_evals:mmlu_fakultet_medicina|0|0
community|serbian_evals:mmlu_fakultet_fizika|0|0
community|serbian_evals:mmlu_sigurnost_racunara|0|0
# MMLU (Ethics, Philosophy)
community|serbian_evals:mmlu_moralni_sporovi|0|0
community|serbian_evals:mmlu_moralne_dileme|0|0
community|serbian_evals:mmlu_filozofija|0|0
community|serbian_evals:mmlu_svetska_religija|0|0
# MMLU (High School Level Tasks)
community|serbian_evals:mmlu_srednja_skola_biologija|0|0
community|serbian_evals:mmlu_srednja_skola_hemija|0|0
community|serbian_evals:mmlu_srednja_skola_racunari|0|0
community|serbian_evals:mmlu_srednja_skola_istorija_evrope|0|0
community|serbian_evals:mmlu_srednja_skola_geografija|0|0
community|serbian_evals:mmlu_srednja_skola_matematika|0|0
community|serbian_evals:mmlu_srednja_skola_mikroekonomija|0|0
community|serbian_evals:mmlu_srednja_skola_fizika|0|0
community|serbian_evals:mmlu_srednja_skola_psihologija|0|0
community|serbian_evals:mmlu_srednja_skola_statistika|0|0
community|serbian_evals:mmlu_srednja_skola_svetska_istorija|0|0
# MMLU (Math, Logic)
community|serbian_evals:mmlu_abstract_algebra|0|0
community|serbian_evals:mmlu_osnovna_matematika|0|0
community|serbian_evals:mmlu_formalna_logika|0|0
community|serbian_evals:mmlu_konceptualna_fizika|0|0
community|serbian_evals:mmlu_metrika_ekonomije|0|0
community|serbian_evals:mmlu_masinsko_ucenje|0|0
# MMLU (Social Sciences)
community|serbian_evals:mmlu_globalne_cinjenice|0|0
community|serbian_evals:mmlu_logicke_zablude|0|0
community|serbian_evals:mmlu_sociologija|0|0
community|serbian_evals:mmlu_human_aging|0|0
# Question Answering and Knowledge
community|serbian_evals:boolq|0|0
community|serbian_evals:openbook|0|0
2 changes: 2 additions & 0 deletions examples/tasks/serbian_task_group/sr_all_inclusive.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# MMLU (All-inclusive Task Entry)
community|serbian_evals:mmlu|0|0
3 changes: 3 additions & 0 deletions examples/tasks/serbian_task_group/sr_arc.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Serbian Evaluations - ARC (AI2 Reasoning Challenge)
community|serbian_evals:arc_easy|0|0
community|serbian_evals:arc_challenge|0|0
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Commonsense Reasoning
community|serbian_evals:hellaswag|0|0
community|serbian_evals:piqa|0|0
community|serbian_evals:winogrande|0|0
2 changes: 2 additions & 0 deletions examples/tasks/serbian_task_group/sr_custom_task.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Serbian Evaluations - Custom/Other Task
community|serbian_evals:oz_eval|0|0
7 changes: 7 additions & 0 deletions examples/tasks/serbian_task_group/sr_misc.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# MMLU (Miscellaneous)
community|serbian_evals:mmlu_anatomija|0|0
community|serbian_evals:mmlu_astronomija|0|0
community|serbian_evals:mmlu_poslovna_etika|0|0
community|serbian_evals:mmlu_kliničko_znanje|0|0
community|serbian_evals:mmlu_razno|0|0
community|serbian_evals:mmlu_elektrotehnika|0|0
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# MMLU (Business Professional)
community|serbian_evals:mmlu_marketing|0|0
community|serbian_evals:mmlu_manadzment|0|0
8 changes: 8 additions & 0 deletions examples/tasks/serbian_task_group/sr_mmlu_college_level.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# MMLU (College Level Tasks)
community|serbian_evals:mmlu_fakultet_biologija|0|0
community|serbian_evals:mmlu_fakultet_hemija|0|0
community|serbian_evals:mmlu_fakultet_racunari|0|0
community|serbian_evals:mmlu_fakultet_matematika|0|0
community|serbian_evals:mmlu_fakultet_medicina|0|0
community|serbian_evals:mmlu_fakultet_fizika|0|0
community|serbian_evals:mmlu_sigurnost_racunara|0|0
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# MMLU (Ethics, Philosophy)
community|serbian_evals:mmlu_moralni_sporovi|0|0
community|serbian_evals:mmlu_moralne_dileme|0|0
community|serbian_evals:mmlu_filozofija|0|0
community|serbian_evals:mmlu_svetska_religija|0|0
12 changes: 12 additions & 0 deletions examples/tasks/serbian_task_group/sr_mmlu_high_school_level.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# MMLU (High School Level Tasks)
community|serbian_evals:mmlu_srednja_skola_biologija|0|0
community|serbian_evals:mmlu_srednja_skola_hemija|0|0
community|serbian_evals:mmlu_srednja_skola_racunari|0|0
community|serbian_evals:mmlu_srednja_skola_istorija_evrope|0|0
community|serbian_evals:mmlu_srednja_skola_geografija|0|0
community|serbian_evals:mmlu_srednja_skola_matematika|0|0
community|serbian_evals:mmlu_srednja_skola_mikroekonomija|0|0
community|serbian_evals:mmlu_srednja_skola_fizika|0|0
community|serbian_evals:mmlu_srednja_skola_psihologija|0|0
community|serbian_evals:mmlu_srednja_skola_statistika|0|0
community|serbian_evals:mmlu_srednja_skola_svetska_istorija|0|0
7 changes: 7 additions & 0 deletions examples/tasks/serbian_task_group/sr_mmlu_math_logic.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# MMLU (Math, Logic)
community|serbian_evals:mmlu_abstract_algebra|0|0
community|serbian_evals:mmlu_osnovna_matematika|0|0
community|serbian_evals:mmlu_formalna_logika|0|0
community|serbian_evals:mmlu_konceptualna_fizika|0|0
community|serbian_evals:mmlu_metrika_ekonomije|0|0
community|serbian_evals:mmlu_masinsko_ucenje|0|0
5 changes: 5 additions & 0 deletions examples/tasks/serbian_task_group/sr_mmlu_social_sciences.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# MMLU (Social Sciences)
community|serbian_evals:mmlu_globalne_cinjenice|0|0
community|serbian_evals:mmlu_logicke_zablude|0|0
community|serbian_evals:mmlu_sociologija|0|0
community|serbian_evals:mmlu_human_aging|0|0
3 changes: 3 additions & 0 deletions examples/tasks/serbian_task_group/sr_qa_knowledge.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Question Answering and Knowledge
community|serbian_evals:boolq|0|0
community|serbian_evals:openbook|0|0
2 changes: 1 addition & 1 deletion src/lighteval/config/lighteval_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ class LightEvalTasksArgs:

dataset_loading_processes: int = 8
multichoice_continuations_start_space: Optional[bool] = None
pair_wise_tokenization: bool = False
pairwise_tokenization: bool = False


@dataclass
Expand Down
4 changes: 2 additions & 2 deletions src/lighteval/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def init_split_limits(self, num_dataset_splits):
splits_indices = [tuple(e) for e in splits_indices]
return num_dataset_splits, splits_indices

def _sorting_criteria(self, request: GreedyUntilRequest) -> tuple[bool, list, int]:
def _sorting_criteria(self, request: GreedyUntilRequest) -> tuple[bool, bool, list, int]:
"""
Collate function for generating batches.
Expand All @@ -279,7 +279,7 @@ def _sorting_criteria(self, request: GreedyUntilRequest) -> tuple[bool, list, in
# The generative task has no limit except the model context
if gen_length is None:
gen_length = 0
return request.use_logits, request.stop_sequence, -(len(toks) + gen_length)
return request.do_sample, request.use_logits, request.stop_sequence, -(len(toks) + gen_length)


class GenerativeTaskDatasetNanotron(GenerativeTaskDataset):
Expand Down
2 changes: 1 addition & 1 deletion src/lighteval/logging/info_loggers.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ class MetricsLogger:
default_factory=lambda: collections.defaultdict(lambda: collections.defaultdict(list))
)
metric_aggregated: dict[str, dict[str, float]] = field(
default_factory=lambda: collections.defaultdict(lambda: collections.defaultdict(dict))
default_factory=lambda: collections.defaultdict(lambda: collections.defaultdict(float))
)

def log(self, task_name: str, metrics: dict) -> None:
Expand Down
45 changes: 14 additions & 31 deletions src/lighteval/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,21 @@ def apply_generative_metric( # noqa: C901
formatted_docs: list[Doc],
metrics: list[Metric],
output_regex: str = None,
max_num_samples: int = 1,
):
outputs = []

for sample_id, results, formatted_doc in zip(sample_ids, responses, formatted_docs):
output = {}

# Extracting gold
try:
golds = formatted_doc.get_golds()
except (KeyError, IndexError):
golds = None

# Post processing prediction
if len(results) > 1:
# In case of sampling, it's a list of one list of n samples
raise Exception("You returned more than one result for a sample with a generative metric.")
results = results[0]

Expand All @@ -112,38 +119,14 @@ def apply_generative_metric( # noqa: C901
pred = pred_raw
preds.append(pred)

# Extracting gold
try:
golds = formatted_doc.get_golds()
except (KeyError, IndexError):
golds = None

# Specific process for HELM like evals # hrm
# if "label_to_choices" in formatted_doc:
if formatted_doc.specific is not None and "label_to_choices" in formatted_doc.specific:
# Helm predicts on labels keys (A/B/C/D), but computes metrics on choices
preds = [formatted_doc.specific["label_to_choices"].get(p) for p in preds]
golds = [formatted_doc.specific["label_to_choices"][g] for g in golds]

for metric in metrics:
if metric.category == MetricCategory.GENERATIVE:
output.update(
metric.compute(
golds=golds,
predictions=as_list(preds[0]) if max_num_samples > 1 else preds,
formatted_doc=formatted_doc,
)
output.update(
metric.compute(
golds=golds,
predictions=preds,
formatted_doc=formatted_doc,
)
if metric.category == MetricCategory.GENERATIVE_LOGPROB:
output.update(
metric.compute(
golds=golds,
predictions=as_list(preds[0]) if max_num_samples > 1 else preds,
formatted_doc=formatted_doc,
)
)
if metric.category == MetricCategory.GENERATIVE_SAMPLING:
output.update(metric.compute(golds=golds, predictions=preds, formatted_doc=formatted_doc))
)
outputs.append(output)

return outputs
Expand Down
29 changes: 17 additions & 12 deletions src/lighteval/models/abstract_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,23 +183,28 @@ def tok_encode_pair(self, context, continuation, pairwise: bool = False):
context = context[:-n_spaces]

if pairwise:
context_enc, continuation_enc = self.tok_encode(context), self.tok_encode(continuation)
if self.add_special_tokens:
tokenized_with_special_tokens = self.tokenizer.build_inputs_with_special_tokens(
context_enc + continuation_enc
)
# If this fails something went wrong as the function above should only add special tokens
first_non_prefix_token_idx = tokenized_with_special_tokens.index(context_enc[0])
last_context_token_idx = first_non_prefix_token_idx + len(context_enc)
context_enc, continuation_enc = (
tokenized_with_special_tokens[:last_context_token_idx],
tokenized_with_special_tokens[last_context_token_idx:],
)
# We don't add special tokens to the continuation: if a bos token is added,
# models tend to completely ignore the context
context_enc, continuation_enc = (
self.tok_encode(context, add_special_tokens=self.add_special_tokens),
self.tok_encode(continuation, add_special_tokens=False),
)

# In theory, context_enc can end with an eos token, which would again
# cause the model to ignore the context. We thus strip the eos token from context_enc
if len(context_enc) > 0 and context_enc[-1] == self.tokenizer.eos_token_id:
context_enc = context_enc[:-1]

return context_enc, continuation_enc

whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
# In case continuation tokens merge with context tokens we use the merged token as continuation
if len(context_enc) == len(whole_enc):
context_enc_len = len(context_enc) - 1
context_enc = whole_enc[:context_enc_len]

continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc

Expand Down
13 changes: 6 additions & 7 deletions src/lighteval/models/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def __init__(
model_size=model_size,
)

self.pair_wise_tokenization = config.pair_wise_tokenization
self.pairwise_tokenization = config.pairwise_tokenization

@property
def tokenizer(self):
Expand Down Expand Up @@ -537,6 +537,7 @@ def greedy_until(
max_new_tokens = batch[0].generation_size
returns_logits = batch[0].use_logits
num_samples = batch[0].num_samples
do_sample = batch[0].do_sample

context = [c.context for c in batch]

Expand Down Expand Up @@ -590,6 +591,7 @@ def greedy_until(
stop_tokens=stop_tokens,
returns_logits=returns_logits,
num_samples=num_samples,
do_sample=do_sample,
)
results.extend(cur_reponses)

Expand All @@ -602,6 +604,7 @@ def _generate(
stop_tokens: list[str],
returns_logits: Optional[bool] = False,
num_samples: Optional[int] = 1,
do_sample: Optional[bool] = False,
) -> list[GenerativeResponse]:
"""Contains the actual logic of the generation.
First computes the stop sequences, then generates the predictions, then converts the outputs to GenerativeResponse.
Expand All @@ -619,7 +622,7 @@ def _generate(
return_dict_in_generate=True,
output_scores=True,
eos_token_id=self.tokenizer.eos_token_id,
do_sample=num_samples > 1,
do_sample=do_sample,
num_return_sequences=num_samples,
)
if returns_logits:
Expand Down Expand Up @@ -660,10 +663,6 @@ def _generate(

decoded_generations.append(decoded_generation)

if num_samples == 1: # We only return one item
result_generations = result_generations[0]
decoded_generations = decoded_generations[0]

cur_response = GenerativeResponse(
result=decoded_generations,
logits=logits[ix][: len_logits[ix]] if returns_logits else None,
Expand Down Expand Up @@ -697,7 +696,7 @@ def loglikelihood(
else:
# The following line is mandatory for compatibility with the harness
request.tokenized_context, request.tokenized_continuation = self.tok_encode_pair(
request.context, request.choice, pairwise=self.pair_wise_tokenization
request.context, request.choice, pairwise=self.pairwise_tokenization
)

return self._loglikelihood_tokens(requests, override_bs=override_bs)
Expand Down
7 changes: 5 additions & 2 deletions src/lighteval/models/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class BaseModelConfig:
For example, context: "What is the capital of France?" and choices: "Paris", "London".
Will be tokenized as: "What is the capital of France? Paris" and "What is the capital of France? London".
True adds a space, False strips a space, None does nothing
pair_wise_tokenization (bool): Whether to tokenize the context and continuation as separately or together.
pairwise_tokenization (bool): Whether to tokenize the context and continuation separately or together.
subfolder (Optional[str]): The subfolder within the model repository.
revision (str): The revision of the model.
batch_size (int): The batch size for model training.
Expand Down Expand Up @@ -100,7 +100,7 @@ class BaseModelConfig:
accelerator: "Accelerator" = None
tokenizer: Optional[str] = None
multichoice_continuations_start_space: Optional[bool] = None
pair_wise_tokenization: bool = False
pairwise_tokenization: bool = False
subfolder: Optional[str] = None
revision: str = "main"
batch_size: int = -1
Expand Down Expand Up @@ -226,7 +226,10 @@ class VLLMModelConfig:
multichoice_continuations_start_space: bool = (
True # whether to add a space at the start of each continuation in multichoice generation
)
pairwise_tokenization: bool = False # whether to tokenize the context and continuation separately or together.

subfolder: Optional[str] = None
temperature: float = 0.6 # used for multi-sampling tasks; for tasks requiring no sampling, this is ignored and set to 0.


@dataclass
Expand Down
Loading

0 comments on commit 70b926b

Please sign in to comment.