diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index c0b06d36..950a7597 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -18,7 +18,6 @@ jobs: uses: actions/checkout@v3 with: lfs: 'true' - ref: ${{ github.event.pull_request.head.sha }} # we want to test against our branch not against a merge commit - name: Setup Python environment uses: actions/setup-python@v4 with: diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml index 8ac08ad6..ecdca01d 100644 --- a/.github/workflows/trufflehog.yml +++ b/.github/workflows/trufflehog.yml @@ -16,4 +16,3 @@ jobs: fetch-depth: 0 - name: Secret Scanning uses: trufflesecurity/trufflehog@main - diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 4408f22f..86ab69e2 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -86,7 +86,6 @@ def arabic_mmlu_pfn(line, task_name: str = None): choices=valid_keys_arabic, # Return only valid choices (Arabic keys) gold_index=answer_index, # Correct index in the valid Arabic keys instruction=instruction, - target_for_fewshot_sorting=valid_keys_arabic[answer_index], # Correct answer in Arabic form ) @@ -149,7 +148,6 @@ def arabic_mmlu_ht_pfn(line, task_name: str = None): choices=[str(i) for i in range(1, len(choices) + 1)], # List of strings instead of ints gold_index=answer_index, instruction=instruction, - target_for_fewshot_sorting=str(answer_index), # Assuming it's sorted based on the number ) @@ -328,7 +326,6 @@ def aratrust_pfn(line, task_name: str = None): choices=LETTER_INDICES_AR[:3], gold_index=answer_index, instruction=instruction, - target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index], ) @@ -413,7 +410,8 @@ def arabic_exams_pfn(line, task_name: str = None): def alghafa_pfn(line, task_name: str = None): question = line["query"] answer_index = int(line["label"]) - choices = [line[key] for key in ["sol1", "sol2", "sol3", "sol4"]] + 
allowed_keys = [f"sol{i}" for i in range(1, 6)] + choices = [line[key] for key in allowed_keys if key in line] instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n" query = f"{instruction}السؤال: {question}\n" @@ -802,7 +800,6 @@ def madinah_qa_pfn(line, task_name: str = None): choices=choices, gold_index=answer_index, # Correct index in the valid keys instruction=instruction, - target_for_fewshot_sorting=valid_keys_latin[answer_index], # Correct answer in Latin form ) diff --git a/docs/source/adding-a-new-metric.mdx b/docs/source/adding-a-new-metric.mdx index 35fc975f..6433d588 100644 --- a/docs/source/adding-a-new-metric.mdx +++ b/docs/source/adding-a-new-metric.mdx @@ -92,4 +92,3 @@ if __name__ == "__main__": You can then give your custom metric to lighteval by using `--custom-tasks path_to_your_file` when launching it. - diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/contributing-to-multilingual-evaluations.mdx index 0d0855d7..4db1c935 100644 --- a/docs/source/contributing-to-multilingual-evaluations.mdx +++ b/docs/source/contributing-to-multilingual-evaluations.mdx @@ -8,7 +8,7 @@ We welcome translations in your language! To contribute, you'll need to 1. Open the [translation_literals](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/templates/utils/translation_literals.py) file -2. Edit the file to add or expand the literal for your language of interest. +2. Edit the file to add or expand the literal for your language of interest. ```python Language.ENGLISH: TranslationLiterals( @@ -42,7 +42,7 @@ To contribute, you'll need to ## Contributing a new multilingual task -You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use. +You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use. 
Then, you should take a look at the current [multilingual tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/multilingual/tasks.py) file, to understand how they are defined. For multilingual evaluations the `prompt_function` should be implemented by language-adapted template. The template will take care of correct formatting, correct and consistent usage of language adjusted prompt anchors (e.g Question/Answer) and punctuation. @@ -58,7 +58,7 @@ your_tasks = [ LightevalTaskConfig( # Name of your evaluation name=f"evalname_{language.value}_{formulation.name.lower()}", - # The evaluation is community contributed + # The evaluation is community contributed suite=["community"], # This will automatically get the correct metrics for your chosen formulation metric=get_metrics_for_formulation( @@ -72,7 +72,7 @@ your_tasks = [ # In this function, you choose which template to follow and for which language and formulation prompt_function=get_template_prompt_function( language=language, - # then use the adapter to define the mapping between the + # then use the adapter to define the mapping between the # keys of the template (left), and the keys of your dataset # (right) # To know which template keys are required and available, @@ -83,9 +83,9 @@ your_tasks = [ }, formulation=formulation, ), - # You can also add specific filters to remove irrelevant samples + # You can also add specific filters to remove irrelevant samples hf_filter=lambda line: line["label"] in , - # You then select your huggingface dataset as well as + # You then select your huggingface dataset as well as # the splits available for evaluation hf_repo=, hf_subset=, diff --git a/docs/source/using-the-python-api.mdx b/docs/source/using-the-python-api.mdx index 8c44050f..583da5f5 100644 --- a/docs/source/using-the-python-api.mdx +++ b/docs/source/using-the-python-api.mdx @@ -35,7 +35,7 @@ def main(): env_config=EnvConfig(cache_dir="tmp/"), # Remove the 2 parameters below once your 
configuration is tested override_batch_size=1, - max_samples=10 + max_samples=10 ) model_config = VLLMModelConfig( diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 69532c09..834e8170 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -148,10 +148,10 @@ def task_registry(self): intersection = set(default_tasks_registry.keys()).intersection(set(custom_tasks_registry.keys())) if len(intersection) > 0: logger.warning( - f"Following tasks ({intersection}) exists both in the default and custom tasks. Will use the default ones on conflict." + f"Following tasks ({intersection}) exists both in the default and custom tasks. Will use the custom ones on conflict." ) - # Defaults tasks should overwrite custom tasks + # Custom tasks overwrite default tasks return {**default_tasks_registry, **custom_tasks_registry} @property