update doc
Yunnglin committed Dec 31, 2024
1 parent c10595f commit 7c572d0
Showing 4 changed files with 9 additions and 9 deletions.
docs/en/advanced_guides/add_benchmark.md (2 additions, 2 deletions)

@@ -79,13 +79,13 @@ The sample code is as follows:
 ```python
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.metrics import WeightedAverageAccuracy
-from evalscope.models import MultiChoiceModelAdapter
+from evalscope.models import ChatGenerationModelAdapter


 @Benchmark.register(
     name='mmlu_pro',
     dataset_id='modelscope/mmlu-pro',
-    model_adapter=MultiChoiceModelAdapter,
+    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=[WeightedAverageAccuracy],
     few_shot_num=0,
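Written out as a standalone file, the registration pattern documented above might look roughly like the sketch below. Only the imports and the `@Benchmark.register(...)` arguments come from the diff; the adapter class name, its method name, and its signature are illustrative assumptions.

```python
# Hypothetical sketch of a benchmark registration using ChatGenerationModelAdapter.
# Only the decorator arguments and imports mirror the diff above; the adapter class,
# its method name, and its signature are assumptions made for illustration.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='mmlu_pro',
    dataset_id='modelscope/mmlu-pro',
    model_adapter=ChatGenerationModelAdapter,  # answers come from chat generation, not per-choice logit scoring
    subset_list=['default'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
)
class MMLUProAdapter(DataAdapter):
    """Illustrative stub: converts one dataset record into a plain-text prompt."""

    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
        # Assumed hook name; returns the prompt payload consumed by the model adapter.
        question = input_d.get('question', '')
        options = '\n'.join(input_d.get('options', []))
        return {'data': [f'{question}\n{options}']}
```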
docs/zh/advanced_guides/add_benchmark.md (2 additions, 2 deletions)

@@ -78,13 +78,13 @@ evalscope/benchmarks/
 ```python
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.metrics import WeightedAverageAccuracy
-from evalscope.models import MultiChoiceModelAdapter
+from evalscope.models import ChatGenerationModelAdapter


 @Benchmark.register(
     name='mmlu_pro',
     dataset_id='modelscope/mmlu-pro',
-    model_adapter=MultiChoiceModelAdapter,
+    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=[WeightedAverageAccuracy],
     few_shot_num=0,
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py (2 additions, 2 deletions)

@@ -103,9 +103,9 @@ def get_sys_prompt(inp: dict) -> str:
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
         context: str = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        full_prompt = prompt + context
+        full_prompt = context

-        return {'data': [full_prompt]}
+        return {'data': [full_prompt], 'system_prompt': prompt}

     def get_gold_answer(self, input_d: dict) -> list:
         # Get the gold choice
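The effect of the change above is that the instruction text (`prompt`) is no longer prepended to the few-shot context; it is returned under a separate `system_prompt` key, presumably so a chat-style model adapter can pass it as the system message. A minimal sketch of how such a payload could be turned into chat messages is below; the `build_messages` helper, the message layout, and the example strings are assumptions, not evalscope API.

```python
# Hypothetical illustration of consuming the new return shape; not evalscope's actual API.
def build_messages(payload: dict) -> list:
    """Turn {'data': [...], 'system_prompt': ...} into chat-completion messages."""
    messages = []
    system_prompt = payload.get('system_prompt')
    if system_prompt:
        # The instruction goes into the system role instead of being prepended to the user prompt.
        messages.append({'role': 'system', 'content': system_prompt})
    for prompt in payload['data']:
        messages.append({'role': 'user', 'content': prompt})
    return messages


# Example payload (illustrative values only).
payload = {
    'data': ['Q: In which year did the Apollo 11 mission land on the Moon?\nA:'],
    'system_prompt': 'Answer the question as concisely as possible.',
}
print(build_messages(payload))
```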
tests/cli/test_run.py (3 additions, 3 deletions)

@@ -73,16 +73,16 @@ def test_run_eval_with_args(self):
     def test_run_task(self):
         task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
                     'datasets': [
-                        'mmlu_pro',
+                        # 'mmlu_pro',
                         # 'bbh',
                         # 'hellaswag',
                         # 'gsm8k',
                         # 'arc'
                         # 'race',
                         # 'truthful_qa',
-                        # 'trivia_qa',
+                        'trivia_qa',
                     ],
-                    'limit': 2,
+                    'limit': 20,
                     'debug': True}
         run_task(task_cfg=task_cfg)

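Run standalone, the updated test configuration would look roughly like the sketch below. The model ID, dataset name, and option values are taken from the diff; the import path of `run_task` is an assumption, since the test file's imports are not part of this hunk.

```python
# Sketch of running the same evaluation outside the test suite.
# The import path below is assumed; it is not shown in this diff.
from evalscope.run import run_task

task_cfg = {
    'model': 'qwen/Qwen2-0.5B-Instruct',   # model evaluated by the test
    'datasets': ['trivia_qa'],             # the benchmark touched by this commit
    'limit': 20,                           # evaluate 20 samples per subset
    'debug': True,
}

if __name__ == '__main__':
    run_task(task_cfg=task_cfg)
```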
