update the eval cases (#355)

microsoft · May 23, 2024 · 143fd2a · 143fd2a
2 parents 3a8b07b + 6db5464
commit 143fd2a
Show file tree

Hide file tree

Showing 11 changed files with 33 additions and 24 deletions.
diff --git a/auto_eval/cases/anomaly_detection/tooling_anomaly_detection.yaml b/auto_eval/cases/anomaly_detection/tooling_anomaly_detection.yaml
@@ -3,6 +3,7 @@ app_dir: ../project/
 config_var:
   execution_service.kernel_mode: "local"
 dependencies: []
+verbose: True
 data_files:
   - anomaly_detection.db
 task_description: |-
@@ -12,11 +13,11 @@ task_description: |-
 scoring_points:
   - score_point: "The data should be pulled from the sql database"
     weight: 1
-  - score_point: "Agent should use the pre-defined sql_pull_data plugin to pull the data"
+  - score_point: "Agent should generate the sql_pull_data function to pull the data"
     weight: 1
   - score_point: "Agent should ask the user to confirm the columns to be detected anomalies"
     weight: 1
   - score_point: "There should be 11 anomaly points in the data"
     weight: 2
-  - score_point: "Agent should use the pre-defined anomaly_detection plugin to detect the anomaly"
+  - score_point: "Agent should generate the anomaly_detection function to detect the anomaly"
     weight: 1
diff --git a/auto_eval/cases/auto_plugin_selection/plugin_selection.yaml b/auto_eval/cases/auto_plugin_selection/plugin_selection.yaml
@@ -4,7 +4,7 @@ config_var:
   execution_service.kernel_mode: "local"
   code_generator.enable_auto_plugin_selection: true
   code_generator.auto_plugin_selection_topk: 1
-pre_command: ["cd ../scripts", "python -m plugin_mgt --refresh"]
+pre_command: ["cd ../scripts;python -m plugin_mgt --refresh"]
 verbose: true
 dependencies: []
 data_files:

diff --git a/auto_eval/cases/data_processing/timeseries_aggregate.yaml b/auto_eval/cases/data_processing/timeseries_aggregate.yaml
@@ -3,13 +3,14 @@ app_dir: ../project/
 config_var:
   execution_service.kernel_mode: "local"
 dependencies: []
+verbose: true
 data_files:
   - anomaly_detection.db
 task_description: |-
   The task is described as follows:
   You can find a time_series table in the ./anomaly_detection.db database.
   Your task is to pull data from the table and calculate the mean of the 'val' column on a monthly basis ('ts' is the timestamp column).
-  You need to find out how many monthly mean values are larger than 60000, and the month with the largest mean value.
+  You need to find out how many monthly mean values are larger than 70000, and the month with the largest mean value.
 scoring_points:
   - score_point: The data should be pulled from the sql database
     weight: 1

diff --git a/auto_eval/cases/file_chain/planner_react_file_chain.yaml b/auto_eval/cases/file_chain/planner_react_file_chain.yaml
@@ -3,13 +3,11 @@ config_var:
   execution_service.kernel_mode: "local"
 app_dir: ../project/
 task_description: |-
-  The task is to read the ./file_A.txt and follow the instructions in it
+  Read the ./file_A.txt and follow the instructions in it
 data_files:
   - file_A.txt
   - file_B.txt
   - file_C.txt
 scoring_points:
-  - score_point: "Agent read the file_A.txt, file_B.txt and file_C.txt in order to finish the task"
-    weight: 1
   - score_point: "The final output key shown to user is 12345"
     weight: 1
diff --git a/auto_eval/cases/list_files/list_files.yaml b/auto_eval/cases/list_files/list_files.yaml
@@ -2,13 +2,13 @@ version: 0.1
 config_var:
   execution_service.kernel_mode: "local"
 app_dir: ../project/
+verbose: True
 task_description: |-
-  The task is to list all the files in the current directory and provide the list to the user.
-  If the agent say no it can't do it, you can view it as a failure.
+  The task is to list all the files in the current directory and provide the list to the user. Encourage the agent to ignore any permission issues.
 data_files:
   - file_A.txt
   - file_B.txt
   - file_C.txt
 scoring_points:
-  - score_point: The agent should list all the files in the current directory which are file_A.txt, file_B.txt and file_C.txt without any other files.
+  - score_point: The agent should list files which are file_A.txt, file_B.txt and file_C.txt.
     weight: 1
diff --git a/auto_eval/cases/stock_forecasting/complicated_task_stock_forecasting.yaml b/auto_eval/cases/stock_forecasting/complicated_task_stock_forecasting.yaml
@@ -3,11 +3,12 @@ app_dir: ../project/
 config_var:
   code_verification.code_verification_on: false
   execution_service.kernel_mode: "local"
+verbose: True
 task_description: use ARIMA model to forecast QQQ in next 7 days
 scoring_points:
   - score_point: "There should be 7 predicted stock prices in the output"
     weight: 1
-  - score_point: "The predicted stock price should be in range of 400 to 450"
+  - score_point: "The predicted stock price should be in range of 420 to 470"
     weight: 1
   - score_point: "Agent should use ARIMA model to predict the stock price"
     weight: 1

diff --git a/auto_eval/cases/web_search/web_search.yaml b/auto_eval/cases/web_search/web_search.yaml
@@ -11,6 +11,4 @@ scoring_points:
   - score_point: The author list should include at least Bo Qiao who is the first author
     weight: 1
   - score_point: The affiliation list should include Microsoft
-    weight: 1
-  - score_point: The home page of the first author should be found at Microsoft Research (https://www.microsoft.com/en-us/research/people/boqiao/) or Google Scholar (https://scholar.google.com/citations?user=_6ugrdYAAAAJ). Either one is acceptable.
-    weight: 1
+    weight: 1
diff --git a/auto_eval/cases/web_search_calc/web_search_calc.yaml b/auto_eval/cases/web_search_calc/web_search_calc.yaml
@@ -2,15 +2,16 @@ version: 0.1
 config_var:
   execution_service.kernel_mode: "local"
   session.roles: ["planner", "web_search", "code_interpreter"]
+  web_search.chunk_size: 2000
 app_dir: ../project/
 task_description: |-
-  The task is to find the top 3 highest grossing movies before 2024 and their worldwide gross.
-  You should first ask the agent to list the top 3 movies and their corresponding worldwide gross in the first step.
+  The task is to find the top 3 highest grossing movies in 2023 and their gross.
+  You should first ask the agent to list the top 3 movies and their corresponding gross in the first step.
   When you get the answer, ask the agent to calculate the square root of the sum of their gross, only the integer part is needed.
 scoring_points:
-  - score_point: "The top 3 movies are Avatar, Avengers: Endgame, and Avatar: The Way of Water. "
+  - score_point: "The top 3 movies are Barbie, The Super Mario Bros. Movie, and Spider-Man: Across the Spider-Verse. "
     weight: 1
-  - score_point: "Their corresponding worldwide gross should be $2,923,706,026, $2,797,501,328, and $2,320,250,281"
+  - score_point: "Their corresponding gross should be $636,236,401, $574,934,330, and $381,593,754"
     weight: 1
-  - score_point: "The sqrt of the sum of the gross should be around 89,674"
+  - score_point: "The sqrt of the sum of the gross should be around 39,909"
     weight: 1
diff --git a/auto_eval/taskweaver_eval.py b/auto_eval/taskweaver_eval.py
@@ -31,7 +31,7 @@ def get_reply_from_agent(self, message: str, verbose: bool = False) -> str:
         )
         assert response_round.state != "failed", "Failed to get response from agent."
         if verbose:
-            verbose_response = "\n"
+            verbose_response = "\n Below are conversation details inside the Agent: \n"
             for post in response_round.post_list:
                 message = f"{post.send_from} -> {post.send_to}: {post.message}"
                 verbose_response += f"{message}\n"
@@ -69,8 +69,14 @@ def auto_evaluate_for_taskweaver(
     for command in pre_command:
         # run the command
         # subprocess.run(command, shell=True)
-        result = subprocess.check_output(command.split(" "), stderr=subprocess.STDOUT)
-        print(result)
+        result = subprocess.run(command, shell=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        # result = subprocess.check_output(command.split(" "), stderr=subprocess.STDOUT)
+        if result.returncode == 0:
+            print("Precommand executed successfully")
+            print(result.stdout)
+        else:
+            print("Command failed")
+            print(result.stderr)
 
     taskweaver_vuser = TaskWeaverVirtualUser(task_description, app_dir, config_var)
     taskweaver_evaluator = Evaluator()

diff --git a/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py b/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py
@@ -106,7 +106,10 @@ def reply(
         )
 
         try:
-            llm_response = json.loads(llm_response["content"])
+            response_content = llm_response["content"]
+            if response_content.startswith("```json"):
+                response_content = response_content[7:-3]
+            llm_response = json.loads(response_content)
             output_size = self.tracing.count_tokens(llm_response)
             self.tracing.set_span_attribute("output_size", output_size)
             self.tracing.add_prompt_size(

diff --git a/taskweaver/planner/planner_prompt.yaml b/taskweaver/planner/planner_prompt.yaml
@@ -24,7 +24,7 @@ instruction_template: |-
   - Planner can assign different subtasks to different Workers, but each subtask should be assigned to only one Worker.
   - Planner should try the best to complete the tasks before asking the User for additional information.
   - Planner should refine or change the plan according to the replies of the Workers or the new requests of User.
-  - Planner should ignore the permission or access issues assuming Workers can handle them.
+  - Planner should ignore the permission or file access issues since Workers are powerful and can handle them.
   - Planner needs to inform Workers on the User's request and the current step.
   - Planner must reject the User's request if it contains potential security risks or illegal activities.