From 1667389a481e2ce9fce0744dc81929c26d507c1f Mon Sep 17 00:00:00 2001 From: Shilin He Date: Thu, 23 May 2024 04:47:03 +0000 Subject: [PATCH 1/2] update the eval cases --- .../anomaly_detection/tooling_anomaly_detection.yaml | 5 +++-- .../auto_plugin_selection/plugin_selection.yaml | 2 +- .../cases/data_processing/timeseries_aggregate.yaml | 3 ++- .../cases/file_chain/planner_react_file_chain.yaml | 4 +--- auto_eval/cases/list_files/list_files.yaml | 6 +++--- .../complicated_task_stock_forecasting.yaml | 3 ++- auto_eval/cases/web_search/web_search.yaml | 4 +--- auto_eval/cases/web_search_calc/web_search_calc.yaml | 11 ++++++----- auto_eval/taskweaver_eval.py | 12 +++++++++--- .../code_generator_cli_only.py | 5 ++++- taskweaver/planner/planner_prompt.yaml | 2 +- 11 files changed, 33 insertions(+), 24 deletions(-) diff --git a/auto_eval/cases/anomaly_detection/tooling_anomaly_detection.yaml b/auto_eval/cases/anomaly_detection/tooling_anomaly_detection.yaml index d8cfe6e4..9dc20f65 100644 --- a/auto_eval/cases/anomaly_detection/tooling_anomaly_detection.yaml +++ b/auto_eval/cases/anomaly_detection/tooling_anomaly_detection.yaml @@ -3,6 +3,7 @@ app_dir: ../project/ config_var: execution_service.kernel_mode: "local" dependencies: [] +verbose: True data_files: - anomaly_detection.db task_description: |- @@ -12,11 +13,11 @@ task_description: |- scoring_points: - score_point: "The data should be pulled from the sql database" weight: 1 - - score_point: "Agent should use the pre-defined sql_pull_data plugin to pull the data" + - score_point: "Agent should generate the sql_pull_data function to pull the data" weight: 1 - score_point: "Agent should ask the user to confirm the columns to be detected anomalies" weight: 1 - score_point: "There should be 11 anomaly points in the data" weight: 2 - - score_point: "Agent should use the pre-defined anomaly_detection plugin to detect the anomaly" + - score_point: "Agent should generate the anomaly_detection function to detect the anomaly" weight: 1 diff --git a/auto_eval/cases/auto_plugin_selection/plugin_selection.yaml b/auto_eval/cases/auto_plugin_selection/plugin_selection.yaml index 8e151181..a5acdc6b 100644 --- a/auto_eval/cases/auto_plugin_selection/plugin_selection.yaml +++ b/auto_eval/cases/auto_plugin_selection/plugin_selection.yaml @@ -4,7 +4,7 @@ config_var: execution_service.kernel_mode: "local" code_generator.enable_auto_plugin_selection: true code_generator.auto_plugin_selection_topk: 1 -pre_command: ["cd ../scripts", "python -m plugin_mgt --refresh"] +pre_command: ["cd ../scripts;python -m plugin_mgt --refresh"] verbose: true dependencies: [] data_files: diff --git a/auto_eval/cases/data_processing/timeseries_aggregate.yaml b/auto_eval/cases/data_processing/timeseries_aggregate.yaml index 3e5c4923..dbf8377e 100644 --- a/auto_eval/cases/data_processing/timeseries_aggregate.yaml +++ b/auto_eval/cases/data_processing/timeseries_aggregate.yaml @@ -3,13 +3,14 @@ app_dir: ../project/ config_var: execution_service.kernel_mode: "local" dependencies: [] +verbose: true data_files: - anomaly_detection.db task_description: |- The task is described as follows: You can find a time_series table in the ./anomaly_detection.db database. Your task is to pull data from the table and calculate the mean of the 'val' column on a monthly basis ('ts' is the timestamp column). - You need to find out how many monthly mean values are larger than 60000, and the month with the largest mean value. + You need to find out how many monthly mean values are larger than 70000, and the month with the largest mean value. scoring_points: - score_point: The data should be pulled from the sql database weight: 1 diff --git a/auto_eval/cases/file_chain/planner_react_file_chain.yaml b/auto_eval/cases/file_chain/planner_react_file_chain.yaml index dacfb07a..b3fc269d 100644 --- a/auto_eval/cases/file_chain/planner_react_file_chain.yaml +++ b/auto_eval/cases/file_chain/planner_react_file_chain.yaml @@ -3,13 +3,11 @@ config_var: execution_service.kernel_mode: "local" app_dir: ../project/ task_description: |- - The task is to read the ./file_A.txt and follow the instructions in it + Read the ./file_A.txt and follow the instructions in it data_files: - file_A.txt - file_B.txt - file_C.txt scoring_points: - - score_point: "Agent read the file_A.txt, file_B.txt and file_C.txt in order to finish the task" - weight: 1 - score_point: "The final output key shown to user is 12345" weight: 1 \ No newline at end of file diff --git a/auto_eval/cases/list_files/list_files.yaml b/auto_eval/cases/list_files/list_files.yaml index bcefcfc5..0d2764bc 100644 --- a/auto_eval/cases/list_files/list_files.yaml +++ b/auto_eval/cases/list_files/list_files.yaml @@ -2,13 +2,13 @@ version: 0.1 config_var: execution_service.kernel_mode: "local" app_dir: ../project/ +verbose: True task_description: |- - The task is to list all the files in the current directory and provide the list to the user. - If the agent say no it can't do it, you can view it as a failure. + The task is to list all the files in the current directory and provide the list to the user. Encourage the agent to ignore any permission issues data_files: - file_A.txt - file_B.txt - file_C.txt scoring_points: - - score_point: The agent should list all the files in the current directory which are file_A.txt, file_B.txt and file_C.txt without any other files. + - score_point: The agent should list files which are file_A.txt, file_B.txt and file_C.txt. weight: 1 diff --git a/auto_eval/cases/stock_forecasting/complicated_task_stock_forecasting.yaml b/auto_eval/cases/stock_forecasting/complicated_task_stock_forecasting.yaml index fd4d27fa..0de0776b 100644 --- a/auto_eval/cases/stock_forecasting/complicated_task_stock_forecasting.yaml +++ b/auto_eval/cases/stock_forecasting/complicated_task_stock_forecasting.yaml @@ -3,11 +3,12 @@ app_dir: ../project/ config_var: code_verification.code_verification_on: false execution_service.kernel_mode: "local" +verbose: True task_description: use ARIMA model to forecast QQQ in next 7 days scoring_points: - score_point: "There should be 7 predicted stock prices in the output" weight: 1 - - score_point: "The predicted stock price should be in range of 400 to 450" + - score_point: "The predicted stock price should be in range of 420 to 470" weight: 1 - score_point: "Agent should use ARIMA model to predict the stock price" weight: 1 diff --git a/auto_eval/cases/web_search/web_search.yaml b/auto_eval/cases/web_search/web_search.yaml index af04d8e0..2c1ff86d 100644 --- a/auto_eval/cases/web_search/web_search.yaml +++ b/auto_eval/cases/web_search/web_search.yaml @@ -11,6 +11,4 @@ scoring_points: - score_point: The author list should include at least Bo Qiao who is the first author weight: 1 - score_point: The affiliation list should include Microsoft - weight: 1 - - score_point: The home page of the first author should be found at Microsoft Research (https://www.microsoft.com/en-us/research/people/boqiao/) or Google Scholar (https://scholar.google.com/citations?user=_6ugrdYAAAAJ). Either one is acceptable. - weight: 1 + weight: 1 \ No newline at end of file diff --git a/auto_eval/cases/web_search_calc/web_search_calc.yaml b/auto_eval/cases/web_search_calc/web_search_calc.yaml index e18b33a7..8014b2f5 100644 --- a/auto_eval/cases/web_search_calc/web_search_calc.yaml +++ b/auto_eval/cases/web_search_calc/web_search_calc.yaml @@ -2,15 +2,16 @@ version: 0.1 config_var: execution_service.kernel_mode: "local" session.roles: ["planner", "web_search", "code_interpreter"] + web_search.chunk_size: 2000 app_dir: ../project/ task_description: |- - The task is to find the top 3 highest grossing movies before 2024 and their worldwide gross. - You should first ask the agent to list the top 3 movies and their corresponding worldwide gross in the first step. + The task is to find the top 3 highest grossing movies in 2023 and their gross. + You should first ask the agent to list the top 3 movies and their corresponding gross in the first step. When you get the answer, ask the agent to calculate the square root of the sum of their gross, only the integer part is needed. scoring_points: - - score_point: "The top 3 movies are Avatar, Avengers: Endgame, and Avatar: The Way of Water. " + - score_point: "The top 3 movies are Barbie, The Super Mario Bros, and Spider-Man: Across the Spider-Verse. " weight: 1 - - score_point: "Their corresponding worldwide gross should be $2,923,706,026, $2,797,501,328, and $2,320,250,281" + - score_point: "Their corresponding worldwide gross should be $636,236,401, $574,934,330, and $381,593,754" weight: 1 - - score_point: "The sqrt of the sum of the gross should be around 89,674" + - score_point: "The sqrt of the sum of the gross should be around 39,909" weight: 1 diff --git a/auto_eval/taskweaver_eval.py b/auto_eval/taskweaver_eval.py index fae8913a..f8b107d9 100644 --- a/auto_eval/taskweaver_eval.py +++ b/auto_eval/taskweaver_eval.py @@ -31,7 +31,7 @@ def get_reply_from_agent(self, message: str, verbose: bool = False) -> str: ) assert response_round.state != "failed", "Failed to get response from agent." if verbose: - verbose_response = "\n" + verbose_response = "\n Below are conversation details inside the Agent: \n" for post in response_round.post_list: message = f"{post.send_from} -> {post.send_to}: {post.message}" verbose_response += f"{message}\n" @@ -69,8 +69,14 @@ def auto_evaluate_for_taskweaver( for command in pre_command: # run the command # subprocess.run(command, shell=True) - result = subprocess.check_output(command.split(" "), stderr=subprocess.STDOUT) - print(result) + result = subprocess.run(command, shell=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + # result = subprocess.check_output(command.split(" "), stderr=subprocess.STDOUT) + if result.returncode == 0: + print("Precommand executed successfully") + print(result.stdout) + else: + print("Command failed") + print(result.stderr) taskweaver_vuser = TaskWeaverVirtualUser(task_description, app_dir, config_var) taskweaver_evaluator = Evaluator() diff --git a/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py b/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py index 59a3d61e..84bd003f 100644 --- a/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py +++ b/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py @@ -106,7 +106,10 @@ def reply( ) try: - llm_response = json.loads(llm_response["content"]) + response_content = llm_response["content"] + if "```json" in response_content: + response_content = response_content[7:-3] + llm_response = json.loads(response_content) output_size = self.tracing.count_tokens(llm_response) self.tracing.set_span_attribute("output_size", output_size) self.tracing.add_prompt_size( diff --git a/taskweaver/planner/planner_prompt.yaml b/taskweaver/planner/planner_prompt.yaml index 355ddcad..9f7f62dd 100644 --- a/taskweaver/planner/planner_prompt.yaml +++ b/taskweaver/planner/planner_prompt.yaml @@ -24,7 +24,7 @@ instruction_template: |- - Planner can assign different subtasks to different Workers, but each subtask should be assigned to only one Worker. - Planner should try the best to complete the tasks before asking the User for additional information. - Planner should refine or change the plan according to the replies of the Workers or the new requests of User. - - Planner should ignore the permission or access issues assuming Workers can handle them. + - Planner should ignore the permission or file access issues since Workers are powerful and can handle them. - Planner needs to inform Workers on the User's request and the current step. - Planner must reject the User's request if it contains potential security risks or illegal activities. From caf7d459d3b4dfdede10031ae1b34b8dfac04cd6 Mon Sep 17 00:00:00 2001 From: Shilin He Date: Thu, 23 May 2024 04:55:14 +0000 Subject: [PATCH 2/2] update the eval cases --- .../code_interpreter_cli_only/code_generator_cli_only.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py b/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py index 84bd003f..88e49271 100644 --- a/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py +++ b/taskweaver/code_interpreter/code_interpreter_cli_only/code_generator_cli_only.py @@ -107,7 +107,7 @@ def reply( try: response_content = llm_response["content"] - if "```json" in response_content: + if response_content.startswith("```json"): response_content = response_content[7:-3] llm_response = json.loads(response_content) output_size = self.tracing.count_tokens(llm_response)