From 1f23dc89b6edc661395a3ddb21c4ba352758a39b Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 25 Oct 2024 11:41:30 -0500 Subject: [PATCH] fix(eval): add runtime.connect to all eval harness (#4565) --- docs/modules/usage/how-to/evaluation-harness.md | 2 ++ evaluation/EDA/run_infer.py | 2 ++ evaluation/agent_bench/run_infer.py | 2 ++ evaluation/aider_bench/run_infer.py | 2 ++ evaluation/biocoder/run_infer.py | 3 ++- evaluation/bird/run_infer.py | 2 ++ evaluation/gaia/run_infer.py | 2 ++ evaluation/gorilla/run_infer.py | 2 ++ evaluation/gpqa/run_infer.py | 3 ++- evaluation/humanevalfix/run_infer.py | 2 ++ evaluation/integration_tests/run_infer.py | 2 ++ evaluation/logic_reasoning/run_infer.py | 2 ++ evaluation/miniwob/run_infer.py | 2 ++ evaluation/mint/run_infer.py | 2 ++ evaluation/ml_bench/run_infer.py | 2 ++ evaluation/swe_bench/eval_infer.py | 3 ++- evaluation/swe_bench/run_infer.py | 2 ++ evaluation/toolqa/run_infer.py | 2 ++ evaluation/webarena/run_infer.py | 2 ++ openhands/core/main.py | 2 +- openhands/runtime/utils/shutdown_listener.py | 8 ++++++-- 21 files changed, 45 insertions(+), 6 deletions(-) diff --git a/docs/modules/usage/how-to/evaluation-harness.md b/docs/modules/usage/how-to/evaluation-harness.md index 32717675e3d0..daf144d11e88 100644 --- a/docs/modules/usage/how-to/evaluation-harness.md +++ b/docs/modules/usage/how-to/evaluation-harness.md @@ -134,9 +134,11 @@ To create an evaluation workflow for your benchmark, follow these steps: 4. Create a function to process each instance: ```python + from openhands.utils.async_utils import call_async_from_sync def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput: config = get_config(instance, metadata) runtime = create_runtime(config) + call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance) instruction = get_instruction(instance, metadata) diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py index 81c7455e0041..2c896939a751 100644 --- a/evaluation/EDA/run_infer.py +++ b/evaluation/EDA/run_infer.py @@ -23,6 +23,7 @@ from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import MessageAction +from openhands.utils.async_utils import call_async_from_sync game = None @@ -119,6 +120,7 @@ def process_instance( # Here's how you can run the agent (similar to the `main` function) and get the final task state runtime = create_runtime(config) + call_async_from_sync(runtime.connect) state: State | None = asyncio.run( run_controller( diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/agent_bench/run_infer.py index f0ea180f4ba0..d6fcc62e0798 100644 --- a/evaluation/agent_bench/run_infer.py +++ b/evaluation/agent_bench/run_infer.py @@ -33,6 +33,7 @@ from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync def get_config( @@ -210,6 +211,7 @@ def process_instance( # ============================================= runtime: Runtime = create_runtime(config) + call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance=instance) diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/aider_bench/run_infer.py index c59de4f441f8..fa1bb9534a83 100644 --- a/evaluation/aider_bench/run_infer.py +++ b/evaluation/aider_bench/run_infer.py @@ -33,6 +33,7 @@ from openhands.events.action import CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync # Configure visibility of unit tests to the Agent. USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true' @@ -207,6 +208,7 @@ def process_instance( # ============================================= runtime: Runtime = create_runtime(config) + call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance=instance) diff --git a/evaluation/biocoder/run_infer.py b/evaluation/biocoder/run_infer.py index 9dcff6d6ef42..4535ccba4e4e 100644 --- a/evaluation/biocoder/run_infer.py +++ b/evaluation/biocoder/run_infer.py @@ -30,6 +30,7 @@ from openhands.events.action import CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': functools.partial( @@ -275,7 +276,7 @@ def process_instance( instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class] runtime = create_runtime(config) - + call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state diff --git a/evaluation/bird/run_infer.py b/evaluation/bird/run_infer.py index 0fba8c5c6456..adb498cd2eb1 100644 --- a/evaluation/bird/run_infer.py +++ b/evaluation/bird/run_infer.py @@ -33,6 +33,7 @@ from openhands.events.action import CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync def codeact_user_response(state: State) -> str: @@ -403,6 +404,7 @@ def execute_sql(db_path, sql): instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class] runtime = create_runtime(config) + call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state diff --git a/evaluation/gaia/run_infer.py b/evaluation/gaia/run_infer.py index f5794d3ad3cb..c02cd0aee737 100644 --- a/evaluation/gaia/run_infer.py +++ b/evaluation/gaia/run_infer.py @@ -29,6 +29,7 @@ from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction from openhands.events.observation import CmdOutputObservation from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data') @@ -142,6 +143,7 @@ def process_instance( logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'}) runtime = create_runtime(config) + call_async_from_sync(runtime.connect) initialize_runtime(runtime, instance) # Here's how you can run the agent (similar to the `main` function) and get the final task state diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py index ac49a8078d5b..873cb7f89694 100644 --- a/evaluation/gorilla/run_infer.py +++ b/evaluation/gorilla/run_infer.py @@ -25,6 +25,7 @@ from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import MessageAction +from openhands.utils.async_utils import call_async_from_sync AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': codeact_user_response, @@ -81,6 +82,7 @@ def process_instance( # Here's how you can run the agent (similar to the `main` function) and get the final task state runtime = create_runtime(config) + call_async_from_sync(runtime.connect) state: State | None = asyncio.run( run_controller( config=config, diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py index fe7ff4bf1c18..8fd4034c9d5e 100644 --- a/evaluation/gpqa/run_infer.py +++ b/evaluation/gpqa/run_infer.py @@ -48,6 +48,7 @@ MessageAction, ) from openhands.events.observation import Observation +from openhands.utils.async_utils import call_async_from_sync ACTION_FORMAT = """ < bool: