From 1f23dc89b6edc661395a3ddb21c4ba352758a39b Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Fri, 25 Oct 2024 11:41:30 -0500
Subject: [PATCH] fix(eval): add runtime.connect to all eval harness (#4565)

---
 docs/modules/usage/how-to/evaluation-harness.md | 2 ++
 evaluation/EDA/run_infer.py                     | 2 ++
 evaluation/agent_bench/run_infer.py             | 2 ++
 evaluation/aider_bench/run_infer.py             | 2 ++
 evaluation/biocoder/run_infer.py                | 3 ++-
 evaluation/bird/run_infer.py                    | 2 ++
 evaluation/gaia/run_infer.py                    | 2 ++
 evaluation/gorilla/run_infer.py                 | 2 ++
 evaluation/gpqa/run_infer.py                    | 3 ++-
 evaluation/humanevalfix/run_infer.py            | 2 ++
 evaluation/integration_tests/run_infer.py       | 2 ++
 evaluation/logic_reasoning/run_infer.py         | 2 ++
 evaluation/miniwob/run_infer.py                 | 2 ++
 evaluation/mint/run_infer.py                    | 2 ++
 evaluation/ml_bench/run_infer.py                | 2 ++
 evaluation/swe_bench/eval_infer.py              | 3 ++-
 evaluation/swe_bench/run_infer.py               | 2 ++
 evaluation/toolqa/run_infer.py                  | 2 ++
 evaluation/webarena/run_infer.py                | 2 ++
 openhands/core/main.py                          | 2 +-
 openhands/runtime/utils/shutdown_listener.py    | 8 ++++++--
 21 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/docs/modules/usage/how-to/evaluation-harness.md b/docs/modules/usage/how-to/evaluation-harness.md
index 32717675e3d0..daf144d11e88 100644
--- a/docs/modules/usage/how-to/evaluation-harness.md
+++ b/docs/modules/usage/how-to/evaluation-harness.md
@@ -134,9 +134,11 @@ To create an evaluation workflow for your benchmark, follow these steps:
 
 4. Create a function to process each instance:
    ```python
+   from openhands.utils.async_utils import call_async_from_sync
    def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
        config = get_config(instance, metadata)
        runtime = create_runtime(config)
+       call_async_from_sync(runtime.connect)
        initialize_runtime(runtime, instance)
 
        instruction = get_instruction(instance, metadata)
diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py
index 81c7455e0041..2c896939a751 100644
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -23,6 +23,7 @@
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import MessageAction
+from openhands.utils.async_utils import call_async_from_sync
 
 game = None
 
@@ -119,6 +120,7 @@ def process_instance(
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
     runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
 
     state: State | None = asyncio.run(
         run_controller(
diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/agent_bench/run_infer.py
index f0ea180f4ba0..d6fcc62e0798 100644
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -33,6 +33,7 @@
 from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
 
 
 def get_config(
@@ -210,6 +211,7 @@ def process_instance(
     # =============================================
 
     runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
 
     initialize_runtime(runtime, instance=instance)
 
diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/aider_bench/run_infer.py
index c59de4f441f8..fa1bb9534a83 100644
--- a/evaluation/aider_bench/run_infer.py
+++ b/evaluation/aider_bench/run_infer.py
@@ -33,6 +33,7 @@
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
 
 # Configure visibility of unit tests to the Agent.
 USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true'
@@ -207,6 +208,7 @@ def process_instance(
     # =============================================
 
     runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
 
     initialize_runtime(runtime, instance=instance)
 
diff --git a/evaluation/biocoder/run_infer.py b/evaluation/biocoder/run_infer.py
index 9dcff6d6ef42..4535ccba4e4e 100644
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -30,6 +30,7 @@
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': functools.partial(
@@ -275,7 +276,7 @@ def process_instance(
     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
 
     runtime = create_runtime(config)
-
+    call_async_from_sync(runtime.connect)
     initialize_runtime(runtime, instance)
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
diff --git a/evaluation/bird/run_infer.py b/evaluation/bird/run_infer.py
index 0fba8c5c6456..adb498cd2eb1 100644
--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -33,6 +33,7 @@
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
 
 
 def codeact_user_response(state: State) -> str:
@@ -403,6 +404,7 @@ def execute_sql(db_path, sql):
     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
 
     runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
     initialize_runtime(runtime, instance)
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
diff --git a/evaluation/gaia/run_infer.py b/evaluation/gaia/run_infer.py
index f5794d3ad3cb..c02cd0aee737 100644
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -29,6 +29,7 @@
 from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
 
 DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data')
 
@@ -142,6 +143,7 @@ def process_instance(
     logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
 
     runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
     initialize_runtime(runtime, instance)
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py
index ac49a8078d5b..873cb7f89694 100644
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -25,6 +25,7 @@
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import MessageAction
+from openhands.utils.async_utils import call_async_from_sync
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,
@@ -81,6 +82,7 @@ def process_instance(
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
     runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
     state: State | None = asyncio.run(
         run_controller(
             config=config,
diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py
index fe7ff4bf1c18..8fd4034c9d5e 100644
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -48,6 +48,7 @@
     MessageAction,
 )
 from openhands.events.observation import Observation
+from openhands.utils.async_utils import call_async_from_sync
 
 ACTION_FORMAT = """
 <<FINAL_ANSWER||
@@ -215,7 +216,7 @@ def process_instance(
 """
 
     runtime = create_runtime(config)
-
+    call_async_from_sync(runtime.connect)
     state: State | None = asyncio.run(
         run_controller(
             config=config,
diff --git a/evaluation/humanevalfix/run_infer.py b/evaluation/humanevalfix/run_infer.py
index 9b0abbc298a8..25fee65561fc 100644
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -38,6 +38,7 @@
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
 
 IMPORT_HELPER = {
     'python': [
@@ -233,6 +234,7 @@ def process_instance(
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
     runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
     initialize_runtime(runtime, instance)
     state: State | None = asyncio.run(
         run_controller(
diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py
index 5c42efb8026c..a530041f92f7 100644
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -25,6 +25,7 @@
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import MessageAction
 from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
 
 FAKE_RESPONSES = {
     'CodeActAgent': codeact_user_response,
@@ -101,6 +102,7 @@ def process_instance(
     # =============================================
 
     runtime: Runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
 
     test_class.initialize_runtime(runtime)
 
diff --git a/evaluation/logic_reasoning/run_infer.py b/evaluation/logic_reasoning/run_infer.py
index b07af7b08cdf..5b7d35f21130 100644
--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -30,6 +30,7 @@
 )
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,
@@ -202,6 +203,7 @@ def process_instance(
     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
 
     runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
     initialize_runtime(runtime, instance)
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py
index ccd36f2d389b..9c2aaf1e0963 100644
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -35,6 +35,7 @@
     BROWSER_EVAL_GET_GOAL_ACTION,
     BROWSER_EVAL_GET_REWARDS_ACTION,
 )
+from openhands.utils.async_utils import call_async_from_sync
 
 SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
 
@@ -127,6 +128,7 @@ def process_instance(
         logger.info(f'Starting evaluation for instance {env_id}.')
 
     runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
     task_str = initialize_runtime(runtime)
     state: State | None = asyncio.run(
         run_controller(
diff --git a/evaluation/mint/run_infer.py b/evaluation/mint/run_infer.py
index 80127420a431..8017b194d8d8 100644
--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -33,6 +33,7 @@
 )
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
 
 
 def codeact_user_response_mint(state: State, task: Task, task_config: dict[str, int]):
@@ -176,6 +177,7 @@ def process_instance(
     )
 
     runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
     initialize_runtime(runtime)
 
     state: State | None = asyncio.run(
diff --git a/evaluation/ml_bench/run_infer.py b/evaluation/ml_bench/run_infer.py
index 0f1ba0ab957a..deec068f3392 100644
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -42,6 +42,7 @@
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
 
 config = load_app_config()
 
@@ -233,6 +234,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
 
     runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
     initialize_runtime(runtime, instance)
 
     # Run the agent
diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py
index 14429fc9859d..cf6d71d3b3ee 100644
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -28,6 +28,7 @@
 from openhands.core.main import create_runtime
 from openhands.events.action import CmdRunAction
 from openhands.events.observation import CmdOutputObservation
+from openhands.utils.async_utils import call_async_from_sync
 
 # TODO: migrate all swe-bench docker to ghcr.io/openhands
 DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
@@ -128,7 +129,7 @@ def process_instance(
         )
 
     runtime = create_runtime(config)
-
+    call_async_from_sync(runtime.connect)
     # Get patch and save it to /tmp/patch.diff
     with tempfile.TemporaryDirectory() as temp_dir:
         # Patch file
diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index a05d17dd5c20..9ac1e0cf6639 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -35,6 +35,7 @@
 from openhands.events.serialization.event import event_to_dict
 from openhands.runtime.base import Runtime
 from openhands.runtime.utils.shutdown_listener import sleep_if_should_continue
+from openhands.utils.async_utils import call_async_from_sync
 
 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
 USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
@@ -380,6 +381,7 @@ def process_instance(
         logger.info(f'Starting evaluation for instance {instance.instance_id}.')
 
     runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
 
     try:
         initialize_runtime(runtime, instance)
diff --git a/evaluation/toolqa/run_infer.py b/evaluation/toolqa/run_infer.py
index 85f565120ac9..5c2c53422785 100644
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -26,6 +26,7 @@
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,
@@ -103,6 +104,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
     logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
 
     runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
     initialize_runtime(runtime)
 
     # Here's how you can run the agent (similar to the `main` function) and get the final task state
diff --git a/evaluation/webarena/run_infer.py b/evaluation/webarena/run_infer.py
index 2291aa08c8d4..cfc2bdae493a 100644
--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -35,6 +35,7 @@
     BROWSER_EVAL_GET_GOAL_ACTION,
     BROWSER_EVAL_GET_REWARDS_ACTION,
 )
+from openhands.utils.async_utils import call_async_from_sync
 
 SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
 
@@ -143,6 +144,7 @@ def process_instance(
         logger.info(f'Starting evaluation for instance {env_id}.')
 
     runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
     task_str = initialize_runtime(runtime)
 
     state: State | None = asyncio.run(
diff --git a/openhands/core/main.py b/openhands/core/main.py
index 23763bc45721..110856d6e66f 100644
--- a/openhands/core/main.py
+++ b/openhands/core/main.py
@@ -122,7 +122,7 @@ async def run_controller(
 
     if runtime is None:
         runtime = create_runtime(config, sid=sid)
-    await runtime.connect()
+        await runtime.connect()
 
     event_stream = runtime.event_stream
     # restore cli session if enabled
diff --git a/openhands/runtime/utils/shutdown_listener.py b/openhands/runtime/utils/shutdown_listener.py
index 9941c9f27372..3aedd2672270 100644
--- a/openhands/runtime/utils/shutdown_listener.py
+++ b/openhands/runtime/utils/shutdown_listener.py
@@ -4,6 +4,7 @@
 
 import asyncio
 import signal
+import threading
 import time
 from types import FrameType
 
@@ -29,8 +30,11 @@ def _register_signal_handlers():
     if _should_exit is not None:
         return
     _should_exit = False
-    for sig in HANDLED_SIGNALS:
-        _register_signal_handler(sig)
+
+    # signal.signal() only works in the main thread of the main interpreter
+    if threading.current_thread() is threading.main_thread():
+        for sig in HANDLED_SIGNALS:
+            _register_signal_handler(sig)
 
 
 def should_exit() -> bool: