Skip to content

Commit

Permalink
fix(eval): add runtime.connect to all eval harness (All-Hands-AI#4565)
Browse files Browse the repository at this point in the history
  • Loading branch information
xingyaoww authored Oct 25, 2024
1 parent 7340b78 commit 1f23dc8
Show file tree
Hide file tree
Showing 21 changed files with 45 additions and 6 deletions.
2 changes: 2 additions & 0 deletions docs/modules/usage/how-to/evaluation-harness.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,11 @@ To create an evaluation workflow for your benchmark, follow these steps:

4. Create a function to process each instance:
```python
from openhands.utils.async_utils import call_async_from_sync
def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
config = get_config(instance, metadata)
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

instruction = get_instruction(instance, metadata)
Expand Down
2 changes: 2 additions & 0 deletions evaluation/EDA/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import MessageAction
from openhands.utils.async_utils import call_async_from_sync

game = None

Expand Down Expand Up @@ -119,6 +120,7 @@ def process_instance(

# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

state: State | None = asyncio.run(
run_controller(
Expand Down
2 changes: 2 additions & 0 deletions evaluation/agent_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync


def get_config(
Expand Down Expand Up @@ -210,6 +211,7 @@ def process_instance(
# =============================================

runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

initialize_runtime(runtime, instance=instance)

Expand Down
2 changes: 2 additions & 0 deletions evaluation/aider_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

# Configure visibility of unit tests to the Agent.
USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true'
Expand Down Expand Up @@ -207,6 +208,7 @@ def process_instance(
# =============================================

runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

initialize_runtime(runtime, instance=instance)

Expand Down
3 changes: 2 additions & 1 deletion evaluation/biocoder/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': functools.partial(
Expand Down Expand Up @@ -275,7 +276,7 @@ def process_instance(
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

runtime = create_runtime(config)

call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/bird/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync


def codeact_user_response(state: State) -> str:
Expand Down Expand Up @@ -403,6 +404,7 @@ def execute_sql(db_path, sql):
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data')

Expand Down Expand Up @@ -142,6 +143,7 @@ def process_instance(
logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/gorilla/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import MessageAction
from openhands.utils.async_utils import call_async_from_sync

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
Expand Down Expand Up @@ -81,6 +82,7 @@ def process_instance(

# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
state: State | None = asyncio.run(
run_controller(
config=config,
Expand Down
3 changes: 2 additions & 1 deletion evaluation/gpqa/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
MessageAction,
)
from openhands.events.observation import Observation
from openhands.utils.async_utils import call_async_from_sync

ACTION_FORMAT = """
<<FINAL_ANSWER||
Expand Down Expand Up @@ -215,7 +216,7 @@ def process_instance(
"""

runtime = create_runtime(config)

call_async_from_sync(runtime.connect)
state: State | None = asyncio.run(
run_controller(
config=config,
Expand Down
2 changes: 2 additions & 0 deletions evaluation/humanevalfix/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

IMPORT_HELPER = {
'python': [
Expand Down Expand Up @@ -233,6 +234,7 @@ def process_instance(

# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)
state: State | None = asyncio.run(
run_controller(
Expand Down
2 changes: 2 additions & 0 deletions evaluation/integration_tests/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import MessageAction
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

FAKE_RESPONSES = {
'CodeActAgent': codeact_user_response,
Expand Down Expand Up @@ -101,6 +102,7 @@ def process_instance(
# =============================================

runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

test_class.initialize_runtime(runtime)

Expand Down
2 changes: 2 additions & 0 deletions evaluation/logic_reasoning/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
)
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
Expand Down Expand Up @@ -202,6 +203,7 @@ def process_instance(
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/miniwob/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
BROWSER_EVAL_GET_GOAL_ACTION,
BROWSER_EVAL_GET_REWARDS_ACTION,
)
from openhands.utils.async_utils import call_async_from_sync

SUPPORTED_AGENT_CLS = {'BrowsingAgent'}

Expand Down Expand Up @@ -127,6 +128,7 @@ def process_instance(
logger.info(f'Starting evaluation for instance {env_id}.')

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
task_str = initialize_runtime(runtime)
state: State | None = asyncio.run(
run_controller(
Expand Down
2 changes: 2 additions & 0 deletions evaluation/mint/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
)
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync


def codeact_user_response_mint(state: State, task: Task, task_config: dict[str, int]):
Expand Down Expand Up @@ -176,6 +177,7 @@ def process_instance(
)

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime)

state: State | None = asyncio.run(
Expand Down
2 changes: 2 additions & 0 deletions evaluation/ml_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

config = load_app_config()

Expand Down Expand Up @@ -233,6 +234,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Run the agent
Expand Down
3 changes: 2 additions & 1 deletion evaluation/swe_bench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from openhands.core.main import create_runtime
from openhands.events.action import CmdRunAction
from openhands.events.observation import CmdOutputObservation
from openhands.utils.async_utils import call_async_from_sync

# TODO: migrate all swe-bench docker to ghcr.io/openhands
DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
Expand Down Expand Up @@ -128,7 +129,7 @@ def process_instance(
)

runtime = create_runtime(config)

call_async_from_sync(runtime.connect)
# Get patch and save it to /tmp/patch.diff
with tempfile.TemporaryDirectory() as temp_dir:
# Patch file
Expand Down
2 changes: 2 additions & 0 deletions evaluation/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from openhands.events.serialization.event import event_to_dict
from openhands.runtime.base import Runtime
from openhands.runtime.utils.shutdown_listener import sleep_if_should_continue
from openhands.utils.async_utils import call_async_from_sync

USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
Expand Down Expand Up @@ -380,6 +381,7 @@ def process_instance(
logger.info(f'Starting evaluation for instance {instance.instance_id}.')

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

try:
initialize_runtime(runtime, instance)
Expand Down
2 changes: 2 additions & 0 deletions evaluation/toolqa/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
Expand Down Expand Up @@ -103,6 +104,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/webarena/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
BROWSER_EVAL_GET_GOAL_ACTION,
BROWSER_EVAL_GET_REWARDS_ACTION,
)
from openhands.utils.async_utils import call_async_from_sync

SUPPORTED_AGENT_CLS = {'BrowsingAgent'}

Expand Down Expand Up @@ -143,6 +144,7 @@ def process_instance(
logger.info(f'Starting evaluation for instance {env_id}.')

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
task_str = initialize_runtime(runtime)

state: State | None = asyncio.run(
Expand Down
2 changes: 1 addition & 1 deletion openhands/core/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ async def run_controller(

if runtime is None:
runtime = create_runtime(config, sid=sid)
await runtime.connect()
await runtime.connect()

event_stream = runtime.event_stream
# restore cli session if enabled
Expand Down
8 changes: 6 additions & 2 deletions openhands/runtime/utils/shutdown_listener.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import asyncio
import signal
import threading
import time
from types import FrameType

Expand All @@ -29,8 +30,11 @@ def _register_signal_handlers():
if _should_exit is not None:
return
_should_exit = False
for sig in HANDLED_SIGNALS:
_register_signal_handler(sig)

# signal.signal() may only be called from the main thread of the main
# interpreter (it raises ValueError elsewhere), so only register handlers there
if threading.current_thread() is threading.main_thread():
for sig in HANDLED_SIGNALS:
_register_signal_handler(sig)


def should_exit() -> bool:
Expand Down

0 comments on commit 1f23dc8

Please sign in to comment.