# [Eval] DiscoveryBench OpenHands Integration #4
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below.
# To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow that builds, tests and then pushes the OpenHands and runtime
# docker images to the ghcr.io repository.
name: Build, Test and Publish RT Image

# Triggers:
#   - every push to "main"
#   - every tag push
#   - every pull request
#   - manual dispatch (a reason must be supplied)
on:
  push:
    branches:
      - main
    tags:
      - '*'
  pull_request:
  workflow_dispatch:
    inputs:
      reason:
        description: 'Reason for manual trigger'
        required: true
        default: ''

# Runs triggered by a PR share one group, so an in-progress run for the same
# PR is cancelled. Every other run falls back to its run id and therefore
# lands in its own unique group.
concurrency:
  group: ${{ github.workflow }}-${{ (github.head_ref && github.ref) || github.run_id }}
  cancel-in-progress: true

env:
  # Base image used by both sides of the hash-equivalence check.
  BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST: nikolaik/python-nodejs:python3.12-nodejs22
  # PR head commit for pull requests, otherwise the commit that triggered the run.
  RELEVANT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}

jobs:
# Job (nested under `jobs:`): builds the OpenHands app image and extracts the build-directory hash from it.
  ghcr_build_app:
    name: Build App Image
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    outputs:
      # Consumed by verify_hash_equivalence_in_runtime_and_app.
      hash_from_app_image: ${{ steps.get_hash_in_app_image.outputs.hash_from_app_image }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Free Disk Space (Ubuntu)
        # NOTE(review): this tracks the mutable `main` branch; consider
        # pinning to a release tag or commit SHA.
        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
          tool-cache: true
          # all of these default to true, but feel free to set to
          # "false" if necessary for your workflow
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: false
          swap-storage: true
      - name: Set up QEMU
        # The action reference had been mangled into an obfuscated-email
        # placeholder; restored to the versioned action name.
        uses: docker/setup-qemu-action@v3.0.0
        with:
          image: tonistiigi/binfmt:latest
      - name: Login to GHCR
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3
      # Non-fork runs push the image to the registry.
      - name: Build and push app image
        if: "!github.event.pull_request.head.repo.fork"
        run: |
          ./containers/build.sh -i openhands -o ${{ github.repository_owner }} --push
      # Fork PRs only load the image into the local Docker daemon.
      - name: Build app image
        if: "github.event.pull_request.head.repo.fork"
        run: |
          ./containers/build.sh -i openhands -o ${{ github.repository_owner }} --load
      - name: Get hash in App Image
        id: get_hash_in_app_image
        env:
          # Passed through the environment instead of being pasted into the script text.
          REPO_OWNER: ${{ github.repository_owner }}
        run: |
          # Lowercase the repository owner
          REPO_OWNER=$(echo "$REPO_OWNER" | tr '[:upper:]' '[:lower:]')
          # Run the build script in the app image.
          # pipefail is enabled only around this pipeline so that a failing
          # `docker run` is not hidden by the exit status of `tee`.
          set -o pipefail
          docker run -e SANDBOX_USER_ID=0 -v /var/run/docker.sock:/var/run/docker.sock \
            ghcr.io/${REPO_OWNER}/openhands:${{ env.RELEVANT_SHA }} \
            /bin/bash -c "mkdir -p containers/runtime; python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild" 2>&1 | tee docker-outputs.txt
          set +o pipefail
          # Get the hash from the build script
          hash_from_app_image=$(grep "Hash for docker build directory" docker-outputs.txt | awk -F "): " '{print $2}' | uniq | head -n1)
          # An empty hash means the expected log line was not found; fail here
          # rather than exporting an empty job output.
          if [ -z "$hash_from_app_image" ]; then
            echo "::error::Could not extract the docker build directory hash from the app image output"
            exit 1
          fi
          echo "hash_from_app_image=$hash_from_app_image" >> "$GITHUB_OUTPUT"
          echo "Hash from app image: $hash_from_app_image"
# Job (nested under `jobs:`): builds the runtime Docker images, one per matrix entry.
  ghcr_build_runtime:
    name: Build Image
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    strategy:
      matrix:
        # `image` is the base image passed to runtime_build.py; `tag` is the
        # short suffix used in image tags and artifact names.
        base_image:
          - image: 'nikolaik/python-nodejs:python3.12-nodejs22'
            tag: nikolaik
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Free Disk Space (Ubuntu)
        # NOTE(review): this tracks the mutable `main` branch; consider
        # pinning to a release tag or commit SHA.
        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
          tool-cache: true
          # all of these default to true, but feel free to set to
          # "false" if necessary for your workflow
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: false
          swap-storage: true
      - name: Set up QEMU
        # The action reference had been mangled into an obfuscated-email
        # placeholder; restored to the versioned action name.
        uses: docker/setup-qemu-action@v3.0.0
        with:
          image: tonistiigi/binfmt:latest
      - name: Login to GHCR
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Cache Poetry dependencies
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/pypoetry
            ~/.virtualenvs
          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
          restore-keys: |
            ${{ runner.os }}-poetry-
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Install Python dependencies using Poetry
        run: make install-python-dependencies
      - name: Create source distribution and Dockerfile
        run: poetry run python3 openhands/runtime/utils/runtime_build.py --base_image ${{ matrix.base_image.image }} --build_folder containers/runtime --force_rebuild
      - name: Build and push runtime image ${{ matrix.base_image.image }}
        if: github.event.pull_request.head.repo.fork != true
        run: |
          ./containers/build.sh -i runtime -o ${{ github.repository_owner }} --push -t ${{ matrix.base_image.tag }}
      # Forked repos can't push to GHCR, so we need to upload the image as an artifact
      - name: Build runtime image ${{ matrix.base_image.image }} for fork
        if: github.event.pull_request.head.repo.fork
        uses: docker/build-push-action@v6
        with:
          # NOTE(review): the owner is hard-coded here, while the test jobs
          # derive it from the lowercased github.repository_owner; the two
          # only agree when this workflow runs in the all-hands-ai repository.
          tags: ghcr.io/all-hands-ai/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image.tag }}
          outputs: type=docker,dest=/tmp/runtime-${{ matrix.base_image.tag }}.tar
          context: containers/runtime
      - name: Upload runtime image for fork
        if: github.event.pull_request.head.repo.fork
        uses: actions/upload-artifact@v4
        with:
          name: runtime-${{ matrix.base_image.tag }}
          path: /tmp/runtime-${{ matrix.base_image.tag }}.tar
# Job (nested under `jobs:`): compares the hash reported by the app image with the hash computed from the checkout.
  verify_hash_equivalence_in_runtime_and_app:
    name: Verify Hash Equivalence in Runtime and Docker images
    runs-on: ubuntu-latest
    needs: [ghcr_build_runtime, ghcr_build_app]
    strategy:
      fail-fast: false
      matrix:
        base_image: ['nikolaik']
    steps:
      - uses: actions/checkout@v4
      - name: Cache Poetry dependencies
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/pypoetry
            ~/.virtualenvs
          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
          restore-keys: |
            ${{ runner.os }}-poetry-
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Install Python dependencies using Poetry
        run: make install-python-dependencies
      - name: Get hash in App Image
        env:
          # The job output is handed over via the environment so its value is
          # never interpreted as part of the script text.
          APP_IMAGE_HASH: ${{ needs.ghcr_build_app.outputs.hash_from_app_image }}
        run: |
          echo "Hash from app image: $APP_IMAGE_HASH"
          echo "hash_from_app_image=$APP_IMAGE_HASH" >> "$GITHUB_ENV"
      - name: Get hash using code (development mode)
        run: |
          mkdir -p containers/runtime
          # Output is captured for hash extraction; if the generator fails,
          # print the captured log before failing so the cause is visible.
          poetry run python3 openhands/runtime/utils/runtime_build.py --base_image ${{ env.BASE_IMAGE_FOR_HASH_EQUIVALENCE_TEST }} --build_folder containers/runtime --force_rebuild > output.txt 2>&1 || { cat output.txt; exit 1; }
          hash_from_code=$(grep "Hash for docker build directory" output.txt | awk -F "): " '{print $2}' | uniq | head -n1)
          echo "hash_from_code=$hash_from_code" >> "$GITHUB_ENV"
      - name: Compare hashes
        run: |
          # Both values were exported through GITHUB_ENV by the steps above.
          echo "Hash from App Image: $hash_from_app_image"
          echo "Hash from Code: $hash_from_code"
          # Two empty strings compare equal, so reject missing hashes first.
          if [ -z "$hash_from_app_image" ] || [ -z "$hash_from_code" ]; then
            echo "::error::At least one hash is empty; refusing to treat that as a match"
            exit 1
          fi
          if [ "$hash_from_app_image" = "$hash_from_code" ]; then
            echo "Hashes match!"
          else
            echo "Hashes do not match!"
            exit 1
          fi
# Job (nested under `jobs:`): runtime unit tests against the EventStream runtime image, with RUN_AS_OPENHANDS=false.
  test_runtime_root:
    name: RT Unit Tests (Root)
    runs-on: ubuntu-latest
    needs: [ghcr_build_runtime]
    strategy:
      fail-fast: false
      matrix:
        base_image:
          - 'nikolaik'
    steps:
      - uses: actions/checkout@v4
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/free-disk-space@main
        with:
          # Setting this to "true" frees about 6 GB, but it might remove
          # tools that are actually needed.
          tool-cache: true
          # The remaining options all default to true; switch any of them
          # to "false" if the workflow turns out to need it.
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: false
          swap-storage: true
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3
      # Fork PRs cannot push to GHCR, so the image arrives as an artifact.
      - name: Download runtime image for fork
        if: github.event.pull_request.head.repo.fork
        uses: actions/download-artifact@v4
        with:
          name: runtime-${{ matrix.base_image }}
          path: /tmp
      - name: Load runtime image for fork
        if: github.event.pull_request.head.repo.fork
        run: |
          docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
      - name: Cache Poetry dependencies
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/pypoetry
            ~/.virtualenvs
          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
          restore-keys: |
            ${{ runner.os }}-poetry-
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Install Python dependencies using Poetry
        run: make install-python-dependencies
      - name: Run runtime tests
        run: |
          # pytest-xdist lets the tests run across CPUs
          poetry run pip install pytest-xdist
          # pytest-rerunfailures lets flaky tests be retried
          poetry run pip install pytest-rerunfailures
          # Build the image reference and lowercase it
          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
          SKIP_CONTAINER_LOGS=true \
          TEST_RUNTIME=eventstream \
          SANDBOX_USER_ID=$(id -u) \
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=false \
          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# Job (nested under `jobs:`): runtime unit tests against the EventStream runtime image, with RUN_AS_OPENHANDS=true.
  test_runtime_oh:
    name: RT Unit Tests (openhands)
    runs-on: ubuntu-latest
    needs: [ghcr_build_runtime]
    strategy:
      # Matches test_runtime_root, so matrix entries do not cancel one another.
      fail-fast: false
      matrix:
        base_image: ['nikolaik']
    steps:
      - uses: actions/checkout@v4
      - name: Free Disk Space (Ubuntu)
        # NOTE(review): this tracks the mutable `main` branch; consider
        # pinning to a release tag or commit SHA.
        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
          tool-cache: true
          # all of these default to true, but feel free to set to
          # "false" if necessary for your workflow
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: false
          swap-storage: true
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
        if: github.event.pull_request.head.repo.fork
        uses: actions/download-artifact@v4
        with:
          name: runtime-${{ matrix.base_image }}
          path: /tmp
      - name: Load runtime image for fork
        if: github.event.pull_request.head.repo.fork
        run: |
          docker load --input /tmp/runtime-${{ matrix.base_image }}.tar
      - name: Cache Poetry dependencies
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/pypoetry
            ~/.virtualenvs
          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
          restore-keys: |
            ${{ runner.os }}-poetry-
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Install Python dependencies using Poetry
        run: make install-python-dependencies
      - name: Run runtime tests
        run: |
          # We install pytest-xdist in order to run tests across CPUs
          poetry run pip install pytest-xdist
          # Install to be able to retry on failures for flaky tests
          poetry run pip install pytest-rerunfailures
          # Image references must be lowercase
          image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }}
          image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]')
          SKIP_CONTAINER_LOGS=true \
          TEST_RUNTIME=eventstream \
          SANDBOX_USER_ID=$(id -u) \
          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=true \
          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# Jobs (nested under `jobs:`): the pair of gate jobs behind the required "All Runtime Tests Passed" check.
  # The two following jobs (named identically) are to check whether all the runtime tests have passed as the
  # "All Runtime Tests Passed" is a required job for PRs to merge
  # Due to this bug: https://github.com/actions/runner/issues/2566, we want to create a job that runs when the
  # prerequisites have been cancelled or failed so merging is disallowed, otherwise Github considers "skipped" as "success"
  #
  # `needs.*.result` only covers the direct dependencies listed in `needs`.
  # When a job they depend on fails, those dependencies report "skipped"
  # rather than "failure", so "skipped" is treated as a failure in both
  # conditions below. The two conditions are exact complements, so exactly
  # one of the jobs runs.
  runtime_tests_check_success:
    name: All Runtime Tests Passed
    if: ${{ !cancelled() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && !contains(needs.*.result, 'skipped') }}
    runs-on: ubuntu-latest
    needs: [test_runtime_root, test_runtime_oh, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: All tests passed
        run: echo "All runtime tests have passed successfully!"
  runtime_tests_check_fail:
    name: All Runtime Tests Passed
    if: ${{ cancelled() || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') || contains(needs.*.result, 'skipped') }}
    runs-on: ubuntu-latest
    needs: [test_runtime_root, test_runtime_oh, verify_hash_equivalence_in_runtime_and_app]
    steps:
      - name: Some tests failed
        run: |
          echo "Some runtime tests failed or were cancelled"
          exit 1