# Run SWE-Bench Evaluation #12
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow that runs the evaluation suite and reports the results.
name: Run Evaluation

on:
  # Fires whenever a label is added to a pull request; the job's own `if:`
  # condition then restricts execution to the 'eval-this' label.
  pull_request:
    types: [labeled]
  schedule:
    - cron: "0 1 * * *"  # Run daily at 1 AM UTC
  # Manual trigger; the caller is asked to state why the run was started.
  workflow_dispatch:
    inputs:
      reason:
        description: "Reason for manual trigger"
        required: true
        default: ""
# Workflow-level environment, visible to every step of every job.
env:
  N_PROCESSES: 32  # Global configuration for number of parallel processes for evaluation
jobs:
  # Runs the integration-test and SWE-Bench evaluations, archives the outputs
  # (workflow artifact + Google Cloud Storage), and reports the results as a
  # GitHub comment and a Slack message.
  run-evaluation:
    # Always run for non-PR triggers (schedule, manual dispatch); for pull
    # requests, run only when the label that was just added is 'eval-this'.
    if: github.event.label.name == 'eval-this' || github.event_name != 'pull_request'
    runs-on: ubuntu-latest
    permissions:
      contents: "read"
      id-token: "write"
      pull-requests: "write"
      issues: "write"
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Poetry is installed before setup-python because `cache: "poetry"`
      # needs the poetry executable to be available.
      - name: Install poetry via pipx
        run: pipx install poetry

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"

      # Early acknowledgement on the PR so the author knows the run started.
      - name: Comment on PR if 'eval-this' label is present
        if: github.event_name == 'pull_request' && github.event.label.name == 'eval-this'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the evaluation on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
        run: poetry install

      # Writes the [llm.eval] section that the run_infer.sh scripts are
      # pointed at via their `llm.eval` argument.
      - name: Configure config.toml for evaluation
        env:
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }}
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"deepseek/deepseek-chat\"" >> config.toml
          echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Run integration test evaluation
        env:
          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
          RUNTIME: remote
          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' "$N_PROCESSES"

          # get evaluation report
          REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE"

          # Export the report as a multi-line env var. A random delimiter is
          # used so that a line inside the report can never terminate the
          # value early and smuggle extra variables into GITHUB_ENV.
          DELIMITER="EOF_$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"
          {
            echo "INTEGRATION_TEST_REPORT<<$DELIMITER"
            if [[ -n "$REPORT_FILE" ]]; then
              cat "$REPORT_FILE"
            else
              # No report was produced; say so instead of exporting nothing.
              echo "Integration test report not found."
            fi
            # Guarantee the closing delimiter starts on its own line.
            echo
            echo "$DELIMITER"
          } >> "$GITHUB_ENV"

      - name: Run SWE-Bench evaluation
        env:
          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
          RUNTIME: remote
          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
        run: |
          poetry run ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 "$N_PROCESSES" "princeton-nlp/SWE-bench_Lite" test

          # NOTE(review): the pattern below expects "maxiter_50" while the
          # command above passes "300 30" - confirm which positional argument
          # is the max-iteration count and that the folder name matches.
          OUTPUT_FOLDER=$(find evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent -name "deepseek-chat_maxiter_50_N_*-no-hint-run_1" -type d | head -n 1)
          echo "OUTPUT_FOLDER for SWE-bench evaluation: $OUTPUT_FOLDER"
          if [[ -z "$OUTPUT_FOLDER" ]]; then
            # Without this guard the scripts below would be handed "/output.jsonl".
            echo "::error::No SWE-bench output folder found; cannot run evaluation."
            exit 1
          fi

          poetry run ./evaluation/swe_bench/scripts/eval_infer_remote.sh "$OUTPUT_FOLDER/output.jsonl" "$N_PROCESSES" "princeton-nlp/SWE-bench_Lite" test
          poetry run ./evaluation/swe_bench/scripts/eval/summarize_outputs.py "$OUTPUT_FOLDER/output.jsonl" > summarize_outputs.log 2>&1

          # Same multi-line export as the integration report: random
          # delimiter, plus a newline so a log without a trailing newline
          # cannot swallow the closing delimiter.
          DELIMITER="EOF_$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"
          {
            echo "SWEBENCH_REPORT<<$DELIMITER"
            cat summarize_outputs.log
            echo
            echo "$DELIMITER"
          } >> "$GITHUB_ENV"

      - name: Create tar.gz of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          tar -czvf "evaluation_outputs_${TIMESTAMP}.tar.gz" evaluation/evaluation_outputs/outputs

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
        id: upload_results_artifact
        with:
          name: evaluation-outputs
          path: evaluation_outputs_*.tar.gz

      # Make the artifact link available to the results comment below.
      - name: Get artifact URL
        run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> "$GITHUB_ENV"

      - name: Authenticate to Google Cloud
        uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }}

      # TIMESTAMP and TRIGGER_REASON together form the GCS destination folder.
      - name: Set timestamp and trigger reason
        env:
          # Event data is passed through the environment rather than being
          # interpolated into the script text, so a crafted manual "reason"
          # cannot inject shell commands.
          EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          MANUAL_REASON: ${{ github.event.inputs.reason }}
        run: |
          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> "$GITHUB_ENV"
          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            echo "TRIGGER_REASON=pr-$PR_NUMBER" >> "$GITHUB_ENV"
          elif [[ "$EVENT_NAME" == "schedule" ]]; then
            echo "TRIGGER_REASON=schedule" >> "$GITHUB_ENV"
          else
            echo "TRIGGER_REASON=manual-$MANUAL_REASON" >> "$GITHUB_ENV"
          fi

      - name: Upload evaluation results to Google Cloud Storage
        uses: 'google-github-actions/upload-cloud-storage@v2'
        with:
          path: 'evaluation/evaluation_outputs/outputs'
          destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}'

      # Results go to the triggering PR; for scheduled and manual runs they
      # are posted to the fixed thread #4504 instead.
      - name: Comment with evaluation results and artifact link
        id: create_comment
        uses: KeisukeYamashita/create-comment@v1
        with:
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}
          unique: false
          comment: |
            Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}
            Commit: ${{ github.sha }}
            **SWE-Bench Evaluation Report**
            ${{ env.SWEBENCH_REPORT }}
            ---
            **Integration Tests Evaluation Report**
            ${{ env.INTEGRATION_TEST_REPORT }}
            ---
            You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).

      # The Slack message links back to the comment created by the step above.
      - name: Post to a Slack channel
        id: slack
        # NOTE(review): the action reference was garbled in the extracted
        # source ("slackapi/[email protected]"). v1.27.0 is assumed because the
        # `channel-id` / `slack-message` inputs belong to the v1 API - confirm
        # the exact pinned version.
        uses: slackapi/slack-github-action@v1.27.0
        with:
          channel-id: 'C07SVQSCR6F'
          slack-message: "*Evaluation Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: [here](https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }})"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }}