Implement GPU Support #1413
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
# These are end-to-end tests running on ephemeral DigitalOcean "Droplet" virtual machines | |
# with the different operating systems that are supported. | |
# | |
# The main focus of these tests is to ensure that the packaging works on all supported platforms | |
# and to ensure the compatibility of dependencies (system and vendored) across these platforms. | |
name: "Testing on DigitalOcean Droplets" | |
# Run automatically on main branches, Pull Request updates and allow manual execution using `workflow_dispatch`. | |
on: | |
push: | |
branches: | |
- main | |
pull_request: | |
types: | |
- "opened" | |
- "reopened" | |
- "synchronize" | |
- "ready_for_review" | |
workflow_dispatch: | |
jobs: | |
run_on_droplet: | |
name: "Test Droplet with ${{ matrix.os_config.os_name }}-${{ matrix.check_vm.alias\ | |
\ }}" | |
runs-on: ubuntu-latest | |
concurrency: "${{ matrix.os_config.concurrency_group }}-${{ matrix.check_vm.alias\ | |
\ }}" | |
timeout-minutes: 10 | |
strategy: | |
fail-fast: false | |
matrix: | |
# Check compatibility with all supported OSes. | |
os_config: | |
- os_name: "Debian 12" | |
os_image: "debian-12-x64" | |
alias: "debian-12" | |
package_build_command: "all-podman-debian-12" | |
package_name: "aleph-vm.debian-12.deb" | |
concurrency_group: "droplet-aleph-vm-debian-12" | |
- os_name: "Ubuntu 22.04" | |
os_image: "ubuntu-22-04-x64" | |
alias: "ubuntu-22-04" | |
package_build_command: "all-podman-ubuntu-2204" | |
package_name: "aleph-vm.ubuntu-22.04.deb" | |
concurrency_group: "droplet-aleph-vm-ubuntu-22-04" | |
- os_name: "Ubuntu 24.04" | |
os_image: "ubuntu-24-04-x64" | |
alias: "ubuntu-24-04" | |
package_build_command: "all-podman-ubuntu-2404" | |
package_name: "aleph-vm.ubuntu-24.04.deb" | |
concurrency_group: "droplet-aleph-vm-ubuntu-24-04" | |
# Check compatibility with all supported runtimes. | |
check_vm: | |
- alias: "runtime-6770" # Old runtime, using Debian 11 | |
item_hash: "67705389842a0a1b95eaa408b009741027964edc805997475e95c505d642edd8" | |
query_params: "?retro-compatibility=true" | |
- alias: "runtime-3fc0" # Newer runtime, using Debian 12 but now old SDK | |
item_hash: "3fc0aa9569da840c43e7bd2033c3c580abb46b007527d6d20f2d4e98e867f7af" | |
query_params: "?retro-compatibility=true" | |
- alias: "runtime-63fa" # Latest runtime, using Debian 12 and SDK 0.9.0 | |
item_hash: "63faf8b5db1cf8d965e6a464a0cb8062af8e7df131729e48738342d956f29ace" | |
query_params: "" | |
steps: | |
- name: Checkout repository | |
uses: actions/checkout@v4 | |
with: | |
fetch-depth: 0 | |
submodules: true | |
- name: Install doctl | |
uses: digitalocean/action-doctl@v2 | |
with: | |
token: ${{ secrets.DIGITALOCEAN_ACCESS_TOKEN }} | |
- name: Setup SSH private key | |
run: | | |
mkdir ~/.ssh | |
echo $DIGITALOCEAN_SSH_PRIVATE_KEY | base64 --decode > ~/.ssh/id_ed25519 | |
chmod 0700 ~/.ssh | |
chmod 0600 ~/.ssh/id_ed25519 | |
env: | |
DIGITALOCEAN_SSH_PRIVATE_KEY: ${{ secrets.DIGITALOCEAN_SSH_PRIVATE_KEY }} | |
- name: Create the Droplet | |
run: | | |
doctl compute droplet create \ | |
--image ${{ matrix.os_config.os_image }} \ | |
--size c-4 \ | |
--region ams3 \ | |
--vpc-uuid 5976b7bd-4417-49e8-8522-672aaa920c30 \ | |
--enable-ipv6 \ | |
--ssh-keys ab:2b:25:16:46:6f:25:d0:80:63:e5:be:67:04:cb:64 \ | |
aleph-vm-ci-${{ matrix.os_config.alias }}-${{ matrix.check_vm.alias }} | |
- name: Build Package | |
run: | | |
cd packaging && make ${{ matrix.os_config.package_build_command }} && cd .. | |
ls packaging/target | |
- name: Get droplet ip and export it in env | |
run: | | |
echo "DROPLET_IPV4=$(doctl compute droplet get aleph-vm-ci-${{ matrix.os_config.alias }}-${{ matrix.check_vm.alias }} --output json | ./.github/scripts/extract_droplet_ipv4.py)" >> "$GITHUB_ENV" | |
- name: Wait for the system to setup and boot | |
run: | | |
until ssh-keyscan -H ${DROPLET_IPV4}; do sleep 1; done | |
timeout-minutes: 3 | |
- name: Install Aleph-VM on the Droplet | |
run: | | |
ssh-keyscan -H ${DROPLET_IPV4} > ~/.ssh/known_hosts | |
# Configuration | |
echo ALEPH_VM_SUPERVISOR_HOST=0.0.0.0 >> supervisor.env | |
echo ALEPH_VM_ALLOCATION_TOKEN_HASH=9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08 >> supervisor.env | |
echo ALEPH_VM_CHECK_FASTAPI_VM_ID=${{ matrix.check_vm.item_hash }} >> supervisor.env | |
echo ALEPH_VM_SENTRY_DSN=${{ secrets.SENTRY_DSN }} >> supervisor.env | |
ssh root@${DROPLET_IPV4} mkdir -p /etc/aleph-vm/ | |
scp supervisor.env root@${DROPLET_IPV4}:/etc/aleph-vm/supervisor.env | |
# Wait a few seconds for DigitalOcean to setup the Droplet using apt, which conflicts with our comands: | |
sleep 5 | |
# Wait for /var/lib/apt/lists/lock to be unlocked on the remote host via SSH. | |
while ssh root@${DROPLET_IPV4} lsof /var/lib/apt/lists/lock; do sleep 1; done | |
ssh root@${DROPLET_IPV4} DEBIAN_FRONTEND=noninteractive "apt-get -o DPkg::Lock::Timeout=60 update" | |
ssh root@${DROPLET_IPV4} DEBIAN_FRONTEND=noninteractive "apt-get -o DPkg::Lock::Timeout=60 upgrade -y" | |
ssh root@${DROPLET_IPV4} DEBIAN_FRONTEND=noninteractive "apt-get -o DPkg::Lock::Timeout=60 install -y docker.io apparmor-profiles" | |
ssh root@${DROPLET_IPV4} "docker pull ghcr.io/aleph-im/vm-connector:alpha" | |
ssh root@${DROPLET_IPV4} "docker run -d -p 127.0.0.1:4021:4021/tcp --restart=always --name vm-connector ghcr.io/aleph-im/vm-connector:alpha" | |
scp packaging/target/${{ matrix.os_config.package_name }} root@${DROPLET_IPV4}:/opt | |
# "--force-confold" keeps existing config files during package install/upgrade, avoiding prompts. | |
ssh root@${DROPLET_IPV4} DEBIAN_FRONTEND=noninteractive "apt-get -o DPkg::Lock::Timeout=60 -o Dpkg::Options::="--force-confold" install -y /opt/${{ matrix.os_config.package_name }}" | |
# Allow some time for IPFS Kubo to start | |
sleep 5 | |
- name: Test Aleph-VM on the Droplet | |
id: test-aleph-vm | |
if: always() | |
continue-on-error: true | |
run: | | |
curl --retry 5 --max-time 10 --fail "http://${DROPLET_IPV4}:4020/about/usage/system" | |
curl --retry 5 --max-time 10 --fail "http://${DROPLET_IPV4}:4020/status/check/fastapi${{ matrix.check_vm.query_params }}" | |
- name: Test Aleph-VM on the Droplet again restarting the server first | |
if: steps.test-aleph-vm.outcome == 'failure' | |
run: | | |
# If the first execution fails, restart supervisor and try again | |
ssh root@${DROPLET_IPV4} "systemctl restart aleph-vm-supervisor" | |
sleep 5 | |
curl --retry 5 --max-time 10 --fail "http://${DROPLET_IPV4}:4020/status/check/fastapi${{ matrix.check_vm.query_params }}" | |
- name: Schedule an instance on the Droplet by faking a call from the scheduler | |
run: | | |
curl --retry 5 --max-time 10 --fail -X POST -H "Content-Type: application/json" \ | |
-H "X-Auth-Signature: test" \ | |
-d '{"persistent_vms": [], "instances": ["${{ matrix.check_vm.item_hash }}"]}' \ | |
"http://${DROPLET_IPV4}:4020/control/allocations" | |
- name: Fetch system usage endpoint | |
run: | | |
export DROPLET_IPV4="$(doctl compute droplet get aleph-vm-ci-${{ matrix.os_config.alias }}-${{ matrix.check_vm.alias }} --output json | ./.github/scripts/extract_droplet_ipv4.py)" | |
curl -X GET -H "Content-Type: application/json" \ | |
"http://${DROPLET_IPV4}:4020/about/usage/system" | |
- name: Run the sevctl command to ensure it's properly packaged and working | |
run: | | |
export DROPLET_IPV4="$(doctl compute droplet get aleph-vm-ci-${{ matrix.os_config.alias }}-${{ matrix.check_vm.alias }} --output json | ./.github/scripts/extract_droplet_ipv4.py)" | |
ssh root@${DROPLET_IPV4} "/opt/sevctl --version" | |
- name: Export aleph logs | |
if: always() | |
run: | | |
ssh root@${DROPLET_IPV4} "journalctl -u aleph-vm-supervisor" | |
- name: Cleanup | |
if: always() | |
run: |- | |
DROPLET_IDS=$(doctl compute droplet list --format "ID,Name" --no-header | grep "aleph-vm-ci-${{ matrix.os_config.alias }}-${{ matrix.check_vm.alias }}" | awk '{print $1}') | |
for DROPLET_ID in $DROPLET_IDS; do | |
echo "Deleting droplet with ID: $DROPLET_ID" | |
doctl compute droplet delete --force $DROPLET_ID | |
done |