From aaaec2aab07672188aa36708d7a44cbef26fc4cc Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Tue, 3 Dec 2024 22:27:21 +0100 Subject: [PATCH 01/11] Problem: If a user wants to assign a GPU to a QEmu VM he cannot do it. Solution: Implement GPU assignation feature that will be pass-though to QEmu VMs with native performance. --- pyproject.toml | 2 +- src/aleph/vm/controllers/configuration.py | 6 ++++ .../vm/controllers/firecracker/executable.py | 5 +++ src/aleph/vm/controllers/qemu/instance.py | 10 +++++- .../controllers/qemu_confidential/instance.py | 5 +++ src/aleph/vm/hypervisors/qemu/qemuvm.py | 34 +++++++++++++++---- .../hypervisors/qemu_confidential/qemuvm.py | 12 +++---- src/aleph/vm/models.py | 31 ++++++++++++++--- src/aleph/vm/pool.py | 19 +++++++++-- 9 files changed, 103 insertions(+), 21 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index faebfb9a..31b767d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "aioredis==1.3.1", "aiosqlite==0.19", "alembic==1.13.1", - "aleph-message==0.5", + "aleph-message @ git+https://github.com/aleph-im/aleph-message@andres-feature-add_gpu_requirement", "aleph-superfluid~=0.2.1", "dbus-python==1.3.2", "eth-account~=0.10", diff --git a/src/aleph/vm/controllers/configuration.py b/src/aleph/vm/controllers/configuration.py index da10d839..fb4b4ff1 100644 --- a/src/aleph/vm/controllers/configuration.py +++ b/src/aleph/vm/controllers/configuration.py @@ -23,6 +23,10 @@ class QemuVMHostVolume(BaseModel): read_only: bool +class QemuGPU(BaseModel): + pci_host: str + + class QemuVMConfiguration(BaseModel): qemu_bin_path: str cloud_init_drive_path: str | None @@ -33,6 +37,7 @@ class QemuVMConfiguration(BaseModel): mem_size_mb: int interface_name: str | None host_volumes: list[QemuVMHostVolume] + gpus: list[QemuGPU] class QemuConfidentialVMConfiguration(BaseModel): @@ -45,6 +50,7 @@ class QemuConfidentialVMConfiguration(BaseModel): mem_size_mb: int interface_name: str | None 
host_volumes: list[QemuVMHostVolume] + gpus: list[QemuGPU] ovmf_path: Path sev_session_file: Path sev_dh_cert_file: Path diff --git a/src/aleph/vm/controllers/firecracker/executable.py b/src/aleph/vm/controllers/firecracker/executable.py index cbbad03c..840dcb9b 100644 --- a/src/aleph/vm/controllers/firecracker/executable.py +++ b/src/aleph/vm/controllers/firecracker/executable.py @@ -72,6 +72,11 @@ class HostVolume: read_only: bool +@dataclass +class HostGPU: + pci_host: str + + @dataclass class BaseConfiguration: vm_hash: ItemHash diff --git a/src/aleph/vm/controllers/qemu/instance.py b/src/aleph/vm/controllers/qemu/instance.py index dd840e22..51392c5f 100644 --- a/src/aleph/vm/controllers/qemu/instance.py +++ b/src/aleph/vm/controllers/qemu/instance.py @@ -5,7 +5,7 @@ from asyncio import Task from asyncio.subprocess import Process from pathlib import Path -from typing import Generic, TypeVar +from typing import Generic, TypeVar, List import psutil from aleph_message.models import ItemHash @@ -19,11 +19,13 @@ HypervisorType, QemuVMConfiguration, QemuVMHostVolume, + QemuGPU, save_controller_configuration, ) from aleph.vm.controllers.firecracker.executable import ( AlephFirecrackerResources, VmSetupError, + HostGPU, ) from aleph.vm.controllers.interface import AlephVmControllerInterface from aleph.vm.controllers.qemu.cloudinit import CloudInitMixin @@ -36,6 +38,8 @@ class AlephQemuResources(AlephFirecrackerResources): + gpus: list[HostGPU] + async def download_runtime(self) -> None: volume = self.message_content.rootfs parent_image_path = await get_rootfs_base_path(volume.parent.ref) @@ -200,6 +204,10 @@ async def configure(self): ) for volume in self.resources.volumes ], + gpus=[ + QemuGPU(pci_host=gpu.pci_host) + for gpu in self.resources.gpus + ] ) configuration = Configuration( diff --git a/src/aleph/vm/controllers/qemu_confidential/instance.py b/src/aleph/vm/controllers/qemu_confidential/instance.py index f432cff6..2908e87e 100644 --- 
a/src/aleph/vm/controllers/qemu_confidential/instance.py +++ b/src/aleph/vm/controllers/qemu_confidential/instance.py @@ -13,6 +13,7 @@ Configuration, HypervisorType, QemuConfidentialVMConfiguration, + QemuGPU, QemuVMHostVolume, save_controller_configuration, ) @@ -126,6 +127,10 @@ async def configure(self): ) for volume in self.resources.volumes ], + gpus=[ + QemuGPU(pci_host=gpu.pci_host) + for gpu in self.resources.gpus + ] ) configuration = Configuration( diff --git a/src/aleph/vm/hypervisors/qemu/qemuvm.py b/src/aleph/vm/hypervisors/qemu/qemuvm.py index 5949fbdc..d6c9274d 100644 --- a/src/aleph/vm/hypervisors/qemu/qemuvm.py +++ b/src/aleph/vm/hypervisors/qemu/qemuvm.py @@ -7,7 +7,7 @@ import qmp from systemd import journal -from aleph.vm.controllers.configuration import QemuVMConfiguration +from aleph.vm.controllers.configuration import QemuVMConfiguration, QemuGPU from aleph.vm.controllers.qemu.instance import logger @@ -28,6 +28,7 @@ class QemuVM: interface_name: str qemu_process: Process | None = None host_volumes: list[HostVolume] + gpus: list[QemuGPU] journal_stdout: TextIO | None journal_stderr: TextIO | None @@ -55,6 +56,7 @@ def __init__(self, vm_hash, config: QemuVMConfiguration): ) for volume in config.host_volumes ] + self.gpus = config.gpus @property def _journal_stdout_name(self) -> str: @@ -102,21 +104,23 @@ async def start( # Tell to put the output to std fd, so we can include them in the log "-serial", "stdio", + # Use host-phys-bits-limit argument for GPU support. 
TODO: Investigate how to get the correct bits size + # + "-cpu", + "host,host-phys-bits-limit=0x28" # Uncomment for debug # "-serial", "telnet:localhost:4321,server,nowait", # "-snapshot", # Do not save anything to disk ] - for volume in self.host_volumes: - args += [ - "-drive", - f"file={volume.path_on_host},format=raw,readonly={'on' if volume.read_only else 'off'},media=disk,if=virtio", - ] if self.interface_name: # script=no, downscript=no tell qemu not to try to set up the network itself args += ["-net", "nic,model=virtio", "-net", f"tap,ifname={self.interface_name},script=no,downscript=no"] if self.cloud_init_drive_path: args += ["-cdrom", f"{self.cloud_init_drive_path}"] + + args += self._get_host_volumes_args() + args += self._get_gpu_args() print(*args) self.qemu_process = proc = await asyncio.create_subprocess_exec( @@ -131,6 +135,24 @@ async def start( ) return proc + def _get_host_volumes_args(self): + args = [] + for volume in self.host_volumes: + args += [ + "-drive", + f"file={volume.path_on_host},format=raw,readonly={'on' if volume.read_only else 'off'},media=disk,if=virtio", + ] + return args + + def _get_gpu_args(self): + args = [] + for gpu in self.gpus: + args += [ + "-device", + f"vfio-pci,host={gpu.pci_host},multifunction=on,x-vga=on", + ] + return args + def _get_qmpclient(self) -> qmp.QEMUMonitorProtocol | None: if not (self.qmp_socket_path and self.qmp_socket_path.exists()): return None diff --git a/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py b/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py index 5e32e899..868c12c1 100644 --- a/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py +++ b/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py @@ -105,24 +105,24 @@ async def start( # raise an error and prevent boot. Passing the argument --cpu host instruct the VM to use the same CPU # model than the host thus the VM's kernel knows which method is used to get random numbers (Intel and # AMD have different methods) and properly boot. 
+ # Use host-phys-bits-limit argument for GPU support. TODO: Investigate how to get the correct bits size "-cpu", - "host", + "host,host-phys-bits-limit=0x28" # Uncomment following for debug # "-serial", "telnet:localhost:4321,server,nowait", # "-snapshot", # Do not save anything to disk ] - for volume in self.host_volumes: - args += [ - "-drive", - f"file={volume.path_on_host},format=raw,readonly={'on' if volume.read_only else 'off'},media=disk,if=virtio", - ] if self.interface_name: # script=no, downscript=no tell qemu not to try to set up the network itself args += ["-net", "nic,model=virtio", "-net", f"tap,ifname={self.interface_name},script=no,downscript=no"] if self.cloud_init_drive_path: args += ["-cdrom", f"{self.cloud_init_drive_path}"] + + args += self._get_host_volumes_args() + args += self._get_gpu_args() print(*args) + self.qemu_process = proc = await asyncio.create_subprocess_exec( *args, stdin=asyncio.subprocess.DEVNULL, diff --git a/src/aleph/vm/models.py b/src/aleph/vm/models.py index 9aee9320..97e846d6 100644 --- a/src/aleph/vm/models.py +++ b/src/aleph/vm/models.py @@ -5,6 +5,7 @@ from collections.abc import Callable, Coroutine from dataclasses import dataclass from datetime import datetime, timezone +from typing import List from aleph_message.models import ( ExecutableContent, @@ -15,11 +16,10 @@ from aleph_message.models.execution.environment import HypervisorType from aleph.vm.conf import settings -from aleph.vm.controllers.firecracker.executable import AlephFirecrackerExecutable +from aleph.vm.controllers.firecracker.executable import AlephFirecrackerExecutable, HostGPU from aleph.vm.controllers.firecracker.instance import AlephInstanceResources from aleph.vm.controllers.firecracker.program import ( AlephFirecrackerProgram, - AlephFirecrackerResources, AlephProgramResources, ) from aleph.vm.controllers.firecracker.snapshot_manager import SnapshotManager @@ -38,6 +38,7 @@ ) from aleph.vm.orchestrator.pubsub import PubSub from 
aleph.vm.orchestrator.vm import AlephFirecrackerInstance +from aleph.vm.resources import GpuDevice from aleph.vm.systemd import SystemDManager from aleph.vm.utils import create_task_log_exceptions, dumps_for_json @@ -69,8 +70,9 @@ class VmExecution: vm_hash: ItemHash original: ExecutableContent message: ExecutableContent - resources: AlephFirecrackerResources | None = None - vm: AlephFirecrackerExecutable | AlephQemuInstance | None = None + resources: AlephProgramResources | AlephInstanceResources | AlephQemuResources | AlephQemuConfidentialInstance | None = None + vm: AlephFirecrackerExecutable | AlephQemuInstance | AlephQemuConfidentialInstance | None = None + gpus: List[HostGPU] times: VmExecutionTimes @@ -202,6 +204,7 @@ async def prepare(self) -> None: resources = AlephQemuConfidentialResources(self.message, namespace=self.vm_hash) else: resources = AlephQemuResources(self.message, namespace=self.vm_hash) + resources.gpus = self.gpus else: msg = f"Unknown hypervisor type {self.hypervisor}" raise ValueError(msg) @@ -216,6 +219,26 @@ async def prepare(self) -> None: self.times.prepared_at = datetime.now(tz=timezone.utc) self.resources = resources + def prepare_gpus(self, available_gpus: List[GpuDevice]) -> None: + gpus = [] + for gpu in self.message.requirements.gpu: + for available_gpu in available_gpus: + if available_gpu.device_id == gpu.device_id: + gpus.append( + HostGPU( + pci_host=available_gpu.pci_host + ) + ) + break + self.gpus = gpus + + def uses_gpu(self, pci_host: str) -> bool: + for gpu in self.gpus: + if gpu.pci_host == pci_host: + return True + + return False + def create( self, vm_id: int, tap_interface: TapInterface | None = None, prepare: bool = True ) -> AlephVmControllerInterface: diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index dfc4242e..8b7cbef2 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -112,7 +112,11 @@ async def create_a_vm( self.executions[vm_hash] = execution try: + # First assign Host GPUs from the 
available + execution.prepare_gpus(self.get_available_gpus()) + # Prepare VM general Resources and also the GPUs await execution.prepare() + vm_id = self.get_unique_vm_id() if self.network: @@ -236,6 +240,10 @@ async def load_persistent_executions(self): if execution.is_running: # TODO: Improve the way that we re-create running execution + # Load existing GPUs assigned to VMs + for saved_gpu in saved_execution.gpus: + execution.gpus.append(HostGPU(pci_host=saved_gpu.pci_host)) + # Load and instantiate the rest of resources and already assigned GPUs await execution.prepare() if self.network: vm_type = VmType.from_message_content(execution.message) @@ -288,9 +296,14 @@ def get_instance_executions(self) -> Iterable[VmExecution]: ) return executions or [] - def get_available_gpus(self) -> Iterable[GpuDevice]: - # TODO: Filter already used GPUs on current executions and remove it from available - available_gpus = self.gpus + def get_available_gpus(self) -> List[GpuDevice]: + available_gpus = ( + gpu + for gpu in self.gpus + for _, execution in self.executions.items() + if (isinstance(execution.resources, AlephQemuResources) or isinstance(execution.resources, AlephQemuConfidentialResources)) and not execution.uses_device_gpu(gpu.pci_host) + ) + return available_gpus or [] def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[str, list[VmExecution]]]: From 24516a2d771f199c71bab6a293b82d38ceee637d Mon Sep 17 00:00:00 2001 From: "Andres D. 
Molins" Date: Tue, 3 Dec 2024 22:35:45 +0100 Subject: [PATCH 02/11] Fix: Solved code quality issues --- .../vm/controllers/firecracker/executable.py | 5 ----- src/aleph/vm/controllers/qemu/instance.py | 13 +++++-------- .../vm/controllers/qemu_confidential/instance.py | 5 +---- src/aleph/vm/hypervisors/qemu/qemuvm.py | 4 ++-- .../vm/hypervisors/qemu_confidential/qemuvm.py | 2 +- src/aleph/vm/models.py | 14 ++++++-------- src/aleph/vm/pool.py | 16 +++++++--------- src/aleph/vm/resources.py | 6 ++++++ 8 files changed, 28 insertions(+), 37 deletions(-) diff --git a/src/aleph/vm/controllers/firecracker/executable.py b/src/aleph/vm/controllers/firecracker/executable.py index 840dcb9b..cbbad03c 100644 --- a/src/aleph/vm/controllers/firecracker/executable.py +++ b/src/aleph/vm/controllers/firecracker/executable.py @@ -72,11 +72,6 @@ class HostVolume: read_only: bool -@dataclass -class HostGPU: - pci_host: str - - @dataclass class BaseConfiguration: vm_hash: ItemHash diff --git a/src/aleph/vm/controllers/qemu/instance.py b/src/aleph/vm/controllers/qemu/instance.py index 51392c5f..259f8474 100644 --- a/src/aleph/vm/controllers/qemu/instance.py +++ b/src/aleph/vm/controllers/qemu/instance.py @@ -5,7 +5,7 @@ from asyncio import Task from asyncio.subprocess import Process from pathlib import Path -from typing import Generic, TypeVar, List +from typing import Generic, List, TypeVar import psutil from aleph_message.models import ItemHash @@ -17,20 +17,20 @@ from aleph.vm.controllers.configuration import ( Configuration, HypervisorType, + QemuGPU, QemuVMConfiguration, QemuVMHostVolume, - QemuGPU, save_controller_configuration, ) from aleph.vm.controllers.firecracker.executable import ( AlephFirecrackerResources, VmSetupError, - HostGPU, ) from aleph.vm.controllers.interface import AlephVmControllerInterface from aleph.vm.controllers.qemu.cloudinit import CloudInitMixin from aleph.vm.network.firewall import teardown_nftables_for_vm from aleph.vm.network.interfaces import 
TapInterface +from aleph.vm.resources import HostGPU from aleph.vm.storage import get_rootfs_base_path from aleph.vm.utils import HostNotFoundError, ping, run_in_subprocess @@ -38,7 +38,7 @@ class AlephQemuResources(AlephFirecrackerResources): - gpus: list[HostGPU] + gpus: List[HostGPU] = [] async def download_runtime(self) -> None: volume = self.message_content.rootfs @@ -204,10 +204,7 @@ async def configure(self): ) for volume in self.resources.volumes ], - gpus=[ - QemuGPU(pci_host=gpu.pci_host) - for gpu in self.resources.gpus - ] + gpus=[QemuGPU(pci_host=gpu.pci_host) for gpu in self.resources.gpus], ) configuration = Configuration( diff --git a/src/aleph/vm/controllers/qemu_confidential/instance.py b/src/aleph/vm/controllers/qemu_confidential/instance.py index 2908e87e..37986b10 100644 --- a/src/aleph/vm/controllers/qemu_confidential/instance.py +++ b/src/aleph/vm/controllers/qemu_confidential/instance.py @@ -127,10 +127,7 @@ async def configure(self): ) for volume in self.resources.volumes ], - gpus=[ - QemuGPU(pci_host=gpu.pci_host) - for gpu in self.resources.gpus - ] + gpus=[QemuGPU(pci_host=gpu.pci_host) for gpu in self.resources.gpus], ) configuration = Configuration( diff --git a/src/aleph/vm/hypervisors/qemu/qemuvm.py b/src/aleph/vm/hypervisors/qemu/qemuvm.py index d6c9274d..9b95f187 100644 --- a/src/aleph/vm/hypervisors/qemu/qemuvm.py +++ b/src/aleph/vm/hypervisors/qemu/qemuvm.py @@ -7,7 +7,7 @@ import qmp from systemd import journal -from aleph.vm.controllers.configuration import QemuVMConfiguration, QemuGPU +from aleph.vm.controllers.configuration import QemuGPU, QemuVMConfiguration from aleph.vm.controllers.qemu.instance import logger @@ -107,7 +107,7 @@ async def start( # Use host-phys-bits-limit argument for GPU support. 
TODO: Investigate how to get the correct bits size # "-cpu", - "host,host-phys-bits-limit=0x28" + "host,host-phys-bits-limit=0x28", # Uncomment for debug # "-serial", "telnet:localhost:4321,server,nowait", # "-snapshot", # Do not save anything to disk diff --git a/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py b/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py index 868c12c1..1ef33e40 100644 --- a/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py +++ b/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py @@ -107,7 +107,7 @@ async def start( # AMD have different methods) and properly boot. # Use host-phys-bits-limit argument for GPU support. TODO: Investigate how to get the correct bits size "-cpu", - "host,host-phys-bits-limit=0x28" + "host,host-phys-bits-limit=0x28", # Uncomment following for debug # "-serial", "telnet:localhost:4321,server,nowait", # "-snapshot", # Do not save anything to disk diff --git a/src/aleph/vm/models.py b/src/aleph/vm/models.py index 97e846d6..b22c3d39 100644 --- a/src/aleph/vm/models.py +++ b/src/aleph/vm/models.py @@ -16,7 +16,7 @@ from aleph_message.models.execution.environment import HypervisorType from aleph.vm.conf import settings -from aleph.vm.controllers.firecracker.executable import AlephFirecrackerExecutable, HostGPU +from aleph.vm.controllers.firecracker.executable import AlephFirecrackerExecutable from aleph.vm.controllers.firecracker.instance import AlephInstanceResources from aleph.vm.controllers.firecracker.program import ( AlephFirecrackerProgram, @@ -38,7 +38,7 @@ ) from aleph.vm.orchestrator.pubsub import PubSub from aleph.vm.orchestrator.vm import AlephFirecrackerInstance -from aleph.vm.resources import GpuDevice +from aleph.vm.resources import GpuDevice, HostGPU from aleph.vm.systemd import SystemDManager from aleph.vm.utils import create_task_log_exceptions, dumps_for_json @@ -70,7 +70,9 @@ class VmExecution: vm_hash: ItemHash original: ExecutableContent message: ExecutableContent - resources: 
AlephProgramResources | AlephInstanceResources | AlephQemuResources | AlephQemuConfidentialInstance | None = None + resources: ( + AlephProgramResources | AlephInstanceResources | AlephQemuResources | AlephQemuConfidentialInstance | None + ) = None vm: AlephFirecrackerExecutable | AlephQemuInstance | AlephQemuConfidentialInstance | None = None gpus: List[HostGPU] @@ -224,11 +226,7 @@ def prepare_gpus(self, available_gpus: List[GpuDevice]) -> None: for gpu in self.message.requirements.gpu: for available_gpu in available_gpus: if available_gpu.device_id == gpu.device_id: - gpus.append( - HostGPU( - pci_host=available_gpu.pci_host - ) - ) + gpus.append(HostGPU(pci_host=available_gpu.pci_host)) break self.gpus = gpus diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 8b7cbef2..a4f2f2f9 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -19,7 +19,7 @@ from aleph.vm.controllers.firecracker.snapshot_manager import SnapshotManager from aleph.vm.network.hostnetwork import Network, make_ipv6_allocator from aleph.vm.orchestrator.metrics import get_execution_records -from aleph.vm.resources import GpuDevice, get_gpu_devices +from aleph.vm.resources import GpuDevice, HostGPU, get_gpu_devices from aleph.vm.systemd import SystemDManager from aleph.vm.utils import get_message_executable_content from aleph.vm.vm_type import VmType @@ -297,14 +297,12 @@ def get_instance_executions(self) -> Iterable[VmExecution]: return executions or [] def get_available_gpus(self) -> List[GpuDevice]: - available_gpus = ( - gpu - for gpu in self.gpus - for _, execution in self.executions.items() - if (isinstance(execution.resources, AlephQemuResources) or isinstance(execution.resources, AlephQemuConfidentialResources)) and not execution.uses_device_gpu(gpu.pci_host) - ) - - return available_gpus or [] + available_gpus = [] + for gpu in self.gpus: + for _, execution in self.executions.items(): + if not execution.uses_gpu(gpu.pci_host): + available_gpus.append(gpu) + return 
available_gpus def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[str, list[VmExecution]]]: """Return all executions of the given type, grouped by sender and by chain.""" diff --git a/src/aleph/vm/resources.py b/src/aleph/vm/resources.py index 5532c226..b237dc0e 100644 --- a/src/aleph/vm/resources.py +++ b/src/aleph/vm/resources.py @@ -1,4 +1,5 @@ import subprocess +from dataclasses import dataclass from enum import Enum from typing import List, Optional @@ -6,6 +7,11 @@ from pydantic import Extra, Field +@dataclass +class HostGPU: + pci_host: str + + class GpuDeviceClass(str, Enum): VGA_COMPATIBLE_CONTROLLER = "0300" _3D_CONTROLLER = "0302" From 60b2491fae24c7b625b897a3b119f71e0e6b5025 Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Thu, 5 Dec 2024 14:46:02 +0100 Subject: [PATCH 03/11] Fix: Solved compilation issue and fixed gpu logic. --- packaging/Makefile | 2 +- src/aleph/vm/models.py | 16 +++++++++------- src/aleph/vm/pool.py | 8 ++++++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/packaging/Makefile b/packaging/Makefile index 3c4f8a6b..898979b5 100644 --- a/packaging/Makefile +++ b/packaging/Makefile @@ -16,7 +16,7 @@ debian-package-code: cp -r ../examples/data ./aleph-vm/opt/aleph-vm/examples/data mkdir -p ./aleph-vm/opt/aleph-vm/examples/volumes # Fixing this protobuf dependency version to avoid getting CI errors as version 5.29.0 have this compilation issue - pip3 install --progress-bar off --target ./aleph-vm/opt/aleph-vm/ 'aleph-message==0.5.0' 'eth-account==0.10' 'sentry-sdk==1.31.0' 'qmp==1.1.0' 'aleph-superfluid~=0.2.1' 'sqlalchemy[asyncio]>=2.0' 'aiosqlite==0.19.0' 'alembic==1.13.1' 'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' 'python-cpuid==0.1.0' 'solathon==1.0.2' 'protobuf==5.28.3' + pip3 install --progress-bar off --target ./aleph-vm/opt/aleph-vm/ 'aleph-message@git+https://github.com/aleph-im/aleph-message@andres-feature-add_gpu_requirement' 'eth-account==0.10' 'sentry-sdk==1.31.0' 
'qmp==1.1.0' 'aleph-superfluid~=0.2.1' 'sqlalchemy[asyncio]>=2.0' 'aiosqlite==0.19.0' 'alembic==1.13.1' 'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' 'python-cpuid==0.1.0' 'solathon==1.0.2' 'protobuf==5.28.3' python3 -m compileall ./aleph-vm/opt/aleph-vm/ debian-package-resources: firecracker-bins vmlinux download-ipfs-kubo target/bin/sevctl diff --git a/src/aleph/vm/models.py b/src/aleph/vm/models.py index b22c3d39..b0858d33 100644 --- a/src/aleph/vm/models.py +++ b/src/aleph/vm/models.py @@ -13,7 +13,7 @@ ItemHash, ProgramContent, ) -from aleph_message.models.execution.environment import HypervisorType +from aleph_message.models.execution.environment import GpuProperties, HypervisorType from aleph.vm.conf import settings from aleph.vm.controllers.firecracker.executable import AlephFirecrackerExecutable @@ -74,7 +74,7 @@ class VmExecution: AlephProgramResources | AlephInstanceResources | AlephQemuResources | AlephQemuConfidentialInstance | None ) = None vm: AlephFirecrackerExecutable | AlephQemuInstance | AlephQemuConfidentialInstance | None = None - gpus: List[HostGPU] + gpus: List[HostGPU] = [] times: VmExecutionTimes @@ -223,11 +223,13 @@ async def prepare(self) -> None: def prepare_gpus(self, available_gpus: List[GpuDevice]) -> None: gpus = [] - for gpu in self.message.requirements.gpu: - for available_gpu in available_gpus: - if available_gpu.device_id == gpu.device_id: - gpus.append(HostGPU(pci_host=available_gpu.pci_host)) - break + if self.message.requirements and self.message.requirements.gpu: + for gpu in self.message.requirements.gpu: + gpu = GpuProperties.parse_obj(gpu) + for available_gpu in available_gpus: + if available_gpu.device_id == gpu.device_id: + gpus.append(HostGPU(pci_host=available_gpu.pci_host)) + break self.gpus = gpus def uses_gpu(self, pci_host: str) -> bool: diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index a4f2f2f9..4268d9ed 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -299,9 +299,13 @@ def 
get_instance_executions(self) -> Iterable[VmExecution]: def get_available_gpus(self) -> List[GpuDevice]: available_gpus = [] for gpu in self.gpus: + used = False for _, execution in self.executions.items(): - if not execution.uses_gpu(gpu.pci_host): - available_gpus.append(gpu) + if execution.uses_gpu(gpu.pci_host): + used = True + break + if not used: + available_gpus.append(gpu) return available_gpus def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[str, list[VmExecution]]]: From ba1fc9d3ce28078017a3b2f9841376dd3671bd0f Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Fri, 6 Dec 2024 11:17:35 +0100 Subject: [PATCH 04/11] Fix: Solved issue getting already running executions with GPU --- src/aleph/vm/models.py | 3 +++ src/aleph/vm/orchestrator/metrics.py | 2 ++ src/aleph/vm/orchestrator/payment.py | 10 +++++++--- src/aleph/vm/orchestrator/tasks.py | 13 +++++++++---- src/aleph/vm/pool.py | 4 ++-- src/aleph/vm/resources.py | 15 ++++++++++----- 6 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/aleph/vm/models.py b/src/aleph/vm/models.py index b0858d33..7dd59091 100644 --- a/src/aleph/vm/models.py +++ b/src/aleph/vm/models.py @@ -1,4 +1,5 @@ import asyncio +import json import logging import uuid from asyncio import Task @@ -14,6 +15,7 @@ ProgramContent, ) from aleph_message.models.execution.environment import GpuProperties, HypervisorType +from pydantic.json import pydantic_encoder from aleph.vm.conf import settings from aleph.vm.controllers.firecracker.executable import AlephFirecrackerExecutable @@ -460,6 +462,7 @@ async def save(self): message=self.message.json(), original_message=self.original.json(), persistent=self.persistent, + gpus=json.dumps(self.gpus, default=pydantic_encoder), ) ) diff --git a/src/aleph/vm/orchestrator/metrics.py b/src/aleph/vm/orchestrator/metrics.py index f7f16648..6c9b8eea 100644 --- a/src/aleph/vm/orchestrator/metrics.py +++ b/src/aleph/vm/orchestrator/metrics.py @@ -76,6 +76,8 @@ 
class ExecutionRecord(Base): original_message = Column(JSON, nullable=True) persistent = Column(Boolean, nullable=True) + gpus = Column(JSON, nullable=True) + def __repr__(self): return f"" diff --git a/src/aleph/vm/orchestrator/payment.py b/src/aleph/vm/orchestrator/payment.py index 7194f873..5c074ce0 100644 --- a/src/aleph/vm/orchestrator/payment.py +++ b/src/aleph/vm/orchestrator/payment.py @@ -100,9 +100,13 @@ async def get_stream(sender: str, receiver: str, chain: str) -> Decimal: Get the stream of the user from the Superfluid API. See https://community.aleph.im/t/pay-as-you-go-using-superfluid/98/11 """ - chain_info: ChainInfo = get_chain(chain=chain) - if not chain_info.active: - msg = f"Chain : {chain} is not active for superfluid" + try: + chain_info: ChainInfo = get_chain(chain=chain) + if not chain_info.active: + msg = f"Chain : {chain} is not active for superfluid" + raise InvalidChainError(msg) + except ValueError: + msg = f"Chain : {chain} is invalid" raise InvalidChainError(msg) superfluid_instance = CFA_V1(chain_info.rpc, chain_info.chain_id) diff --git a/src/aleph/vm/orchestrator/tasks.py b/src/aleph/vm/orchestrator/tasks.py index 921a2265..1a0afed8 100644 --- a/src/aleph/vm/orchestrator/tasks.py +++ b/src/aleph/vm/orchestrator/tasks.py @@ -4,6 +4,7 @@ import math import time from collections.abc import AsyncIterable +from decimal import Decimal from typing import TypeVar import aiohttp @@ -175,10 +176,14 @@ async def monitor_payments(app: web.Application): # Check if the balance held in the wallet is sufficient stream tier resources for sender, chains in pool.get_executions_by_sender(payment_type=PaymentType.superfluid).items(): for chain, executions in chains.items(): - stream = await get_stream(sender=sender, receiver=settings.PAYMENT_RECEIVER_ADDRESS, chain=chain) - logger.debug( - f"Get stream flow from Sender {sender} to Receiver {settings.PAYMENT_RECEIVER_ADDRESS} of {stream}" - ) + try: + stream = await get_stream(sender=sender, 
receiver=settings.PAYMENT_RECEIVER_ADDRESS, chain=chain) + logger.debug( + f"Get stream flow from Sender {sender} to Receiver {settings.PAYMENT_RECEIVER_ADDRESS} of {stream}" + ) + except ValueError as error: + logger.error(f"Error found getting stream for chain {chain} and sender {sender}: {error}") + stream = Decimal(0) required_stream = await compute_required_flow(executions) logger.debug(f"Required stream for Sender {sender} executions: {required_stream}") diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 4268d9ed..1ffabd41 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -14,6 +14,7 @@ Payment, PaymentType, ) +from pydantic import parse_raw_as from aleph.vm.conf import settings from aleph.vm.controllers.firecracker.snapshot_manager import SnapshotManager @@ -241,8 +242,7 @@ async def load_persistent_executions(self): if execution.is_running: # TODO: Improve the way that we re-create running execution # Load existing GPUs assigned to VMs - for saved_gpu in saved_execution.gpus: - execution.gpus.append(HostGPU(pci_host=saved_gpu.pci_host)) + execution.gpus = parse_raw_as(List[HostGPU], saved_execution.gpus) # Load and instantiate the rest of resources and already assigned GPUs await execution.prepare() if self.network: diff --git a/src/aleph/vm/resources.py b/src/aleph/vm/resources.py index b237dc0e..767b6490 100644 --- a/src/aleph/vm/resources.py +++ b/src/aleph/vm/resources.py @@ -1,18 +1,23 @@ import subprocess -from dataclasses import dataclass from enum import Enum from typing import List, Optional from aleph_message.models import HashableModel -from pydantic import Extra, Field +from pydantic import BaseModel, Extra, Field -@dataclass -class HostGPU: - pci_host: str +class HostGPU(BaseModel): + """Host GPU properties detail.""" + + pci_host: str = Field(description="GPU PCI host address") + + class Config: + extra = Extra.forbid class GpuDeviceClass(str, Enum): + """GPU device class. 
Look at https://admin.pci-ids.ucw.cz/read/PD/03""" + VGA_COMPATIBLE_CONTROLLER = "0300" _3D_CONTROLLER = "0302" From b06594c68acb7cd4ccdd2e9526b4aa6d8e589d22 Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Fri, 6 Dec 2024 12:53:21 +0100 Subject: [PATCH 05/11] Fix: Expose GPU support option in `status/config` endpoint --- src/aleph/vm/orchestrator/views/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/aleph/vm/orchestrator/views/__init__.py b/src/aleph/vm/orchestrator/views/__init__.py index 4c9b3866..c627a612 100644 --- a/src/aleph/vm/orchestrator/views/__init__.py +++ b/src/aleph/vm/orchestrator/views/__init__.py @@ -347,6 +347,7 @@ async def status_public_config(request: web.Request): "ENABLE_QEMU_SUPPORT": settings.ENABLE_QEMU_SUPPORT, "INSTANCE_DEFAULT_HYPERVISOR": settings.INSTANCE_DEFAULT_HYPERVISOR, "ENABLE_CONFIDENTIAL_COMPUTING": settings.ENABLE_CONFIDENTIAL_COMPUTING, + "ENABLE_GPU_SUPPORT": settings.ENABLE_GPU_SUPPORT, }, }, dumps=dumps_for_json, From 70b7390bb59062b87105e11bc0559199d53fb689 Mon Sep 17 00:00:00 2001 From: "Andres D. 
Molins" Date: Mon, 9 Dec 2024 19:34:30 +0100 Subject: [PATCH 06/11] Fix: Applied some code review suggestions --- packaging/Makefile | 2 +- pyproject.toml | 2 +- src/aleph/vm/hypervisors/qemu/qemuvm.py | 10 +++++----- src/aleph/vm/orchestrator/chain.py | 6 +++++- src/aleph/vm/orchestrator/payment.py | 16 ++++------------ src/aleph/vm/orchestrator/tasks.py | 2 +- 6 files changed, 17 insertions(+), 21 deletions(-) diff --git a/packaging/Makefile b/packaging/Makefile index 898979b5..a601f1c9 100644 --- a/packaging/Makefile +++ b/packaging/Makefile @@ -16,7 +16,7 @@ debian-package-code: cp -r ../examples/data ./aleph-vm/opt/aleph-vm/examples/data mkdir -p ./aleph-vm/opt/aleph-vm/examples/volumes # Fixing this protobuf dependency version to avoid getting CI errors as version 5.29.0 have this compilation issue - pip3 install --progress-bar off --target ./aleph-vm/opt/aleph-vm/ 'aleph-message@git+https://github.com/aleph-im/aleph-message@andres-feature-add_gpu_requirement' 'eth-account==0.10' 'sentry-sdk==1.31.0' 'qmp==1.1.0' 'aleph-superfluid~=0.2.1' 'sqlalchemy[asyncio]>=2.0' 'aiosqlite==0.19.0' 'alembic==1.13.1' 'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' 'python-cpuid==0.1.0' 'solathon==1.0.2' 'protobuf==5.28.3' + pip3 install --progress-bar off --target ./aleph-vm/opt/aleph-vm/ 'aleph-message==0.6' 'eth-account==0.10' 'sentry-sdk==1.31.0' 'qmp==1.1.0' 'aleph-superfluid~=0.2.1' 'sqlalchemy[asyncio]>=2.0' 'aiosqlite==0.19.0' 'alembic==1.13.1' 'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' 'python-cpuid==0.1.0' 'solathon==1.0.2' 'protobuf==5.28.3' python3 -m compileall ./aleph-vm/opt/aleph-vm/ debian-package-resources: firecracker-bins vmlinux download-ipfs-kubo target/bin/sevctl diff --git a/pyproject.toml b/pyproject.toml index 31b767d2..1c934ff1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "aioredis==1.3.1", "aiosqlite==0.19", "alembic==1.13.1", - "aleph-message @ 
git+https://github.com/aleph-im/aleph-message@andres-feature-add_gpu_requirement", + "aleph-message==0.6", "aleph-superfluid~=0.2.1", "dbus-python==1.3.2", "eth-account~=0.10", diff --git a/src/aleph/vm/hypervisors/qemu/qemuvm.py b/src/aleph/vm/hypervisors/qemu/qemuvm.py index 9b95f187..36003e59 100644 --- a/src/aleph/vm/hypervisors/qemu/qemuvm.py +++ b/src/aleph/vm/hypervisors/qemu/qemuvm.py @@ -104,10 +104,6 @@ async def start( # Tell to put the output to std fd, so we can include them in the log "-serial", "stdio", - # Use host-phys-bits-limit argument for GPU support. TODO: Investigate how to get the correct bits size - # - "-cpu", - "host,host-phys-bits-limit=0x28", # Uncomment for debug # "-serial", "telnet:localhost:4321,server,nowait", # "-snapshot", # Do not save anything to disk @@ -145,7 +141,11 @@ def _get_host_volumes_args(self): return args def _get_gpu_args(self): - args = [] + args = [ + # Use host-phys-bits-limit argument for GPU support. TODO: Investigate how to get the correct bits size + "-cpu", + "host,host-phys-bits-limit=0x28", + ] for gpu in self.gpus: args += [ "-device", diff --git a/src/aleph/vm/orchestrator/chain.py b/src/aleph/vm/orchestrator/chain.py index 7321aa45..0b417439 100644 --- a/src/aleph/vm/orchestrator/chain.py +++ b/src/aleph/vm/orchestrator/chain.py @@ -60,9 +60,13 @@ def check_tokens(cls, values): } +class InvalidChainError(ValueError): + pass + + def get_chain(chain: str) -> ChainInfo: try: return STREAM_CHAINS[chain] except KeyError: msg = f"Unknown chain id for chain {chain}" - raise ValueError(msg) + raise InvalidChainError(msg) diff --git a/src/aleph/vm/orchestrator/payment.py b/src/aleph/vm/orchestrator/payment.py index 5c074ce0..f5a79bbc 100644 --- a/src/aleph/vm/orchestrator/payment.py +++ b/src/aleph/vm/orchestrator/payment.py @@ -13,7 +13,7 @@ from aleph.vm.models import VmExecution from aleph.vm.utils import to_normalized_address -from .chain import ChainInfo, get_chain +from .chain import ChainInfo, 
InvalidChainError, get_chain logger = logging.getLogger(__name__) @@ -91,22 +91,14 @@ class InvalidAddressError(ValueError): pass -class InvalidChainError(ValueError): - pass - - async def get_stream(sender: str, receiver: str, chain: str) -> Decimal: """ Get the stream of the user from the Superfluid API. See https://community.aleph.im/t/pay-as-you-go-using-superfluid/98/11 """ - try: - chain_info: ChainInfo = get_chain(chain=chain) - if not chain_info.active: - msg = f"Chain : {chain} is not active for superfluid" - raise InvalidChainError(msg) - except ValueError: - msg = f"Chain : {chain} is invalid" + chain_info: ChainInfo = get_chain(chain=chain) + if not chain_info.active: + msg = f"Chain : {chain} is not active for superfluid" raise InvalidChainError(msg) superfluid_instance = CFA_V1(chain_info.rpc, chain_info.chain_id) diff --git a/src/aleph/vm/orchestrator/tasks.py b/src/aleph/vm/orchestrator/tasks.py index 1a0afed8..593c5fcd 100644 --- a/src/aleph/vm/orchestrator/tasks.py +++ b/src/aleph/vm/orchestrator/tasks.py @@ -183,7 +183,7 @@ async def monitor_payments(app: web.Application): ) except ValueError as error: logger.error(f"Error found getting stream for chain {chain} and sender {sender}: {error}") - stream = Decimal(0) + continue required_stream = await compute_required_flow(executions) logger.debug(f"Required stream for Sender {sender} executions: {required_stream}") From d919116145092f59393cbf8779bc22ed88a21e65 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 9 Dec 2024 20:43:02 +0100 Subject: [PATCH 07/11] Add migration --- .../versions/5c6ae643c69b_add_gpu_table.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py diff --git a/src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py b/src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py new file mode 100644 index 00000000..db982716 --- 
/dev/null +++ b/src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py @@ -0,0 +1,28 @@ +"""add gpu table + +Revision ID: 5c6ae643c69b +Revises: bbb12a12372e +Create Date: 2024-12-09 19:40:19.279735 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "5c6ae643c69b" +down_revision = "bbb12a12372e" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("executions", sa.Column("gpus", sa.JSON(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("executions", "gpus") + # ### end Alembic commands ### From d847b05df18f2cc31f1a3c1ff12b7071c250bd24 Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Mon, 9 Dec 2024 22:04:29 +0100 Subject: [PATCH 08/11] Fix: Allow to use the notify endpoint for GPU instances also. 
--- src/aleph/vm/orchestrator/views/__init__.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/aleph/vm/orchestrator/views/__init__.py b/src/aleph/vm/orchestrator/views/__init__.py index c627a612..9f0d6b32 100644 --- a/src/aleph/vm/orchestrator/views/__init__.py +++ b/src/aleph/vm/orchestrator/views/__init__.py @@ -478,10 +478,14 @@ async def notify_allocation(request: web.Request): payment_type = message.content.payment and message.content.payment.type or PaymentType.hold is_confidential = message.content.environment.trusted_execution is not None - - if payment_type == PaymentType.hold and is_confidential: - # At the moment we will allow hold for PAYG - logger.debug("Confidential instance not using PAYG") + have_gpu = message.content.requirements and message.content.requirements.gpu is not None + + if payment_type == PaymentType.hold and (is_confidential or have_gpu): + # Log confidential and instances with GPU support + if is_confidential: + logger.debug(f"Confidential instance {item_hash} not using PAYG") + if have_gpu: + logger.debug(f"GPU Instance {item_hash} not using PAYG") user_balance = await payment.fetch_balance_of_address(message.sender) hold_price = await payment.fetch_execution_hold_price(item_hash) logger.debug(f"Address {message.sender} Balance: {user_balance}, Price: {hold_price}") From 7c78202554c2f6b2e452005505ae505beca2e68a Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Wed, 11 Dec 2024 17:44:54 +0100 Subject: [PATCH 09/11] Fix: Remove migration duplication.
--- ...> 0002_5c6ae643c69b_add_gpu_column_to_executions_table.py} | 4 ---- 1 file changed, 4 deletions(-) rename src/aleph/vm/orchestrator/migrations/versions/{5c6ae643c69b_add_gpu_table.py => 0002_5c6ae643c69b_add_gpu_column_to_executions_table.py} (69%) diff --git a/src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py b/src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py similarity index 69% rename from src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py rename to src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py index db982716..08c200ec 100644 --- a/src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py +++ b/src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py @@ -17,12 +17,8 @@ def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### op.add_column("executions", sa.Column("gpus", sa.JSON(), nullable=True)) - # ### end Alembic commands ### def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### op.drop_column("executions", "gpus") - # ### end Alembic commands ### From 73035874023ec2caae03288df7827f7793bd1c9c Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Wed, 11 Dec 2024 19:33:19 +0100 Subject: [PATCH 10/11] Fix: Changes DB initialization order to ensure that DB always exists before running the migrations. --- src/aleph/vm/orchestrator/cli.py | 7 ++++--- src/aleph/vm/orchestrator/supervisor.py | 4 ---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/aleph/vm/orchestrator/cli.py b/src/aleph/vm/orchestrator/cli.py index bbae396d..740733e6 100644 --- a/src/aleph/vm/orchestrator/cli.py +++ b/src/aleph/vm/orchestrator/cli.py @@ -167,9 +167,6 @@ async def benchmark(runs: int): """Measure program performance by immediately running the supervisor with fake requests. 
""" - engine = metrics.setup_engine() - await metrics.create_tables(engine) - ref = ItemHash("cafecafecafecafecafecafecafecafecafecafecafecafecafecafecafecafe") settings.FAKE_DATA_PROGRAM = settings.BENCHMARK_FAKE_DATA_PROGRAM @@ -357,6 +354,10 @@ def main(): settings.check() logger.debug("Initialising the DB...") + # Check and create execution database + engine = metrics.setup_engine() + asyncio.run(metrics.create_tables(engine)) + # After creating it run the DB migrations asyncio.run(run_async_db_migrations()) logger.debug("DB up to date.") diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index a5ca999a..ae643629 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -20,7 +20,6 @@ from aleph.vm.sevclient import SevClient from aleph.vm.version import __version__ -from .metrics import create_tables, setup_engine from .resources import about_certificates, about_system_usage from .tasks import ( start_payment_monitoring_task, @@ -151,9 +150,6 @@ def run(): """Run the VM Supervisor.""" settings.check() - engine = setup_engine() - asyncio.run(create_tables(engine)) - loop = asyncio.new_event_loop() pool = VmPool(loop) pool.setup() From 11141af1199cc0f9dea83a7b27e2951e0d7eb62a Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Wed, 11 Dec 2024 20:26:28 +0100 Subject: [PATCH 11/11] Fix: Updated migration to only insert the column if isn't inside. 
--- ...643c69b_add_gpu_column_to_executions_table.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py b/src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py index 08c200ec..4b739323 100644 --- a/src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py +++ b/src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py @@ -10,6 +10,11 @@ from alembic import op # revision identifiers, used by Alembic. +from sqlalchemy import create_engine +from sqlalchemy.engine import reflection + +from aleph.vm.conf import make_db_url + revision = "5c6ae643c69b" down_revision = "bbb12a12372e" branch_labels = None @@ -17,7 +22,16 @@ def upgrade() -> None: - op.add_column("executions", sa.Column("gpus", sa.JSON(), nullable=True)) + engine = create_engine(make_db_url()) + inspector = reflection.Inspector.from_engine(engine) + + # The table already exists on most CRNs. + tables = inspector.get_table_names() + if "executions" in tables: + columns = inspector.get_columns("executions") + column_names = [c["name"] for c in columns] + if "gpus" not in column_names: + op.add_column("executions", sa.Column("gpus", sa.JSON(), nullable=True)) def downgrade() -> None: