From aaaec2aab07672188aa36708d7a44cbef26fc4cc Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Tue, 3 Dec 2024 22:27:21 +0100 Subject: [PATCH 01/11] Problem: If a user wants to assign a GPU to a QEmu VM he cannot do it. Solution: Implement GPU assignation feature that will be pass-though to QEmu VMs with native performance. --- pyproject.toml | 2 +- src/aleph/vm/controllers/configuration.py | 6 ++++ .../vm/controllers/firecracker/executable.py | 5 +++ src/aleph/vm/controllers/qemu/instance.py | 10 +++++- .../controllers/qemu_confidential/instance.py | 5 +++ src/aleph/vm/hypervisors/qemu/qemuvm.py | 34 +++++++++++++++---- .../hypervisors/qemu_confidential/qemuvm.py | 12 +++---- src/aleph/vm/models.py | 31 ++++++++++++++--- src/aleph/vm/pool.py | 19 +++++++++-- 9 files changed, 103 insertions(+), 21 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index faebfb9a..31b767d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "aioredis==1.3.1", "aiosqlite==0.19", "alembic==1.13.1", - "aleph-message==0.5", + "aleph-message @ git+https://github.com/aleph-im/aleph-message@andres-feature-add_gpu_requirement", "aleph-superfluid~=0.2.1", "dbus-python==1.3.2", "eth-account~=0.10", diff --git a/src/aleph/vm/controllers/configuration.py b/src/aleph/vm/controllers/configuration.py index da10d839..fb4b4ff1 100644 --- a/src/aleph/vm/controllers/configuration.py +++ b/src/aleph/vm/controllers/configuration.py @@ -23,6 +23,10 @@ class QemuVMHostVolume(BaseModel): read_only: bool +class QemuGPU(BaseModel): + pci_host: str + + class QemuVMConfiguration(BaseModel): qemu_bin_path: str cloud_init_drive_path: str | None @@ -33,6 +37,7 @@ class QemuVMConfiguration(BaseModel): mem_size_mb: int interface_name: str | None host_volumes: list[QemuVMHostVolume] + gpus: list[QemuGPU] class QemuConfidentialVMConfiguration(BaseModel): @@ -45,6 +50,7 @@ class QemuConfidentialVMConfiguration(BaseModel): mem_size_mb: int interface_name: str | None 
host_volumes: list[QemuVMHostVolume] + gpus: list[QemuGPU] ovmf_path: Path sev_session_file: Path sev_dh_cert_file: Path diff --git a/src/aleph/vm/controllers/firecracker/executable.py b/src/aleph/vm/controllers/firecracker/executable.py index cbbad03c..840dcb9b 100644 --- a/src/aleph/vm/controllers/firecracker/executable.py +++ b/src/aleph/vm/controllers/firecracker/executable.py @@ -72,6 +72,11 @@ class HostVolume: read_only: bool +@dataclass +class HostGPU: + pci_host: str + + @dataclass class BaseConfiguration: vm_hash: ItemHash diff --git a/src/aleph/vm/controllers/qemu/instance.py b/src/aleph/vm/controllers/qemu/instance.py index dd840e22..51392c5f 100644 --- a/src/aleph/vm/controllers/qemu/instance.py +++ b/src/aleph/vm/controllers/qemu/instance.py @@ -5,7 +5,7 @@ from asyncio import Task from asyncio.subprocess import Process from pathlib import Path -from typing import Generic, TypeVar +from typing import Generic, TypeVar, List import psutil from aleph_message.models import ItemHash @@ -19,11 +19,13 @@ HypervisorType, QemuVMConfiguration, QemuVMHostVolume, + QemuGPU, save_controller_configuration, ) from aleph.vm.controllers.firecracker.executable import ( AlephFirecrackerResources, VmSetupError, + HostGPU, ) from aleph.vm.controllers.interface import AlephVmControllerInterface from aleph.vm.controllers.qemu.cloudinit import CloudInitMixin @@ -36,6 +38,8 @@ class AlephQemuResources(AlephFirecrackerResources): + gpus: list[HostGPU] + async def download_runtime(self) -> None: volume = self.message_content.rootfs parent_image_path = await get_rootfs_base_path(volume.parent.ref) @@ -200,6 +204,10 @@ async def configure(self): ) for volume in self.resources.volumes ], + gpus=[ + QemuGPU(pci_host=gpu.pci_host) + for gpu in self.resources.gpus + ] ) configuration = Configuration( diff --git a/src/aleph/vm/controllers/qemu_confidential/instance.py b/src/aleph/vm/controllers/qemu_confidential/instance.py index f432cff6..2908e87e 100644 --- 
a/src/aleph/vm/controllers/qemu_confidential/instance.py +++ b/src/aleph/vm/controllers/qemu_confidential/instance.py @@ -13,6 +13,7 @@ Configuration, HypervisorType, QemuConfidentialVMConfiguration, + QemuGPU, QemuVMHostVolume, save_controller_configuration, ) @@ -126,6 +127,10 @@ async def configure(self): ) for volume in self.resources.volumes ], + gpus=[ + QemuGPU(pci_host=gpu.pci_host) + for gpu in self.resources.gpus + ] ) configuration = Configuration( diff --git a/src/aleph/vm/hypervisors/qemu/qemuvm.py b/src/aleph/vm/hypervisors/qemu/qemuvm.py index 5949fbdc..d6c9274d 100644 --- a/src/aleph/vm/hypervisors/qemu/qemuvm.py +++ b/src/aleph/vm/hypervisors/qemu/qemuvm.py @@ -7,7 +7,7 @@ import qmp from systemd import journal -from aleph.vm.controllers.configuration import QemuVMConfiguration +from aleph.vm.controllers.configuration import QemuVMConfiguration, QemuGPU from aleph.vm.controllers.qemu.instance import logger @@ -28,6 +28,7 @@ class QemuVM: interface_name: str qemu_process: Process | None = None host_volumes: list[HostVolume] + gpus: list[QemuGPU] journal_stdout: TextIO | None journal_stderr: TextIO | None @@ -55,6 +56,7 @@ def __init__(self, vm_hash, config: QemuVMConfiguration): ) for volume in config.host_volumes ] + self.gpus = config.gpus @property def _journal_stdout_name(self) -> str: @@ -102,21 +104,23 @@ async def start( # Tell to put the output to std fd, so we can include them in the log "-serial", "stdio", + # Use host-phys-bits-limit argument for GPU support. 
TODO: Investigate how to get the correct bits size + # + "-cpu", + "host,host-phys-bits-limit=0x28" # Uncomment for debug # "-serial", "telnet:localhost:4321,server,nowait", # "-snapshot", # Do not save anything to disk ] - for volume in self.host_volumes: - args += [ - "-drive", - f"file={volume.path_on_host},format=raw,readonly={'on' if volume.read_only else 'off'},media=disk,if=virtio", - ] if self.interface_name: # script=no, downscript=no tell qemu not to try to set up the network itself args += ["-net", "nic,model=virtio", "-net", f"tap,ifname={self.interface_name},script=no,downscript=no"] if self.cloud_init_drive_path: args += ["-cdrom", f"{self.cloud_init_drive_path}"] + + args += self._get_host_volumes_args() + args += self._get_gpu_args() print(*args) self.qemu_process = proc = await asyncio.create_subprocess_exec( @@ -131,6 +135,24 @@ async def start( ) return proc + def _get_host_volumes_args(self): + args = [] + for volume in self.host_volumes: + args += [ + "-drive", + f"file={volume.path_on_host},format=raw,readonly={'on' if volume.read_only else 'off'},media=disk,if=virtio", + ] + return args + + def _get_gpu_args(self): + args = [] + for gpu in self.gpus: + args += [ + "-device", + f"vfio-pci,host={gpu.pci_host},multifunction=on,x-vga=on", + ] + return args + def _get_qmpclient(self) -> qmp.QEMUMonitorProtocol | None: if not (self.qmp_socket_path and self.qmp_socket_path.exists()): return None diff --git a/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py b/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py index 5e32e899..868c12c1 100644 --- a/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py +++ b/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py @@ -105,24 +105,24 @@ async def start( # raise an error and prevent boot. Passing the argument --cpu host instruct the VM to use the same CPU # model than the host thus the VM's kernel knows which method is used to get random numbers (Intel and # AMD have different methods) and properly boot. 
+ # Use host-phys-bits-limit argument for GPU support. TODO: Investigate how to get the correct bits size "-cpu", - "host", + "host,host-phys-bits-limit=0x28" # Uncomment following for debug # "-serial", "telnet:localhost:4321,server,nowait", # "-snapshot", # Do not save anything to disk ] - for volume in self.host_volumes: - args += [ - "-drive", - f"file={volume.path_on_host},format=raw,readonly={'on' if volume.read_only else 'off'},media=disk,if=virtio", - ] if self.interface_name: # script=no, downscript=no tell qemu not to try to set up the network itself args += ["-net", "nic,model=virtio", "-net", f"tap,ifname={self.interface_name},script=no,downscript=no"] if self.cloud_init_drive_path: args += ["-cdrom", f"{self.cloud_init_drive_path}"] + + args += self._get_host_volumes_args() + args += self._get_gpu_args() print(*args) + self.qemu_process = proc = await asyncio.create_subprocess_exec( *args, stdin=asyncio.subprocess.DEVNULL, diff --git a/src/aleph/vm/models.py b/src/aleph/vm/models.py index 9aee9320..97e846d6 100644 --- a/src/aleph/vm/models.py +++ b/src/aleph/vm/models.py @@ -5,6 +5,7 @@ from collections.abc import Callable, Coroutine from dataclasses import dataclass from datetime import datetime, timezone +from typing import List from aleph_message.models import ( ExecutableContent, @@ -15,11 +16,10 @@ from aleph_message.models.execution.environment import HypervisorType from aleph.vm.conf import settings -from aleph.vm.controllers.firecracker.executable import AlephFirecrackerExecutable +from aleph.vm.controllers.firecracker.executable import AlephFirecrackerExecutable, HostGPU from aleph.vm.controllers.firecracker.instance import AlephInstanceResources from aleph.vm.controllers.firecracker.program import ( AlephFirecrackerProgram, - AlephFirecrackerResources, AlephProgramResources, ) from aleph.vm.controllers.firecracker.snapshot_manager import SnapshotManager @@ -38,6 +38,7 @@ ) from aleph.vm.orchestrator.pubsub import PubSub from 
aleph.vm.orchestrator.vm import AlephFirecrackerInstance +from aleph.vm.resources import GpuDevice from aleph.vm.systemd import SystemDManager from aleph.vm.utils import create_task_log_exceptions, dumps_for_json @@ -69,8 +70,9 @@ class VmExecution: vm_hash: ItemHash original: ExecutableContent message: ExecutableContent - resources: AlephFirecrackerResources | None = None - vm: AlephFirecrackerExecutable | AlephQemuInstance | None = None + resources: AlephProgramResources | AlephInstanceResources | AlephQemuResources | AlephQemuConfidentialInstance | None = None + vm: AlephFirecrackerExecutable | AlephQemuInstance | AlephQemuConfidentialInstance | None = None + gpus: List[HostGPU] times: VmExecutionTimes @@ -202,6 +204,7 @@ async def prepare(self) -> None: resources = AlephQemuConfidentialResources(self.message, namespace=self.vm_hash) else: resources = AlephQemuResources(self.message, namespace=self.vm_hash) + resources.gpus = self.gpus else: msg = f"Unknown hypervisor type {self.hypervisor}" raise ValueError(msg) @@ -216,6 +219,26 @@ async def prepare(self) -> None: self.times.prepared_at = datetime.now(tz=timezone.utc) self.resources = resources + def prepare_gpus(self, available_gpus: List[GpuDevice]) -> None: + gpus = [] + for gpu in self.message.requirements.gpu: + for available_gpu in available_gpus: + if available_gpu.device_id == gpu.device_id: + gpus.append( + HostGPU( + pci_host=available_gpu.pci_host + ) + ) + break + self.gpus = gpus + + def uses_gpu(self, pci_host: str) -> bool: + for gpu in self.gpus: + if gpu.pci_host == pci_host: + return True + + return False + def create( self, vm_id: int, tap_interface: TapInterface | None = None, prepare: bool = True ) -> AlephVmControllerInterface: diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index dfc4242e..8b7cbef2 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -112,7 +112,11 @@ async def create_a_vm( self.executions[vm_hash] = execution try: + # First assign Host GPUs from the 
available + execution.prepare_gpus(self.get_available_gpus()) + # Prepare VM general Resources and also the GPUs await execution.prepare() + vm_id = self.get_unique_vm_id() if self.network: @@ -236,6 +240,10 @@ async def load_persistent_executions(self): if execution.is_running: # TODO: Improve the way that we re-create running execution + # Load existing GPUs assigned to VMs + for saved_gpu in saved_execution.gpus: + execution.gpus.append(HostGPU(pci_host=saved_gpu.pci_host)) + # Load and instantiate the rest of resources and already assigned GPUs await execution.prepare() if self.network: vm_type = VmType.from_message_content(execution.message) @@ -288,9 +296,14 @@ def get_instance_executions(self) -> Iterable[VmExecution]: ) return executions or [] - def get_available_gpus(self) -> Iterable[GpuDevice]: - # TODO: Filter already used GPUs on current executions and remove it from available - available_gpus = self.gpus + def get_available_gpus(self) -> List[GpuDevice]: + available_gpus = ( + gpu + for gpu in self.gpus + for _, execution in self.executions.items() + if (isinstance(execution.resources, AlephQemuResources) or isinstance(execution.resources, AlephQemuConfidentialResources)) and not execution.uses_device_gpu(gpu.pci_host) + ) + return available_gpus or [] def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[str, list[VmExecution]]]: From 24516a2d771f199c71bab6a293b82d38ceee637d Mon Sep 17 00:00:00 2001 From: "Andres D. 
Molins" Date: Tue, 3 Dec 2024 22:35:45 +0100 Subject: [PATCH 02/11] Fix: Solved code quality issues --- .../vm/controllers/firecracker/executable.py | 5 ----- src/aleph/vm/controllers/qemu/instance.py | 13 +++++-------- .../vm/controllers/qemu_confidential/instance.py | 5 +---- src/aleph/vm/hypervisors/qemu/qemuvm.py | 4 ++-- .../vm/hypervisors/qemu_confidential/qemuvm.py | 2 +- src/aleph/vm/models.py | 14 ++++++-------- src/aleph/vm/pool.py | 16 +++++++--------- src/aleph/vm/resources.py | 6 ++++++ 8 files changed, 28 insertions(+), 37 deletions(-) diff --git a/src/aleph/vm/controllers/firecracker/executable.py b/src/aleph/vm/controllers/firecracker/executable.py index 840dcb9b..cbbad03c 100644 --- a/src/aleph/vm/controllers/firecracker/executable.py +++ b/src/aleph/vm/controllers/firecracker/executable.py @@ -72,11 +72,6 @@ class HostVolume: read_only: bool -@dataclass -class HostGPU: - pci_host: str - - @dataclass class BaseConfiguration: vm_hash: ItemHash diff --git a/src/aleph/vm/controllers/qemu/instance.py b/src/aleph/vm/controllers/qemu/instance.py index 51392c5f..259f8474 100644 --- a/src/aleph/vm/controllers/qemu/instance.py +++ b/src/aleph/vm/controllers/qemu/instance.py @@ -5,7 +5,7 @@ from asyncio import Task from asyncio.subprocess import Process from pathlib import Path -from typing import Generic, TypeVar, List +from typing import Generic, List, TypeVar import psutil from aleph_message.models import ItemHash @@ -17,20 +17,20 @@ from aleph.vm.controllers.configuration import ( Configuration, HypervisorType, + QemuGPU, QemuVMConfiguration, QemuVMHostVolume, - QemuGPU, save_controller_configuration, ) from aleph.vm.controllers.firecracker.executable import ( AlephFirecrackerResources, VmSetupError, - HostGPU, ) from aleph.vm.controllers.interface import AlephVmControllerInterface from aleph.vm.controllers.qemu.cloudinit import CloudInitMixin from aleph.vm.network.firewall import teardown_nftables_for_vm from aleph.vm.network.interfaces import 
TapInterface +from aleph.vm.resources import HostGPU from aleph.vm.storage import get_rootfs_base_path from aleph.vm.utils import HostNotFoundError, ping, run_in_subprocess @@ -38,7 +38,7 @@ class AlephQemuResources(AlephFirecrackerResources): - gpus: list[HostGPU] + gpus: List[HostGPU] = [] async def download_runtime(self) -> None: volume = self.message_content.rootfs @@ -204,10 +204,7 @@ async def configure(self): ) for volume in self.resources.volumes ], - gpus=[ - QemuGPU(pci_host=gpu.pci_host) - for gpu in self.resources.gpus - ] + gpus=[QemuGPU(pci_host=gpu.pci_host) for gpu in self.resources.gpus], ) configuration = Configuration( diff --git a/src/aleph/vm/controllers/qemu_confidential/instance.py b/src/aleph/vm/controllers/qemu_confidential/instance.py index 2908e87e..37986b10 100644 --- a/src/aleph/vm/controllers/qemu_confidential/instance.py +++ b/src/aleph/vm/controllers/qemu_confidential/instance.py @@ -127,10 +127,7 @@ async def configure(self): ) for volume in self.resources.volumes ], - gpus=[ - QemuGPU(pci_host=gpu.pci_host) - for gpu in self.resources.gpus - ] + gpus=[QemuGPU(pci_host=gpu.pci_host) for gpu in self.resources.gpus], ) configuration = Configuration( diff --git a/src/aleph/vm/hypervisors/qemu/qemuvm.py b/src/aleph/vm/hypervisors/qemu/qemuvm.py index d6c9274d..9b95f187 100644 --- a/src/aleph/vm/hypervisors/qemu/qemuvm.py +++ b/src/aleph/vm/hypervisors/qemu/qemuvm.py @@ -7,7 +7,7 @@ import qmp from systemd import journal -from aleph.vm.controllers.configuration import QemuVMConfiguration, QemuGPU +from aleph.vm.controllers.configuration import QemuGPU, QemuVMConfiguration from aleph.vm.controllers.qemu.instance import logger @@ -107,7 +107,7 @@ async def start( # Use host-phys-bits-limit argument for GPU support. 
TODO: Investigate how to get the correct bits size # "-cpu", - "host,host-phys-bits-limit=0x28" + "host,host-phys-bits-limit=0x28", # Uncomment for debug # "-serial", "telnet:localhost:4321,server,nowait", # "-snapshot", # Do not save anything to disk diff --git a/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py b/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py index 868c12c1..1ef33e40 100644 --- a/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py +++ b/src/aleph/vm/hypervisors/qemu_confidential/qemuvm.py @@ -107,7 +107,7 @@ async def start( # AMD have different methods) and properly boot. # Use host-phys-bits-limit argument for GPU support. TODO: Investigate how to get the correct bits size "-cpu", - "host,host-phys-bits-limit=0x28" + "host,host-phys-bits-limit=0x28", # Uncomment following for debug # "-serial", "telnet:localhost:4321,server,nowait", # "-snapshot", # Do not save anything to disk diff --git a/src/aleph/vm/models.py b/src/aleph/vm/models.py index 97e846d6..b22c3d39 100644 --- a/src/aleph/vm/models.py +++ b/src/aleph/vm/models.py @@ -16,7 +16,7 @@ from aleph_message.models.execution.environment import HypervisorType from aleph.vm.conf import settings -from aleph.vm.controllers.firecracker.executable import AlephFirecrackerExecutable, HostGPU +from aleph.vm.controllers.firecracker.executable import AlephFirecrackerExecutable from aleph.vm.controllers.firecracker.instance import AlephInstanceResources from aleph.vm.controllers.firecracker.program import ( AlephFirecrackerProgram, @@ -38,7 +38,7 @@ ) from aleph.vm.orchestrator.pubsub import PubSub from aleph.vm.orchestrator.vm import AlephFirecrackerInstance -from aleph.vm.resources import GpuDevice +from aleph.vm.resources import GpuDevice, HostGPU from aleph.vm.systemd import SystemDManager from aleph.vm.utils import create_task_log_exceptions, dumps_for_json @@ -70,7 +70,9 @@ class VmExecution: vm_hash: ItemHash original: ExecutableContent message: ExecutableContent - resources: 
AlephProgramResources | AlephInstanceResources | AlephQemuResources | AlephQemuConfidentialInstance | None = None + resources: ( + AlephProgramResources | AlephInstanceResources | AlephQemuResources | AlephQemuConfidentialInstance | None + ) = None vm: AlephFirecrackerExecutable | AlephQemuInstance | AlephQemuConfidentialInstance | None = None gpus: List[HostGPU] @@ -224,11 +226,7 @@ def prepare_gpus(self, available_gpus: List[GpuDevice]) -> None: for gpu in self.message.requirements.gpu: for available_gpu in available_gpus: if available_gpu.device_id == gpu.device_id: - gpus.append( - HostGPU( - pci_host=available_gpu.pci_host - ) - ) + gpus.append(HostGPU(pci_host=available_gpu.pci_host)) break self.gpus = gpus diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 8b7cbef2..a4f2f2f9 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -19,7 +19,7 @@ from aleph.vm.controllers.firecracker.snapshot_manager import SnapshotManager from aleph.vm.network.hostnetwork import Network, make_ipv6_allocator from aleph.vm.orchestrator.metrics import get_execution_records -from aleph.vm.resources import GpuDevice, get_gpu_devices +from aleph.vm.resources import GpuDevice, HostGPU, get_gpu_devices from aleph.vm.systemd import SystemDManager from aleph.vm.utils import get_message_executable_content from aleph.vm.vm_type import VmType @@ -297,14 +297,12 @@ def get_instance_executions(self) -> Iterable[VmExecution]: return executions or [] def get_available_gpus(self) -> List[GpuDevice]: - available_gpus = ( - gpu - for gpu in self.gpus - for _, execution in self.executions.items() - if (isinstance(execution.resources, AlephQemuResources) or isinstance(execution.resources, AlephQemuConfidentialResources)) and not execution.uses_device_gpu(gpu.pci_host) - ) - - return available_gpus or [] + available_gpus = [] + for gpu in self.gpus: + for _, execution in self.executions.items(): + if not execution.uses_gpu(gpu.pci_host): + available_gpus.append(gpu) + return 
available_gpus def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[str, list[VmExecution]]]: """Return all executions of the given type, grouped by sender and by chain.""" diff --git a/src/aleph/vm/resources.py b/src/aleph/vm/resources.py index 5532c226..b237dc0e 100644 --- a/src/aleph/vm/resources.py +++ b/src/aleph/vm/resources.py @@ -1,4 +1,5 @@ import subprocess +from dataclasses import dataclass from enum import Enum from typing import List, Optional @@ -6,6 +7,11 @@ from pydantic import Extra, Field +@dataclass +class HostGPU: + pci_host: str + + class GpuDeviceClass(str, Enum): VGA_COMPATIBLE_CONTROLLER = "0300" _3D_CONTROLLER = "0302" From 60b2491fae24c7b625b897a3b119f71e0e6b5025 Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Thu, 5 Dec 2024 14:46:02 +0100 Subject: [PATCH 03/11] Fix: Solved compilation issue and fixed gpu logic. --- packaging/Makefile | 2 +- src/aleph/vm/models.py | 16 +++++++++------- src/aleph/vm/pool.py | 8 ++++++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/packaging/Makefile b/packaging/Makefile index 3c4f8a6b..898979b5 100644 --- a/packaging/Makefile +++ b/packaging/Makefile @@ -16,7 +16,7 @@ debian-package-code: cp -r ../examples/data ./aleph-vm/opt/aleph-vm/examples/data mkdir -p ./aleph-vm/opt/aleph-vm/examples/volumes # Fixing this protobuf dependency version to avoid getting CI errors as version 5.29.0 have this compilation issue - pip3 install --progress-bar off --target ./aleph-vm/opt/aleph-vm/ 'aleph-message==0.5.0' 'eth-account==0.10' 'sentry-sdk==1.31.0' 'qmp==1.1.0' 'aleph-superfluid~=0.2.1' 'sqlalchemy[asyncio]>=2.0' 'aiosqlite==0.19.0' 'alembic==1.13.1' 'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' 'python-cpuid==0.1.0' 'solathon==1.0.2' 'protobuf==5.28.3' + pip3 install --progress-bar off --target ./aleph-vm/opt/aleph-vm/ 'aleph-message@git+https://github.com/aleph-im/aleph-message@andres-feature-add_gpu_requirement' 'eth-account==0.10' 'sentry-sdk==1.31.0' 
'qmp==1.1.0' 'aleph-superfluid~=0.2.1' 'sqlalchemy[asyncio]>=2.0' 'aiosqlite==0.19.0' 'alembic==1.13.1' 'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' 'python-cpuid==0.1.0' 'solathon==1.0.2' 'protobuf==5.28.3' python3 -m compileall ./aleph-vm/opt/aleph-vm/ debian-package-resources: firecracker-bins vmlinux download-ipfs-kubo target/bin/sevctl diff --git a/src/aleph/vm/models.py b/src/aleph/vm/models.py index b22c3d39..b0858d33 100644 --- a/src/aleph/vm/models.py +++ b/src/aleph/vm/models.py @@ -13,7 +13,7 @@ ItemHash, ProgramContent, ) -from aleph_message.models.execution.environment import HypervisorType +from aleph_message.models.execution.environment import GpuProperties, HypervisorType from aleph.vm.conf import settings from aleph.vm.controllers.firecracker.executable import AlephFirecrackerExecutable @@ -74,7 +74,7 @@ class VmExecution: AlephProgramResources | AlephInstanceResources | AlephQemuResources | AlephQemuConfidentialInstance | None ) = None vm: AlephFirecrackerExecutable | AlephQemuInstance | AlephQemuConfidentialInstance | None = None - gpus: List[HostGPU] + gpus: List[HostGPU] = [] times: VmExecutionTimes @@ -223,11 +223,13 @@ async def prepare(self) -> None: def prepare_gpus(self, available_gpus: List[GpuDevice]) -> None: gpus = [] - for gpu in self.message.requirements.gpu: - for available_gpu in available_gpus: - if available_gpu.device_id == gpu.device_id: - gpus.append(HostGPU(pci_host=available_gpu.pci_host)) - break + if self.message.requirements and self.message.requirements.gpu: + for gpu in self.message.requirements.gpu: + gpu = GpuProperties.parse_obj(gpu) + for available_gpu in available_gpus: + if available_gpu.device_id == gpu.device_id: + gpus.append(HostGPU(pci_host=available_gpu.pci_host)) + break self.gpus = gpus def uses_gpu(self, pci_host: str) -> bool: diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index a4f2f2f9..4268d9ed 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -299,9 +299,13 @@ def 
get_instance_executions(self) -> Iterable[VmExecution]: def get_available_gpus(self) -> List[GpuDevice]: available_gpus = [] for gpu in self.gpus: + used = False for _, execution in self.executions.items(): - if not execution.uses_gpu(gpu.pci_host): - available_gpus.append(gpu) + if execution.uses_gpu(gpu.pci_host): + used = True + break + if not used: + available_gpus.append(gpu) return available_gpus def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[str, list[VmExecution]]]: From ba1fc9d3ce28078017a3b2f9841376dd3671bd0f Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Fri, 6 Dec 2024 11:17:35 +0100 Subject: [PATCH 04/11] Fix: Solved issue getting already running executions with GPU --- src/aleph/vm/models.py | 3 +++ src/aleph/vm/orchestrator/metrics.py | 2 ++ src/aleph/vm/orchestrator/payment.py | 10 +++++++--- src/aleph/vm/orchestrator/tasks.py | 13 +++++++++---- src/aleph/vm/pool.py | 4 ++-- src/aleph/vm/resources.py | 15 ++++++++++----- 6 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/aleph/vm/models.py b/src/aleph/vm/models.py index b0858d33..7dd59091 100644 --- a/src/aleph/vm/models.py +++ b/src/aleph/vm/models.py @@ -1,4 +1,5 @@ import asyncio +import json import logging import uuid from asyncio import Task @@ -14,6 +15,7 @@ ProgramContent, ) from aleph_message.models.execution.environment import GpuProperties, HypervisorType +from pydantic.json import pydantic_encoder from aleph.vm.conf import settings from aleph.vm.controllers.firecracker.executable import AlephFirecrackerExecutable @@ -460,6 +462,7 @@ async def save(self): message=self.message.json(), original_message=self.original.json(), persistent=self.persistent, + gpus=json.dumps(self.gpus, default=pydantic_encoder), ) ) diff --git a/src/aleph/vm/orchestrator/metrics.py b/src/aleph/vm/orchestrator/metrics.py index f7f16648..6c9b8eea 100644 --- a/src/aleph/vm/orchestrator/metrics.py +++ b/src/aleph/vm/orchestrator/metrics.py @@ -76,6 +76,8 @@ 
class ExecutionRecord(Base): original_message = Column(JSON, nullable=True) persistent = Column(Boolean, nullable=True) + gpus = Column(JSON, nullable=True) + def __repr__(self): return f"" diff --git a/src/aleph/vm/orchestrator/payment.py b/src/aleph/vm/orchestrator/payment.py index 7194f873..5c074ce0 100644 --- a/src/aleph/vm/orchestrator/payment.py +++ b/src/aleph/vm/orchestrator/payment.py @@ -100,9 +100,13 @@ async def get_stream(sender: str, receiver: str, chain: str) -> Decimal: Get the stream of the user from the Superfluid API. See https://community.aleph.im/t/pay-as-you-go-using-superfluid/98/11 """ - chain_info: ChainInfo = get_chain(chain=chain) - if not chain_info.active: - msg = f"Chain : {chain} is not active for superfluid" + try: + chain_info: ChainInfo = get_chain(chain=chain) + if not chain_info.active: + msg = f"Chain : {chain} is not active for superfluid" + raise InvalidChainError(msg) + except ValueError: + msg = f"Chain : {chain} is invalid" raise InvalidChainError(msg) superfluid_instance = CFA_V1(chain_info.rpc, chain_info.chain_id) diff --git a/src/aleph/vm/orchestrator/tasks.py b/src/aleph/vm/orchestrator/tasks.py index 921a2265..1a0afed8 100644 --- a/src/aleph/vm/orchestrator/tasks.py +++ b/src/aleph/vm/orchestrator/tasks.py @@ -4,6 +4,7 @@ import math import time from collections.abc import AsyncIterable +from decimal import Decimal from typing import TypeVar import aiohttp @@ -175,10 +176,14 @@ async def monitor_payments(app: web.Application): # Check if the balance held in the wallet is sufficient stream tier resources for sender, chains in pool.get_executions_by_sender(payment_type=PaymentType.superfluid).items(): for chain, executions in chains.items(): - stream = await get_stream(sender=sender, receiver=settings.PAYMENT_RECEIVER_ADDRESS, chain=chain) - logger.debug( - f"Get stream flow from Sender {sender} to Receiver {settings.PAYMENT_RECEIVER_ADDRESS} of {stream}" - ) + try: + stream = await get_stream(sender=sender, 
receiver=settings.PAYMENT_RECEIVER_ADDRESS, chain=chain) + logger.debug( + f"Get stream flow from Sender {sender} to Receiver {settings.PAYMENT_RECEIVER_ADDRESS} of {stream}" + ) + except ValueError as error: + logger.error(f"Error found getting stream for chain {chain} and sender {sender}: {error}") + stream = Decimal(0) required_stream = await compute_required_flow(executions) logger.debug(f"Required stream for Sender {sender} executions: {required_stream}") diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 4268d9ed..1ffabd41 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -14,6 +14,7 @@ Payment, PaymentType, ) +from pydantic import parse_raw_as from aleph.vm.conf import settings from aleph.vm.controllers.firecracker.snapshot_manager import SnapshotManager @@ -241,8 +242,7 @@ async def load_persistent_executions(self): if execution.is_running: # TODO: Improve the way that we re-create running execution # Load existing GPUs assigned to VMs - for saved_gpu in saved_execution.gpus: - execution.gpus.append(HostGPU(pci_host=saved_gpu.pci_host)) + execution.gpus = parse_raw_as(List[HostGPU], saved_execution.gpus) # Load and instantiate the rest of resources and already assigned GPUs await execution.prepare() if self.network: diff --git a/src/aleph/vm/resources.py b/src/aleph/vm/resources.py index b237dc0e..767b6490 100644 --- a/src/aleph/vm/resources.py +++ b/src/aleph/vm/resources.py @@ -1,18 +1,23 @@ import subprocess -from dataclasses import dataclass from enum import Enum from typing import List, Optional from aleph_message.models import HashableModel -from pydantic import Extra, Field +from pydantic import BaseModel, Extra, Field -@dataclass -class HostGPU: - pci_host: str +class HostGPU(BaseModel): + """Host GPU properties detail.""" + + pci_host: str = Field(description="GPU PCI host address") + + class Config: + extra = Extra.forbid class GpuDeviceClass(str, Enum): + """GPU device class. 
Look at https://admin.pci-ids.ucw.cz/read/PD/03""" + VGA_COMPATIBLE_CONTROLLER = "0300" _3D_CONTROLLER = "0302" From b06594c68acb7cd4ccdd2e9526b4aa6d8e589d22 Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Fri, 6 Dec 2024 12:53:21 +0100 Subject: [PATCH 05/11] Fix: Expose GPU support option in `status/config` endpoint --- src/aleph/vm/orchestrator/views/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/aleph/vm/orchestrator/views/__init__.py b/src/aleph/vm/orchestrator/views/__init__.py index 4c9b3866..c627a612 100644 --- a/src/aleph/vm/orchestrator/views/__init__.py +++ b/src/aleph/vm/orchestrator/views/__init__.py @@ -347,6 +347,7 @@ async def status_public_config(request: web.Request): "ENABLE_QEMU_SUPPORT": settings.ENABLE_QEMU_SUPPORT, "INSTANCE_DEFAULT_HYPERVISOR": settings.INSTANCE_DEFAULT_HYPERVISOR, "ENABLE_CONFIDENTIAL_COMPUTING": settings.ENABLE_CONFIDENTIAL_COMPUTING, + "ENABLE_GPU_SUPPORT": settings.ENABLE_GPU_SUPPORT, }, }, dumps=dumps_for_json, From 70b7390bb59062b87105e11bc0559199d53fb689 Mon Sep 17 00:00:00 2001 From: "Andres D. 
Molins" Date: Mon, 9 Dec 2024 19:34:30 +0100 Subject: [PATCH 06/11] Fix: Applied some code review suggestions --- packaging/Makefile | 2 +- pyproject.toml | 2 +- src/aleph/vm/hypervisors/qemu/qemuvm.py | 10 +++++----- src/aleph/vm/orchestrator/chain.py | 6 +++++- src/aleph/vm/orchestrator/payment.py | 16 ++++------------ src/aleph/vm/orchestrator/tasks.py | 2 +- 6 files changed, 17 insertions(+), 21 deletions(-) diff --git a/packaging/Makefile b/packaging/Makefile index 898979b5..a601f1c9 100644 --- a/packaging/Makefile +++ b/packaging/Makefile @@ -16,7 +16,7 @@ debian-package-code: cp -r ../examples/data ./aleph-vm/opt/aleph-vm/examples/data mkdir -p ./aleph-vm/opt/aleph-vm/examples/volumes # Fixing this protobuf dependency version to avoid getting CI errors as version 5.29.0 have this compilation issue - pip3 install --progress-bar off --target ./aleph-vm/opt/aleph-vm/ 'aleph-message@git+https://github.com/aleph-im/aleph-message@andres-feature-add_gpu_requirement' 'eth-account==0.10' 'sentry-sdk==1.31.0' 'qmp==1.1.0' 'aleph-superfluid~=0.2.1' 'sqlalchemy[asyncio]>=2.0' 'aiosqlite==0.19.0' 'alembic==1.13.1' 'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' 'python-cpuid==0.1.0' 'solathon==1.0.2' 'protobuf==5.28.3' + pip3 install --progress-bar off --target ./aleph-vm/opt/aleph-vm/ 'aleph-message==0.6' 'eth-account==0.10' 'sentry-sdk==1.31.0' 'qmp==1.1.0' 'aleph-superfluid~=0.2.1' 'sqlalchemy[asyncio]>=2.0' 'aiosqlite==0.19.0' 'alembic==1.13.1' 'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' 'python-cpuid==0.1.0' 'solathon==1.0.2' 'protobuf==5.28.3' python3 -m compileall ./aleph-vm/opt/aleph-vm/ debian-package-resources: firecracker-bins vmlinux download-ipfs-kubo target/bin/sevctl diff --git a/pyproject.toml b/pyproject.toml index 31b767d2..1c934ff1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "aioredis==1.3.1", "aiosqlite==0.19", "alembic==1.13.1", - "aleph-message @ 
git+https://github.com/aleph-im/aleph-message@andres-feature-add_gpu_requirement", + "aleph-message==0.6", "aleph-superfluid~=0.2.1", "dbus-python==1.3.2", "eth-account~=0.10", diff --git a/src/aleph/vm/hypervisors/qemu/qemuvm.py b/src/aleph/vm/hypervisors/qemu/qemuvm.py index 9b95f187..36003e59 100644 --- a/src/aleph/vm/hypervisors/qemu/qemuvm.py +++ b/src/aleph/vm/hypervisors/qemu/qemuvm.py @@ -104,10 +104,6 @@ async def start( # Tell to put the output to std fd, so we can include them in the log "-serial", "stdio", - # Use host-phys-bits-limit argument for GPU support. TODO: Investigate how to get the correct bits size - # - "-cpu", - "host,host-phys-bits-limit=0x28", # Uncomment for debug # "-serial", "telnet:localhost:4321,server,nowait", # "-snapshot", # Do not save anything to disk @@ -145,7 +141,11 @@ def _get_host_volumes_args(self): return args def _get_gpu_args(self): - args = [] + args = [ + # Use host-phys-bits-limit argument for GPU support. TODO: Investigate how to get the correct bits size + "-cpu", + "host,host-phys-bits-limit=0x28", + ] for gpu in self.gpus: args += [ "-device", diff --git a/src/aleph/vm/orchestrator/chain.py b/src/aleph/vm/orchestrator/chain.py index 7321aa45..0b417439 100644 --- a/src/aleph/vm/orchestrator/chain.py +++ b/src/aleph/vm/orchestrator/chain.py @@ -60,9 +60,13 @@ def check_tokens(cls, values): } +class InvalidChainError(ValueError): + pass + + def get_chain(chain: str) -> ChainInfo: try: return STREAM_CHAINS[chain] except KeyError: msg = f"Unknown chain id for chain {chain}" - raise ValueError(msg) + raise InvalidChainError(msg) diff --git a/src/aleph/vm/orchestrator/payment.py b/src/aleph/vm/orchestrator/payment.py index 5c074ce0..f5a79bbc 100644 --- a/src/aleph/vm/orchestrator/payment.py +++ b/src/aleph/vm/orchestrator/payment.py @@ -13,7 +13,7 @@ from aleph.vm.models import VmExecution from aleph.vm.utils import to_normalized_address -from .chain import ChainInfo, get_chain +from .chain import ChainInfo, 
InvalidChainError, get_chain logger = logging.getLogger(__name__) @@ -91,22 +91,14 @@ class InvalidAddressError(ValueError): pass -class InvalidChainError(ValueError): - pass - - async def get_stream(sender: str, receiver: str, chain: str) -> Decimal: """ Get the stream of the user from the Superfluid API. See https://community.aleph.im/t/pay-as-you-go-using-superfluid/98/11 """ - try: - chain_info: ChainInfo = get_chain(chain=chain) - if not chain_info.active: - msg = f"Chain : {chain} is not active for superfluid" - raise InvalidChainError(msg) - except ValueError: - msg = f"Chain : {chain} is invalid" + chain_info: ChainInfo = get_chain(chain=chain) + if not chain_info.active: + msg = f"Chain : {chain} is not active for superfluid" raise InvalidChainError(msg) superfluid_instance = CFA_V1(chain_info.rpc, chain_info.chain_id) diff --git a/src/aleph/vm/orchestrator/tasks.py b/src/aleph/vm/orchestrator/tasks.py index 1a0afed8..593c5fcd 100644 --- a/src/aleph/vm/orchestrator/tasks.py +++ b/src/aleph/vm/orchestrator/tasks.py @@ -183,7 +183,7 @@ async def monitor_payments(app: web.Application): ) except ValueError as error: logger.error(f"Error found getting stream for chain {chain} and sender {sender}: {error}") - stream = Decimal(0) + continue required_stream = await compute_required_flow(executions) logger.debug(f"Required stream for Sender {sender} executions: {required_stream}") From d919116145092f59393cbf8779bc22ed88a21e65 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 9 Dec 2024 20:43:02 +0100 Subject: [PATCH 07/11] Add migration --- .../versions/5c6ae643c69b_add_gpu_table.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py diff --git a/src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py b/src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py new file mode 100644 index 00000000..db982716 --- 
/dev/null +++ b/src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py @@ -0,0 +1,28 @@ +"""add gpu table + +Revision ID: 5c6ae643c69b +Revises: bbb12a12372e +Create Date: 2024-12-09 19:40:19.279735 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "5c6ae643c69b" +down_revision = "bbb12a12372e" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("executions", sa.Column("gpus", sa.JSON(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("executions", "gpus") + # ### end Alembic commands ### From d847b05df18f2cc31f1a3c1ff12b7071c250bd24 Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Mon, 9 Dec 2024 22:04:29 +0100 Subject: [PATCH 08/11] Fix: Allow to use the notify endpoint for GPU instances also. 
--- src/aleph/vm/orchestrator/views/__init__.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/aleph/vm/orchestrator/views/__init__.py b/src/aleph/vm/orchestrator/views/__init__.py index c627a612..9f0d6b32 100644 --- a/src/aleph/vm/orchestrator/views/__init__.py +++ b/src/aleph/vm/orchestrator/views/__init__.py @@ -478,10 +478,14 @@ async def notify_allocation(request: web.Request): payment_type = message.content.payment and message.content.payment.type or PaymentType.hold is_confidential = message.content.environment.trusted_execution is not None - - if payment_type == PaymentType.hold and is_confidential: - # At the moment we will allow hold for PAYG - logger.debug("Confidential instance not using PAYG") + have_gpu = message.content.requirements and message.content.requirements.gpu is not None + + if payment_type == PaymentType.hold and (is_confidential or have_gpu): + # Log confidential and instances with GPU support + if is_confidential: + logger.debug(f"Confidential instance {item_hash} not using PAYG") + if have_gpu: + logger.debug(f"GPU Instance {item_hash} not using PAYG") user_balance = await payment.fetch_balance_of_address(message.sender) hold_price = await payment.fetch_execution_hold_price(item_hash) logger.debug(f"Address {message.sender} Balance: {user_balance}, Price: {hold_price}") From 7c78202554c2f6b2e452005505ae505beca2e68a Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Wed, 11 Dec 2024 17:44:54 +0100 Subject: [PATCH 09/11] Fix: Remove migration duplication.
--- ...> 0002_5c6ae643c69b_add_gpu_column_to_executions_table.py} | 4 ---- 1 file changed, 4 deletions(-) rename src/aleph/vm/orchestrator/migrations/versions/{5c6ae643c69b_add_gpu_table.py => 0002_5c6ae643c69b_add_gpu_column_to_executions_table.py} (69%) diff --git a/src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py b/src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py similarity index 69% rename from src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py rename to src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py index db982716..08c200ec 100644 --- a/src/aleph/vm/orchestrator/migrations/versions/5c6ae643c69b_add_gpu_table.py +++ b/src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py @@ -17,12 +17,8 @@ def upgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### op.add_column("executions", sa.Column("gpus", sa.JSON(), nullable=True)) - # ### end Alembic commands ### def downgrade() -> None: - # ### commands auto generated by Alembic - please adjust! ### op.drop_column("executions", "gpus") - # ### end Alembic commands ### From 73035874023ec2caae03288df7827f7793bd1c9c Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Wed, 11 Dec 2024 19:33:19 +0100 Subject: [PATCH 10/11] Fix: Changes DB initialization order to ensure that DB always exists before running the migrations. --- src/aleph/vm/orchestrator/cli.py | 7 ++++--- src/aleph/vm/orchestrator/supervisor.py | 4 ---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/aleph/vm/orchestrator/cli.py b/src/aleph/vm/orchestrator/cli.py index bbae396d..740733e6 100644 --- a/src/aleph/vm/orchestrator/cli.py +++ b/src/aleph/vm/orchestrator/cli.py @@ -167,9 +167,6 @@ async def benchmark(runs: int): """Measure program performance by immediately running the supervisor with fake requests. 
""" - engine = metrics.setup_engine() - await metrics.create_tables(engine) - ref = ItemHash("cafecafecafecafecafecafecafecafecafecafecafecafecafecafecafecafe") settings.FAKE_DATA_PROGRAM = settings.BENCHMARK_FAKE_DATA_PROGRAM @@ -357,6 +354,10 @@ def main(): settings.check() logger.debug("Initialising the DB...") + # Check and create execution database + engine = metrics.setup_engine() + asyncio.run(metrics.create_tables(engine)) + # After creating it run the DB migrations asyncio.run(run_async_db_migrations()) logger.debug("DB up to date.") diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index a5ca999a..ae643629 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -20,7 +20,6 @@ from aleph.vm.sevclient import SevClient from aleph.vm.version import __version__ -from .metrics import create_tables, setup_engine from .resources import about_certificates, about_system_usage from .tasks import ( start_payment_monitoring_task, @@ -151,9 +150,6 @@ def run(): """Run the VM Supervisor.""" settings.check() - engine = setup_engine() - asyncio.run(create_tables(engine)) - loop = asyncio.new_event_loop() pool = VmPool(loop) pool.setup() From 11141af1199cc0f9dea83a7b27e2951e0d7eb62a Mon Sep 17 00:00:00 2001 From: "Andres D. Molins" Date: Wed, 11 Dec 2024 20:26:28 +0100 Subject: [PATCH 11/11] Fix: Updated migration to only insert the column if isn't inside. 
--- ...643c69b_add_gpu_column_to_executions_table.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py b/src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py index 08c200ec..4b739323 100644 --- a/src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py +++ b/src/aleph/vm/orchestrator/migrations/versions/0002_5c6ae643c69b_add_gpu_column_to_executions_table.py @@ -10,6 +10,11 @@ from alembic import op # revision identifiers, used by Alembic. +from sqlalchemy import create_engine +from sqlalchemy.engine import reflection + +from aleph.vm.conf import make_db_url + revision = "5c6ae643c69b" down_revision = "bbb12a12372e" branch_labels = None @@ -17,7 +22,16 @@ def upgrade() -> None: - op.add_column("executions", sa.Column("gpus", sa.JSON(), nullable=True)) + engine = create_engine(make_db_url()) + inspector = reflection.Inspector.from_engine(engine) + + # The table already exists on most CRNs. + tables = inspector.get_table_names() + if "executions" in tables: + columns = inspector.get_columns("executions") + column_names = [c["name"] for c in columns] + if "gpus" not in column_names: + op.add_column("executions", sa.Column("gpus", sa.JSON(), nullable=True)) def downgrade() -> None: