Skip to content

Commit

Permalink
Fix: Added GPU resources to the main pool and exposed it on the endpo…
Browse files Browse the repository at this point in the history
…int.
  • Loading branch information
nesitor committed Dec 2, 2024
1 parent ae278ce commit 68c0d88
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 70 deletions.
7 changes: 7 additions & 0 deletions src/aleph/vm/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,11 @@ class Settings(BaseSettings):

CONFIDENTIAL_SESSION_DIRECTORY: Path = Field(None, description="Default to EXECUTION_ROOT/sessions")

ENABLE_GPU_SUPPORT: bool = Field(
default=False,
description="Enable GPU pass-through support to VMs, only allowed for QEmu hypervisor",
)

# Tests on programs

FAKE_DATA_PROGRAM: Path | None = None
Expand Down Expand Up @@ -391,6 +396,8 @@ def check(self):
# assert check_amd_sev_snp_supported(), "SEV-SNP feature isn't enabled, enable it in BIOS"
assert self.ENABLE_QEMU_SUPPORT, "Qemu Support is needed for confidential computing and it's disabled, "
"enable it setting the env variable `ENABLE_QEMU_SUPPORT=True` in configuration"
if self.ENABLE_GPU_SUPPORT:
assert self.ENABLE_QEMU_SUPPORT, "Qemu Support is needed for GPU support and it's disabled, "

def setup(self):
"""Setup the environment defined by the settings. Call this method after loading the settings."""
Expand Down
83 changes: 13 additions & 70 deletions src/aleph/vm/orchestrator/resources.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
import math
import subprocess
from datetime import datetime, timezone
from enum import Enum
from functools import lru_cache
from typing import List, Literal, Optional
from typing import List, Optional

import cpuinfo
import psutil
from aiohttp import web
from aleph_message.models import ItemHash
from aleph_message.models.abstract import HashableModel
from aleph_message.models.execution.environment import CpuProperties
from pydantic import BaseModel, Extra, Field
from pydantic import BaseModel, Field

from aleph.vm.conf import settings
from aleph.vm.pool import VmPool
from aleph.vm.resources import GpuProperties
from aleph.vm.sevclient import SevClient
from aleph.vm.utils import (
check_amd_sev_es_supported,
Expand Down Expand Up @@ -73,36 +72,10 @@ class UsagePeriod(BaseModel):
duration_seconds: float


class GpuDeviceClass(str, Enum):
VGA_COMPATIBLE_CONTROLLER = "0300"
_3D_CONTROLLER = "0302"


class GpuProperties(BaseModel):
"""GPU properties."""

vendor: str = Field(description="GPU vendor name")
device_name: str = Field(description="GPU vendor card name")
device_class: GpuDeviceClass = Field(
description="GPU device class. Look at https://admin.pci-ids.ucw.cz/read/PD/03"
)
device_id: str = Field(description="GPU vendor & device ids")

class Config:
extra = Extra.forbid


def is_gpu_device_class(device_class: str) -> bool:
try:
GpuDeviceClass(device_class)
return True
except ValueError:
return False


class MachineProperties(BaseModel):
    """Machine properties returned by get_machine_properties()."""

    # CPU description built from the cpuinfo data.
    cpu: CpuProperties
    # GPUs known to the VM pool (filled from `pool.gpus`).
    gpu: Optional[List[GpuProperties]]
    # GPUs reported by `pool.get_available_gpus()`.
    available_gpus: Optional[List[GpuProperties]]


class MachineUsage(BaseModel):
Expand All @@ -114,48 +87,17 @@ class MachineUsage(BaseModel):
active: bool = True


def parse_gpu_device_info(line) -> Optional[GpuProperties]:
"""Parse GPU device info from a line of lspci output."""

device = line.split(' "', maxsplit=1)[1]
device_class, device_vendor, device_info = device.split('" "', maxsplit=2)
device_class = device_class.split("[", maxsplit=1)[1][:-1]
vendor, vendor_id = device_vendor.split(" [", maxsplit=1)
device_name = device_info.split('"', maxsplit=1)[0]
device_name, model_id = device_name.split(" [", maxsplit=1)
device_id = f"{vendor_id[:-1]}:{model_id[:-1]}"

return (
GpuProperties(
vendor=vendor,
device_name=device_name,
device_class=device_class,
device_id=device_id,
)
if is_gpu_device_class(device_class)
else None
)


def get_gpu_info() -> Optional[List[GpuProperties]]:
"""Get GPU info using lspci command."""

result = subprocess.run(["lspci", "-mmnnn"], capture_output=True, text=True, check=True)
gpu_devices = list(
{device for line in result.stdout.split("\n") if line and (device := parse_gpu_device_info(line)) is not None}
)
return gpu_devices if gpu_devices else None


@lru_cache
def get_machine_properties() -> MachineProperties:
def get_machine_properties(request: web.Request) -> MachineProperties:
"""Fetch machine properties such as architecture, CPU vendor, ...
These should not change while the supervisor is running.
In the future, some properties may have to be fetched from within a VM.
"""
cpu_info = cpuinfo.get_cpu_info() # Slow
gpu_info = get_gpu_info()
pool: VmPool = request.app["vm_pool"]
gpus = pool.gpus
available_gpus = pool.get_available_gpus()
return MachineProperties(
cpu=CpuProperties(
architecture=cpu_info.get("raw_arch_string", cpu_info.get("arch_string_raw")),
Expand All @@ -171,12 +113,13 @@ def get_machine_properties() -> MachineProperties:
)
),
),
gpu=gpu_info,
gpu=gpus,
available_gpus=available_gpus,
)


@cors_allow_all
async def about_system_usage(_: web.Request):
async def about_system_usage(request: web.Request):
"""Public endpoint to expose information about the system usage."""
period_start = datetime.now(timezone.utc).replace(second=0, microsecond=0)

Expand All @@ -198,7 +141,7 @@ async def about_system_usage(_: web.Request):
start_timestamp=period_start,
duration_seconds=60,
),
properties=get_machine_properties(),
properties=get_machine_properties(request),
)

return web.json_response(text=usage.json(exclude_none=True))
Expand Down
10 changes: 10 additions & 0 deletions src/aleph/vm/pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from aleph.vm.controllers.firecracker.snapshot_manager import SnapshotManager
from aleph.vm.network.hostnetwork import Network, make_ipv6_allocator
from aleph.vm.orchestrator.metrics import get_execution_records
from aleph.vm.resources import get_gpu_info
from aleph.vm.systemd import SystemDManager
from aleph.vm.utils import get_message_executable_content
from aleph.vm.vm_type import VmType
Expand All @@ -41,6 +42,7 @@ class VmPool:
snapshot_manager: SnapshotManager | None = None
systemd_manager: SystemDManager
creation_lock: asyncio.Lock
gpus: List[GpuProperties] = []

def __init__(self, loop: asyncio.AbstractEventLoop):
self.executions = {}
Expand Down Expand Up @@ -78,6 +80,10 @@ def setup(self) -> None:
logger.debug("Initializing SnapshotManager ...")
self.snapshot_manager.run_in_thread()

if settings.ENABLE_GPU_SUPPORT:
logger.debug("Detecting GPU devices ...")
self.available_gpus = get_gpu_info()

def teardown(self) -> None:
"""Stop the VM pool and the network properly."""
if self.network:
Expand Down Expand Up @@ -281,6 +287,10 @@ def get_instance_executions(self) -> Iterable[VmExecution]:
)
return executions or []

def get_available_gpus(self) -> Iterable[GpuProperties]:
    """Return the GPUs detected on the host, or an empty list when there are none.

    In the visible code ``available_gpus`` is only assigned by ``setup()`` when
    ``settings.ENABLE_GPU_SUPPORT`` is enabled, and ``get_gpu_info()`` may
    return ``None``. Both the missing attribute and ``None`` are therefore
    mapped to an empty list instead of raising ``AttributeError``.
    """
    return getattr(self, "available_gpus", None) or []

def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[str, list[VmExecution]]]:
"""Return all executions of the given type, grouped by sender and by chain."""
executions_by_sender: dict[str, dict[str, list[VmExecution]]] = {}
Expand Down
96 changes: 96 additions & 0 deletions src/aleph/vm/resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import subprocess
from enum import Enum
from typing import Optional, List

from pydantic import BaseModel, Field, Extra


class GpuDeviceClass(str, Enum):
    """PCI device class codes that are accepted as GPU devices.

    Values are the four-character class codes as they appear between brackets
    in ``lspci -nn`` output. See https://admin.pci-ids.ucw.cz/read/PD/03
    """

    VGA_COMPATIBLE_CONTROLLER = "0300"
    # Leading underscore presumably because an identifier cannot start with a digit.
    _3D_CONTROLLER = "0302"


class GpuProperties(BaseModel):
    """GPU properties.

    Describes one GPU device as parsed from a line of ``lspci`` output.
    All fields are declared without a default value.
    """

    # Vendor display name.
    vendor: str = Field(description="GPU vendor name")
    # Device (card) name as printed by lspci.
    device_name: str = Field(description="GPU vendor card name")
    # PCI class code; must be one of the GpuDeviceClass values.
    device_class: GpuDeviceClass = Field(
        description="GPU device class. Look at https://admin.pci-ids.ucw.cz/read/PD/03"
    )
    # PCI slot of the device on the host (first column of an lspci line).
    pci_host: str = Field(description="Host PCI bus for this device")
    # Formatted as "<vendor_id>:<model_id>".
    device_id: str = Field(description="GPU vendor & device ids")

    class Config:
        # Reject any field that is not declared above.
        extra = Extra.forbid


def is_gpu_device_class(device_class: str) -> bool:
    """Tell whether a PCI class code is one of the values of GpuDeviceClass."""
    try:
        GpuDeviceClass(device_class)
    except ValueError:
        # The enum lookup rejected the code: not a GPU class.
        return False
    else:
        return True


def get_vendor_name(vendor_id: str) -> str:
    """Translate a PCI vendor id into the vendor display name.

    Args:
        vendor_id: Vendor id as a lowercase hexadecimal string, e.g. ``"10de"``.

    Returns:
        ``"NVIDIA"``, ``"AMD"`` or ``"Intel"``.

    Raises:
        ValueError: If the vendor id is not one of the supported vendors.
    """
    supported_vendors = (
        ("10de", "NVIDIA"),
        ("1002", "AMD"),
        ("8086", "Intel"),
    )
    for known_id, display_name in supported_vendors:
        if vendor_id == known_id:
            return display_name
    raise ValueError("Device vendor not compatible")


def is_kernel_enabled_gpu(pci_host: str) -> bool:
    """Tell whether the PCI device at `pci_host` is bound to the vfio-pci kernel driver.

    Only devices using specifically the vfio-pci kernel driver are compatible
    with QEmu virtualization.

    Raises:
        subprocess.CalledProcessError: If `lspci` exits with a non-zero status.
    """
    # `-k` makes lspci list the kernel driver in use for the selected device.
    lspci_command = ["lspci", "-s", pci_host, "-nnk"]
    completed = subprocess.run(lspci_command, capture_output=True, text=True, check=True)
    # The whole line has to match, including the leading tab.
    return "\tKernel driver in use: vfio-pci" in completed.stdout.split("\n")


def parse_gpu_device_info(line: str) -> Optional[GpuProperties]:
    """Parse GPU device info from a line of lspci output.

    The line is expected in the machine-readable layout printed by
    ``lspci -mmnnn``::

        <slot> "<class name> [cccc]" "<vendor name> [vvvv]" "<device name> [dddd]" ...

    Args:
        line: One non-empty line of ``lspci -mmnnn`` output.

    Returns:
        The GPU description, or ``None`` when the device class is not a GPU
        class or the device is not bound to the ``vfio-pci`` kernel driver.

    Raises:
        ValueError: If the line does not follow the expected layout, or if the
            vendor id is not supported by ``get_vendor_name``.
        IndexError: If the class field carries no bracketed code.
        subprocess.CalledProcessError: If the per-device ``lspci`` call made by
            ``is_kernel_enabled_gpu`` fails.
    """
    pci_host, device = line.split(' "', maxsplit=1)
    device_class, device_vendor, device_info = device.split('" "', maxsplit=2)

    # The numeric id is always the LAST bracketed group of a field; names may
    # contain brackets of their own (e.g. "GA102 [GeForce RTX 3090] [2204]",
    # "Advanced Micro Devices, Inc. [AMD/ATI] [1002]"), so split from the right.
    device_class = device_class.rsplit("[", maxsplit=1)[1][:-1]

    # Cheap in-memory check first, so that the lspci subprocess below is only
    # spawned for GPU-class devices instead of once per PCI device.
    if not is_gpu_device_class(device_class):
        return None

    if not is_kernel_enabled_gpu(pci_host):
        return None

    _, vendor_id = device_vendor.rsplit(" [", maxsplit=1)
    vendor_id = vendor_id[:-1]
    vendor_name = get_vendor_name(vendor_id)

    # Drop everything after the closing quote of the device field.
    device_name = device_info.split('"', maxsplit=1)[0]
    device_name, model_id = device_name.rsplit(" [", maxsplit=1)
    model_id = model_id[:-1]
    device_id = f"{vendor_id}:{model_id}"

    return GpuProperties(
        vendor=vendor_name,
        device_name=device_name,
        device_class=device_class,
        # Declared without a default on GpuProperties, so it must be passed.
        pci_host=pci_host,
        device_id=device_id,
    )


def get_gpu_info() -> Optional[List[GpuProperties]]:
    """Get GPU info using lspci command.

    Runs ``lspci -mmnnn`` and parses every non-empty output line with
    ``parse_gpu_device_info``.

    Returns:
        The distinct GPU devices found, in lspci order, or ``None`` when no
        GPU device was found.

    Raises:
        subprocess.CalledProcessError: If ``lspci`` exits with a non-zero status.
    """
    result = subprocess.run(["lspci", "-mmnnn"], capture_output=True, text=True, check=True)

    gpu_devices: List[GpuProperties] = []
    for line in result.stdout.split("\n"):
        if not line:
            continue
        device = parse_gpu_device_info(line)
        # De-duplicate by equality rather than through a set: pydantic models
        # are not hashable unless declared frozen, so a set would raise TypeError.
        if device is not None and device not in gpu_devices:
            gpu_devices.append(device)

    return gpu_devices or None

0 comments on commit 68c0d88

Please sign in to comment.