From 089ccef903648db0479b1203c9f6c13d0eeea62a Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 31 Oct 2024 14:48:19 +0100 Subject: [PATCH 1/6] Make vm_id assignment more robust (#714) Remove the counter way to assign a vm_id as it didn't work reliably Jira ticket: ALEPH-272 That method was broken when persistent instances were loaded at start up. Since the "new" feature that allows persistent instance across aleph-vm reboot if one was started then aleph-vm was stopped and restarted the counter method could reassign the ip and break the existing vm's. Secondary reason was that the feature wasn't working properly with the default settings, as `2**available_bits` returned 1. So that code path was only used if the node owner tweaked some undocumented settings making it hard to identify and debug in prod nodes. --- src/aleph/vm/pool.py | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 3ecf500e..025bfe45 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -28,15 +28,13 @@ class VmPool: - """Pool of VMs already started and used to decrease response time. + """Pool of existing VMs + + For function VM we keep the VM a while after they have run, so we can reuse them and thus decrease response time. After running, a VM is saved for future reuse from the same function during a configurable duration. - - The counter is used by the VMs to set their tap interface name and the corresponding - IPv4 subnet. 
""" - counter: int # Used to provide distinct ids to network interfaces executions: dict[ItemHash, VmExecution] message_cache: dict[str, ExecutableMessage] network: Network | None @@ -45,7 +43,6 @@ class VmPool: creation_lock: asyncio.Lock def __init__(self, loop: asyncio.AbstractEventLoop): - self.counter = settings.START_ID_INDEX self.executions = {} self.message_cache = {} @@ -150,25 +147,13 @@ def get_unique_vm_id(self) -> int: This identifier is used to name the network interface and in the IPv4 range dedicated to the VM. """ - _, network_range = settings.IPV4_ADDRESS_POOL.split("/") - available_bits = int(network_range) - settings.IPV4_NETWORK_PREFIX_LENGTH - self.counter += 1 - if self.counter < 2**available_bits: - # In common cases, use the counter itself as the vm_id. This makes it - # easier to debug. - return self.counter - else: - # The value of the counter is too high and some functions such as the - # IPv4 range dedicated to the VM do not support such high values. - # - # We therefore recycle vm_id values from executions that are not running - # anymore. - currently_used_vm_ids = {execution.vm_id for execution in self.executions.values()} - for i in range(settings.START_ID_INDEX, 255**2): - if i not in currently_used_vm_ids: - return i - msg = "No available value for vm_id." - raise ValueError(msg) + # Take the first id that is not already taken + currently_used_vm_ids = {execution.vm_id for execution in self.executions.values()} + for i in range(settings.START_ID_INDEX, 255**2): + if i not in currently_used_vm_ids: + return i + msg = "No available value for vm_id." + raise ValueError(msg) def get_running_vm(self, vm_hash: ItemHash) -> VmExecution | None: """Return a running VM or None. Disables the VM expiration task.""" From 5acbdef9bf0e8937a2ebe09d65dd8fda8fc77e17 Mon Sep 17 00:00:00 2001 From: nesitor Date: Mon, 4 Nov 2024 17:00:30 +0100 Subject: [PATCH 2/6] Implement new EVM chains (#717) * Feature: Implement new EVM chains. 
* FIX: Update Makefile with new dependency. * Fix: Updated to proper released package version of aleph_message dependency. --------- Co-authored-by: Andres D. Molins --- packaging/Makefile | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packaging/Makefile b/packaging/Makefile index 0d1c4dcb..cc217ce3 100644 --- a/packaging/Makefile +++ b/packaging/Makefile @@ -15,7 +15,7 @@ debian-package-code: cp ../examples/instance_message_from_aleph.json ./aleph-vm/opt/aleph-vm/examples/instance_message_from_aleph.json cp -r ../examples/data ./aleph-vm/opt/aleph-vm/examples/data mkdir -p ./aleph-vm/opt/aleph-vm/examples/volumes - pip3 install --progress-bar off --target ./aleph-vm/opt/aleph-vm/ 'aleph-message==0.4.9' 'eth-account==0.10' 'sentry-sdk==1.31.0' 'qmp==1.1.0' 'aleph-superfluid~=0.2.1' 'sqlalchemy[asyncio]>=2.0' 'aiosqlite==0.19.0' 'alembic==1.13.1' 'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' 'python-cpuid==0.1.0' 'solathon==1.0.2' + pip3 install --progress-bar off --target ./aleph-vm/opt/aleph-vm/ 'aleph-message==0.5.0' 'eth-account==0.10' 'sentry-sdk==1.31.0' 'qmp==1.1.0' 'aleph-superfluid~=0.2.1' 'sqlalchemy[asyncio]>=2.0' 'aiosqlite==0.19.0' 'alembic==1.13.1' 'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' 'python-cpuid==0.1.0' 'solathon==1.0.2' python3 -m compileall ./aleph-vm/opt/aleph-vm/ debian-package-resources: firecracker-bins vmlinux download-ipfs-kubo target/bin/sevctl @@ -144,6 +144,6 @@ repository-noble: cd ./repositories/noble && reprepro -Vb . includedeb noble ../../target/aleph-vm.ubuntu-24.04.deb && cd .. 
repositories: repository-bookworm repository-jammy repository-noble - + all-podman: all-podman-debian-12 all-podman-ubuntu-2204 all-podman-ubuntu-2404 repositories diff --git a/pyproject.toml b/pyproject.toml index 5c4efc54..50c314c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "aioredis==1.3.1", "aiosqlite==0.19", "alembic==1.13.1", - "aleph-message==0.4.9", + "aleph-message==0.5", "aleph-superfluid~=0.2.1", "dbus-python==1.3.2", "eth-account~=0.10", From b113406a2e79ac04c00c0fbc1556ac590457a9cb Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 5 Nov 2024 15:30:46 +0100 Subject: [PATCH 3/6] Feature: allow IPv6 DNS (#455) * Feature: allow IPv6 DNS Problem IPv6 DNS were automatically filtered when detected from resolvectl Solution: Nameservers are now split into ipv4 and ipv6 and can be passed to the VM accordingly At the moment we pass them if the ipv6 parameter is present on the tap interface but we need a more robust detection method * Display proper env conf --- src/aleph/vm/conf.py | 36 +++++++++++-------- .../vm/controllers/firecracker/instance.py | 7 +++- .../supervisor/test_resolvectl_dns_servers.py | 8 +---- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/src/aleph/vm/conf.py b/src/aleph/vm/conf.py index a1737b2b..b68ff9e8 100644 --- a/src/aleph/vm/conf.py +++ b/src/aleph/vm/conf.py @@ -74,17 +74,6 @@ def resolvectl_dns_servers(interface: str) -> Iterable[str]: yield server.strip() -def resolvectl_dns_servers_ipv4(interface: str) -> Iterable[str]: - """ - Use resolvectl to list available IPv4 DNS servers. - VMs only support IPv4 networking for now, we must exclude IPv6 DNS from their config. 
- """ - for server in resolvectl_dns_servers(interface): - ip_addr = ipaddress.ip_address(server) - if isinstance(ip_addr, ipaddress.IPv4Address): - yield server - - def get_default_interface() -> str | None: """Returns the default network interface""" with open("/proc/net/route") as f: @@ -102,7 +91,7 @@ def obtain_dns_ips(dns_resolver: DnsResolver, network_interface: str) -> list[st # Use a try-except approach since resolvectl can be present but disabled and raise the following # "Failed to get global data: Unit dbus-org.freedesktop.resolve1.service not found." try: - return list(resolvectl_dns_servers_ipv4(interface=network_interface)) + return list(resolvectl_dns_servers(interface=network_interface)) except (FileNotFoundError, CalledProcessError) as error: if Path("/etc/resolv.conf").exists(): return list(etc_resolv_conf_dns_servers()) @@ -114,7 +103,7 @@ def obtain_dns_ips(dns_resolver: DnsResolver, network_interface: str) -> list[st return list(etc_resolv_conf_dns_servers()) elif dns_resolver == DnsResolver.resolvectl: - return list(resolvectl_dns_servers_ipv4(interface=network_interface)) + return list(resolvectl_dns_servers(interface=network_interface)) else: msg = "No DNS resolve defined, this should never happen." 
@@ -180,8 +169,13 @@ class Settings(BaseSettings): description="Use the Neighbor Discovery Protocol Proxy to respond to Router Solicitation for instances on IPv6", ) - DNS_RESOLUTION: DnsResolver | None = DnsResolver.detect + DNS_RESOLUTION: DnsResolver | None = Field( + default=DnsResolver.detect, + description="Method used to resolve the dns server if DNS_NAMESERVERS is not present.", + ) DNS_NAMESERVERS: list[str] | None = None + DNS_NAMESERVERS_IPV4: list[str] | None + DNS_NAMESERVERS_IPV6: list[str] | None FIRECRACKER_PATH = Path("/opt/firecracker/firecracker") JAILER_PATH = Path("/opt/firecracker/jailer") @@ -439,6 +433,18 @@ def setup(self): network_interface=self.NETWORK_INTERFACE, ) + if not self.DNS_NAMESERVERS_IPV4: + self.DNS_NAMESERVERS_IPV4 = [] + if not self.DNS_NAMESERVERS_IPV6: + self.DNS_NAMESERVERS_IPV6 = [] + if self.DNS_NAMESERVERS: + for server in self.DNS_NAMESERVERS: + ip_addr = ipaddress.ip_address(server) + if isinstance(ip_addr, ipaddress.IPv4Address): + self.DNS_NAMESERVERS_IPV4.append(server) + if isinstance(ip_addr, ipaddress.IPv6Address): + self.DNS_NAMESERVERS_IPV6.append(server) + if not settings.ENABLE_QEMU_SUPPORT: # If QEmu is not supported, ignore the setting and use Firecracker by default settings.INSTANCE_DEFAULT_HYPERVISOR = HypervisorType.firecracker @@ -456,7 +462,7 @@ def display(self) -> str: else: attributes[attr] = getattr(self, attr) - return "\n".join(f"{attribute:<27} = {value}" for attribute, value in attributes.items()) + return "\n".join(f"{self.Config.env_prefix}{attribute} = {value}" for attribute, value in attributes.items()) def __init__( self, diff --git a/src/aleph/vm/controllers/firecracker/instance.py b/src/aleph/vm/controllers/firecracker/instance.py index f8c33b07..da423ef7 100644 --- a/src/aleph/vm/controllers/firecracker/instance.py +++ b/src/aleph/vm/controllers/firecracker/instance.py @@ -198,6 +198,11 @@ def _create_network_file(self) -> bytes: ipv6 = self.get_ipv6() ipv6_gateway = 
self.get_ipv6_gateway() + nameservers_ip = [] + if ip: + nameservers_ip = settings.DNS_NAMESERVERS_IPV4 + if ipv6: + nameservers_ip += settings.DNS_NAMESERVERS_IPV6 network = { "ethernets": { "eth0": { @@ -207,7 +212,7 @@ def _create_network_file(self) -> bytes: "gateway4": route, "gateway6": ipv6_gateway, "nameservers": { - "addresses": settings.DNS_NAMESERVERS, + "addresses": nameservers_ip, }, }, }, diff --git a/tests/supervisor/test_resolvectl_dns_servers.py b/tests/supervisor/test_resolvectl_dns_servers.py index 0daaf03c..0af9b6fb 100644 --- a/tests/supervisor/test_resolvectl_dns_servers.py +++ b/tests/supervisor/test_resolvectl_dns_servers.py @@ -2,7 +2,7 @@ import os from unittest import mock -from aleph.vm.conf import resolvectl_dns_servers, resolvectl_dns_servers_ipv4 +from aleph.vm.conf import resolvectl_dns_servers os.environ["ALEPH_VM_ALLOW_VM_NETWORKING"] = "False" @@ -17,9 +17,6 @@ def test_resolvectl(): dns_servers = set(resolvectl_dns_servers("eth0")) assert dns_servers == servers - dns_servers_ipv4 = set(resolvectl_dns_servers_ipv4("eth0")) - assert dns_servers_ipv4 == servers - def test_resolvectl_ipv6(): with mock.patch( @@ -31,6 +28,3 @@ def test_resolvectl_ipv6(): dns_servers = set(resolvectl_dns_servers("eth0")) assert dns_servers == ipv4_servers | ipv6_servers - - dns_servers_ipv4 = set(resolvectl_dns_servers_ipv4("eth0")) - assert dns_servers_ipv4 == ipv4_servers From 662c0c00b50b1feef83a849038f0c6abd922c773 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Wed, 6 Nov 2024 14:54:42 +0100 Subject: [PATCH 4/6] Problem: IGNORE_TRACEBACK_FROM_DIAGNOSTICS broken (#713) Symptom: The CustomError from the diagnostics VM was printed even if IGNORE_TRACEBACK_FROM_DIAGNOSTICS was set to True (the default) Analysis: This was caused by the refactoring of the fastapi_example/main.py file done in fe9235ac658915eea20d5371ae45cedabe1f7b17 Which changed the output used to detect the error to catch Solution: Fix detection string --- 
src/aleph/vm/orchestrator/run.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/aleph/vm/orchestrator/run.py b/src/aleph/vm/orchestrator/run.py index a2a2a824..9c2a8b29 100644 --- a/src/aleph/vm/orchestrator/run.py +++ b/src/aleph/vm/orchestrator/run.py @@ -150,9 +150,11 @@ async def run_code_on_request(vm_hash: ItemHash, path: str, pool: VmPool, reques # The Diagnostics VM checks for the proper handling of exceptions. # This fills the logs with noisy stack traces, so we ignore this specific error. - ignored_error = 'raise CustomError("Whoops")' + ignored_errors = ['raise CustomError("Whoops")', "main.CustomError: Whoops"] - if settings.IGNORE_TRACEBACK_FROM_DIAGNOSTICS and ignored_error in result["traceback"]: + if settings.IGNORE_TRACEBACK_FROM_DIAGNOSTICS and any( + ignored_error in result["traceback"] for ignored_error in ignored_errors + ): logger.debug('Ignored traceback from CustomError("Whoops")') else: logger.warning(result["traceback"]) From 7461a4958c123bc8a8890d18b39374998d55d9f9 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Fri, 8 Nov 2024 15:17:09 +0100 Subject: [PATCH 5/6] Problem: error Too many open files (#720) Jira ticket: ALEPH-298 some CRN failed on any action with error OSError: [Errno 24] Too many open files: Solution: Properly close stream to journald when the VM is stopped --- .../vm/hypervisors/firecracker/microvm.py | 41 +++++++++++++------ src/aleph/vm/hypervisors/qemu/qemuvm.py | 17 +++++--- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/src/aleph/vm/hypervisors/firecracker/microvm.py b/src/aleph/vm/hypervisors/firecracker/microvm.py index 7a8fe787..d357fb6e 100644 --- a/src/aleph/vm/hypervisors/firecracker/microvm.py +++ b/src/aleph/vm/hypervisors/firecracker/microvm.py @@ -13,7 +13,7 @@ from pathlib import Path from pwd import getpwnam from tempfile import NamedTemporaryFile -from typing import Any +from typing import Any, BinaryIO import msgpack from aleph_message.models 
import ItemHash @@ -93,6 +93,8 @@ class MicroVM: mounted_rootfs: Path | None = None _unix_socket: Server | None = None enable_log: bool + journal_stdout: BinaryIO | int | None = None + journal_stderr: BinaryIO | int | None = None def __repr__(self): return f"" @@ -219,19 +221,19 @@ async def start_firecracker(self, config_path: Path) -> asyncio.subprocess.Proce str(config_path), ) if self.enable_log: - journal_stdout = journal.stream(self._journal_stdout_name) - journal_stderr = journal.stream(self._journal_stderr_name) + self.journal_stdout = journal.stream(self._journal_stdout_name) + self.journal_stderr = journal.stream(self._journal_stderr_name) else: - journal_stdout = asyncio.subprocess.DEVNULL - journal_stderr = asyncio.subprocess.DEVNULL + self.journal_stdout = asyncio.subprocess.DEVNULL + self.journal_stderr = asyncio.subprocess.DEVNULL logger.debug(" ".join(options)) self.proc = await asyncio.create_subprocess_exec( *options, stdin=asyncio.subprocess.PIPE, - stdout=journal_stdout, - stderr=journal_stderr, + stdout=self.journal_stdout, + stderr=self.journal_stderr, ) return self.proc @@ -252,11 +254,11 @@ async def start_jailed_firecracker(self, config_path: Path) -> asyncio.subproces self.config_file_path = config_path if self.enable_log: - journal_stdout = journal.stream(self._journal_stdout_name) - journal_stderr = journal.stream(self._journal_stderr_name) + self.journal_stdout = journal.stream(self._journal_stdout_name) + self.journal_stderr = journal.stream(self._journal_stderr_name) else: - journal_stdout = asyncio.subprocess.DEVNULL - journal_stderr = asyncio.subprocess.DEVNULL + self.journal_stdout = asyncio.subprocess.DEVNULL + self.journal_stderr = asyncio.subprocess.DEVNULL options = ( str(self.jailer_bin_path), @@ -280,8 +282,8 @@ async def start_jailed_firecracker(self, config_path: Path) -> asyncio.subproces self.proc = await asyncio.create_subprocess_exec( *options, stdin=asyncio.subprocess.PIPE, - stdout=journal_stdout, - 
stderr=journal_stderr, + stdout=self.journal_stdout, + stderr=self.journal_stderr, ) return self.proc @@ -480,6 +482,19 @@ async def teardown(self): if self.stderr_task: self.stderr_task.cancel() + if ( + self.journal_stdout + and self.journal_stdout != asyncio.subprocess.DEVNULL + and hasattr(self.journal_stdout, "close") + ): + self.journal_stdout.close() + if ( + self.journal_stderr + and self.journal_stderr != asyncio.subprocess.DEVNULL + and hasattr(self.journal_stderr, "close") + ): + self.journal_stderr.close() + # Clean mounted block devices if self.mounted_rootfs: logger.debug("Waiting for one second for the VM to shutdown") diff --git a/src/aleph/vm/hypervisors/qemu/qemuvm.py b/src/aleph/vm/hypervisors/qemu/qemuvm.py index 1d707c2a..5949fbdc 100644 --- a/src/aleph/vm/hypervisors/qemu/qemuvm.py +++ b/src/aleph/vm/hypervisors/qemu/qemuvm.py @@ -2,7 +2,7 @@ from asyncio.subprocess import Process from dataclasses import dataclass from pathlib import Path -from typing import TextIO +from typing import BinaryIO, TextIO import qmp from systemd import journal @@ -28,6 +28,8 @@ class QemuVM: interface_name: str qemu_process: Process | None = None host_volumes: list[HostVolume] + journal_stdout: TextIO | None + journal_stderr: TextIO | None def __repr__(self) -> str: if self.qemu_process: @@ -72,8 +74,8 @@ async def start( # qemu-system-x86_64 -enable-kvm -m 2048 -net nic,model=virtio # -net tap,ifname=tap0,script=no,downscript=no -drive file=alpine.qcow2,media=disk,if=virtio -nographic - journal_stdout: TextIO = journal.stream(self._journal_stdout_name) - journal_stderr: TextIO = journal.stream(self._journal_stderr_name) + self.journal_stdout: BinaryIO = journal.stream(self._journal_stdout_name) + self.journal_stderr: BinaryIO = journal.stream(self._journal_stderr_name) # hardware_resources.published ports -> not implemented at the moment # hardware_resources.seconds -> only for microvm args = [ @@ -120,8 +122,8 @@ async def start( self.qemu_process = proc = await 
asyncio.create_subprocess_exec( *args, stdin=asyncio.subprocess.DEVNULL, - stdout=journal_stdout, - stderr=journal_stderr, + stdout=self.journal_stdout, + stderr=self.journal_stderr, ) print( @@ -149,3 +151,8 @@ def send_shutdown_message(self): async def stop(self): """Stop the VM.""" self.send_shutdown_message() + + if self.journal_stdout and self.journal_stdout != asyncio.subprocess.DEVNULL: + self.journal_stdout.close() + if self.journal_stderr and self.journal_stderr != asyncio.subprocess.DEVNULL: + self.journal_stderr.close() From 7ee5384edabd0326a5db1f5c14803730c0561d63 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 14 Nov 2024 16:40:41 +0100 Subject: [PATCH 6/6] Update PULL_REQUEST_TEMPLATE.md for dependencies check (#722) Add a check for dependencies update --- .github/PULL_REQUEST_TEMPLATE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index bcf76460..ff965a1d 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -10,6 +10,7 @@ Related ClickUp, GitHub or Jira tickets : ALEPH-XXX - [ ] New classes and functions contain docstrings explaining what they provide. - [ ] All new code is covered by relevant tests. - [ ] Documentation has been updated regarding these changes. +- [ ] Dependencies update in the project.toml have been mirrored in the Debian package build script `packaging/Makefile` ## Changes