From 1435b47d91c5645208e3049c6480bb6f550934ce Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:10:09 +0100 Subject: [PATCH 1/6] packages/ociLayerTar: resolve symlinks by default Many Nix packages (e.g. packages aggregated into one with `symlinkJoin`) consist of symlinks. Our current implementation of the OCI layer builder does not handle symlinks at all and just copies them without copying the object they're pointing to. A sane configuration-less implementation would probably copy symlinks recursively until the target is found and copy that as well. For now, this just switches to an implementation that resolves all symlinks recursively, which is fine for all of our use-cases at the moment. --- packages/by-name/ociLayerTar/package.nix | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/by-name/ociLayerTar/package.nix b/packages/by-name/ociLayerTar/package.nix index 318f3f0957..8f7d8d6216 100644 --- a/packages/by-name/ociLayerTar/package.nix +++ b/packages/by-name/ociLayerTar/package.nix @@ -49,8 +49,11 @@ runCommandLocal "ociLayer" # Copy files into the tree (./root/) for i in ''${!srcs[@]}; do + # resolve symlinks + src=$(readlink -f ''${srcs[i]}) + mkdir -p "./root/$(dirname ''${dests[$i]})" - cp -rT "''${srcs[i]}" "./root/''${dests[$i]}" + cp -rT "''${src}" "./root/''${dests[$i]}" done # Create the layer tarball From 8c9c2473c8426c60efafc043155c39c42e366dbd Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:11:19 +0100 Subject: [PATCH 2/6] packages/buildVerityMicroVM: init This adds a Nix builder to build a micro VM image for direct Linux boot, specifically for the bare-metal Kata image where this is necessary to satisfy Contrast's security assumptions made on the SNP launch digest computation.
--- .../by-name/buildVerityMicroVM/package.nix | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 packages/by-name/buildVerityMicroVM/package.nix diff --git a/packages/by-name/buildVerityMicroVM/package.nix b/packages/by-name/buildVerityMicroVM/package.nix new file mode 100644 index 0000000000..c200286838 --- /dev/null +++ b/packages/by-name/buildVerityMicroVM/package.nix @@ -0,0 +1,58 @@ +# Copyright 2024 Edgeless Systems GmbH +# SPDX-License-Identifier: AGPL-3.0-only + +# Builds a micro VM image (i.e. rootfs, kernel and kernel cmdline) from a NixOS +# configuration. These components can then be booted in a microVM-fashion +# with QEMU's direct Linux boot feature. +# See: https://qemu-project.gitlab.io/qemu/system/linuxboot.html + +{ + symlinkJoin, + lib, +}: + +nixos-config: + +let + image = nixos-config.image.overrideAttrs (oldAttrs: { + passthru = oldAttrs.passthru // { + imageFileName = "${oldAttrs.pname}_${oldAttrs.version}.raw"; + }; + }); +in + +lib.throwIf + (lib.foldlAttrs ( + acc: _: partConfig: + acc || (partConfig.repartConfig.Type == "esp") + ) false nixos-config.config.image.repart.partitions) + "MicroVM images should not contain an ESP." 
+ + symlinkJoin + { + pname = "microvm-image"; + inherit (nixos-config.config.system.image) version; + + paths = [ + nixos-config.config.system.build.kernel + nixos-config.config.system.build.initialRamdisk + image + ]; + + passthru = + let + roothash = builtins.head ( + lib.map (e: e.roothash) (builtins.fromJSON (builtins.readFile "${image}/repart-output.json")) + ); + in + { + cmdline = lib.concatStringsSep " " ( + nixos-config.config.boot.kernelParams + ++ [ + "init=${nixos-config.config.system.build.toplevel}/init" + "roothash=${roothash}" + ] + ); + inherit (image) imageFileName; + }; + } From 1cf1c674dc6e12299aa0458aec178b5de1bb9bf9 Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:13:21 +0100 Subject: [PATCH 3/6] packages/kata-kernel-uvm: add config options for bare-metal use Using the Kata kernel with a bare-metal NixOS image requires some additional config options to satisfy NixOS' sanity checks, so add them here. --- packages/by-name/kata/kata-kernel-uvm/package.nix | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/packages/by-name/kata/kata-kernel-uvm/package.nix b/packages/by-name/kata/kata-kernel-uvm/package.nix index b3cc2cb024..79d0ba442f 100644 --- a/packages/by-name/kata/kata-kernel-uvm/package.nix +++ b/packages/by-name/kata/kata-kernel-uvm/package.nix @@ -27,8 +27,19 @@ let # 3. Disable module signing to make the build reproducable.
substituteInPlace $config \ --replace-fail 'CONFIG_INITRAMFS_SOURCE="initramfs.cpio.gz"' 'CONFIG_INITRAMFS_SOURCE=""' \ + --replace-fail 'CONFIG_MODULE_SIG=y' 'CONFIG_MODULE_SIG=n' \ --replace-fail '# CONFIG_DM_INIT is not set' 'CONFIG_DM_INIT=y' \ - --replace-fail 'CONFIG_MODULE_SIG=y' 'CONFIG_MODULE_SIG=n' + --replace-fail '# CONFIG_DMIID is not set' 'CONFIG_DMIID=y' \ + --replace-fail '# CONFIG_TMPFS_POSIX_ACL is not set' 'CONFIG_TMPFS_POSIX_ACL=y' \ + --replace-fail '# CONFIG_TMPFS_XATTR is not set' 'CONFIG_TMPFS_XATTR=y' \ + --replace-fail '# CONFIG_EFIVAR_FS is not set' 'CONFIG_EFIVAR_FS=y' \ + --replace-fail '# CONFIG_RD_ZSTD is not set' 'CONFIG_RD_ZSTD=y' \ + --replace-fail '# CONFIG_VFAT_FS is not set' 'CONFIG_VFAT_FS=y' \ + --replace-fail '# CONFIG_NLS_CODEPAGE_437 is not set' 'CONFIG_NLS_CODEPAGE_437=y' \ + --replace-fail '# CONFIG_NLS_ISO8859_1 is not set' 'CONFIG_NLS_ISO8859_1=y' \ + --replace-fail '# CONFIG_ATA is not set' 'CONFIG_ATA=y' + + echo "CONFIG_ATA_PIIX=y" >> $config ''; dontBuild = true; From accbe46eb68ec02734da60e4f8ec0cdbe3d435a6 Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:14:33 +0100 Subject: [PATCH 4/6] packages/kata-runtime: allow booting with image and initrd Kata has a check to see if only image OR initrd are supplied, which is not needed for our use-case. So add a patch to remove that. This should probably be brought upstream in a usable fashion later on.
--- ...ime-allow-initrd-AND-image-to-be-set.patch | 70 +++++++++++++++++++ .../by-name/kata/kata-runtime/package.nix | 4 ++ 2 files changed, 74 insertions(+) create mode 100644 packages/by-name/kata/kata-runtime/0017-runtime-allow-initrd-AND-image-to-be-set.patch diff --git a/packages/by-name/kata/kata-runtime/0017-runtime-allow-initrd-AND-image-to-be-set.patch b/packages/by-name/kata/kata-runtime/0017-runtime-allow-initrd-AND-image-to-be-set.patch new file mode 100644 index 0000000000..2226146eb0 --- /dev/null +++ b/packages/by-name/kata/kata-runtime/0017-runtime-allow-initrd-AND-image-to-be-set.patch @@ -0,0 +1,70 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Moritz Sanft <58110325+msanft@users.noreply.github.com> +Date: Mon, 18 Nov 2024 12:41:40 +0100 +Subject: [PATCH] runtime: allow initrd AND image to be set + +Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> +--- + .../virtcontainers/hypervisor_config_darwin.go | 2 -- + .../virtcontainers/hypervisor_config_linux.go | 2 -- + src/runtime/virtcontainers/qemu.go | 18 +++--------------- + 3 files changed, 3 insertions(+), 19 deletions(-) + +diff --git a/src/runtime/virtcontainers/hypervisor_config_darwin.go b/src/runtime/virtcontainers/hypervisor_config_darwin.go +index 1225271a2a4c5d9340022c22ee6889171bc21b93..a3398bcf6fac68e272a4ca1de962e585c4cf4fae 100644 +--- a/src/runtime/virtcontainers/hypervisor_config_darwin.go ++++ b/src/runtime/virtcontainers/hypervisor_config_darwin.go +@@ -21,8 +21,6 @@ func validateHypervisorConfig(conf *HypervisorConfig) error { + + if conf.ImagePath == "" && conf.InitrdPath == "" { + return fmt.Errorf("Missing image and initrd path") +- } else if conf.ImagePath != "" && conf.InitrdPath != "" { +- return fmt.Errorf("Image and initrd path cannot be both set") + } + + if conf.NumVCPUs == 0 { +diff --git a/src/runtime/virtcontainers/hypervisor_config_linux.go b/src/runtime/virtcontainers/hypervisor_config_linux.go +index 
f41cd22bd4ba96e5305ccb58e74c6d983b077974..8e1ca38eb620d58ffd4c83bbf4c666c1bc21efc3 100644 +--- a/src/runtime/virtcontainers/hypervisor_config_linux.go ++++ b/src/runtime/virtcontainers/hypervisor_config_linux.go +@@ -28,8 +28,6 @@ func validateHypervisorConfig(conf *HypervisorConfig) error { + } + } else if conf.ImagePath == "" && conf.InitrdPath == "" { + return fmt.Errorf("Missing image and initrd path") +- } else if conf.ImagePath != "" && conf.InitrdPath != "" { +- return fmt.Errorf("Image and initrd path cannot be both set") + } + + if err := conf.CheckTemplateConfig(); err != nil { +diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go +index 2c6311c067935a2c5da0a1018420bab684b670e8..3f4e143349e7467e530b5e3593f65134f9a5798c 100644 +--- a/src/runtime/virtcontainers/qemu.go ++++ b/src/runtime/virtcontainers/qemu.go +@@ -415,24 +415,12 @@ func (q *qemu) buildDevices(ctx context.Context, kernelPath string) ([]govmmQemu + return nil, nil, nil, err + } + +- assetPath, assetType, err := q.config.ImageOrInitrdAssetPath() +- if err != nil { +- return nil, nil, nil, err +- } +- +- if assetType == types.ImageAsset { +- devices, err = q.arch.appendImage(ctx, devices, assetPath) ++ devices, err = q.arch.appendImage(ctx, devices, q.config.ImagePath) + if err != nil { + return nil, nil, nil, err + } +- } else if assetType == types.InitrdAsset { +- // InitrdAsset, need to set kernel initrd path +- kernel.InitrdPath = assetPath +- } else if assetType == types.SecureBootAsset { +- // SecureBootAsset, no need to set image or initrd path +- q.Logger().Info("For IBM Z Secure Execution, initrd path should not be set") +- kernel.InitrdPath = "" +- } ++ ++ kernel.InitrdPath = q.config.InitrdPath + + if q.config.IOMMU { + devices, err = q.arch.appendIOMMU(devices) diff --git a/packages/by-name/kata/kata-runtime/package.nix b/packages/by-name/kata/kata-runtime/package.nix index 087d0aa278..a3e7bd5a4d 100644 --- 
a/packages/by-name/kata/kata-runtime/package.nix +++ b/packages/by-name/kata/kata-runtime/package.nix @@ -93,6 +93,10 @@ buildGoModule rec { ./0014-kata-sys-util-remove-obsolete-cgroups-dependency.patch ./0015-kata-sys-util-move-json-parsing-to-protocols-crate.patch ./0016-protocols-only-build-RLimit-impls-on-Linux.patch + + # Disable a check in Kata that prevents setting both image and initrd. + # For us, there's no practical reason not to do so. + ./0017-runtime-allow-initrd-AND-image-to-be-set.patch ]; }; From 93c473d31fc9df9e7bd3f48d0697416129816146 Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:15:57 +0100 Subject: [PATCH 5/6] packages/boot-microvm: init This adds a little helper script to boot a Micro VM, as we build them for Kata bare-metal, via QEMU. --- packages/by-name/boot-microvm/package.nix | 36 +++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 packages/by-name/boot-microvm/package.nix diff --git a/packages/by-name/boot-microvm/package.nix b/packages/by-name/boot-microvm/package.nix new file mode 100644 index 0000000000..416f82a26b --- /dev/null +++ b/packages/by-name/boot-microvm/package.nix @@ -0,0 +1,36 @@ +# Copyright 2024 Edgeless Systems GmbH +# SPDX-License-Identifier: AGPL-3.0-only + +{ + writeShellApplication, + qemu, + OVMF, +}: + +# Usage example: +# outPath=$(nix build .#kata.kata-image --print-out-paths); nix run .#boot-microvm -- "${outPath}/bzImage" "${outPath}/initrd" "${outPath}/image-podvm-gpu_1-rc1.raw" "$(nix eval --raw .#kata.kata-image.cmdline)" + +writeShellApplication { + name = "boot-microvm"; + runtimeInputs = [ qemu ]; + text = '' + if [ $# -ne 4 ]; then + echo "Usage: $0 <kernel> <initrd> <image> <cmdline>"; + exit 1; + fi + + tmpFile=$(mktemp) + cp "$3" "$tmpFile" + + qemu-system-x86_64 \ + -enable-kvm \ + -m 3G \ + -nographic \ + -drive if=pflash,format=raw,readonly=on,file=${OVMF.firmware} \ + -drive if=pflash,format=raw,readonly=on,file=${OVMF.variables} \ + -kernel
"$1" \ + -initrd "$2" \ + -append "$4" \ + -drive "if=virtio,format=raw,file=$tmpFile" + ''; +} From 6b903fcdc86cdf04ae822b4f14b96e2dbfa599e1 Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:17:16 +0100 Subject: [PATCH 6/6] Add NixOS image for bare-metal Kata This switches the image used in our bare-metal Kata uses (e.g. non-AKS and non-peerpods) to a NixOS image that we build in-tree as a MicroVM image (e.g. separated kernel, initrd, cmdline and rootfs). --- docs/docs/features-limitations.md | 7 + .../constants/configuration-qemu-tdx.toml | 3 + nodeinstaller/internal/constants/constants.go | 21 +- packages/by-name/OVMF-TDX/package.nix | 2 +- packages/by-name/image-podvm/package.nix | 1 + .../contrast-node-installer-image/package.nix | 14 +- .../by-name/kata/kata-image/buildimage.sh | 121 ---- .../kata/kata-image/package-index.json | 562 ------------------ packages/by-name/kata/kata-image/package.nix | 252 +------- .../by-name/kata/kata-kernel-uvm/package.nix | 1 + .../by-name/kata/kata-runtime/package.nix | 8 + .../kata/snp-launch-digest/package.nix | 28 +- .../kata/tdx-launch-digests/package.nix | 28 +- packages/by-name/mkNixosConfig/package.nix | 6 +- ...hw-x86-load-initrd-to-static-address.patch | 39 ++ packages/by-name/qemu-tdx-static/package.nix | 2 + packages/nixos/azure.nix | 2 + packages/nixos/debug.nix | 4 +- packages/nixos/image.nix | 116 ++-- packages/nixos/kata.nix | 158 ++--- packages/nixos/peerpods.nix | 103 ++++ packages/nixos/system.nix | 12 +- tools/tdx-measure/main.go | 25 +- tools/tdx-measure/rtmr/rtmr.go | 42 +- 24 files changed, 455 insertions(+), 1102 deletions(-) delete mode 100644 packages/by-name/kata/kata-image/buildimage.sh delete mode 100644 packages/by-name/kata/kata-image/package-index.json create mode 100644 packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch create mode 100644 packages/nixos/peerpods.nix diff --git 
a/docs/docs/features-limitations.md b/docs/docs/features-limitations.md index 386b0bc0d4..69a1c028ef 100644 --- a/docs/docs/features-limitations.md +++ b/docs/docs/features-limitations.md @@ -34,3 +34,10 @@ Currently, this requires inspecting the iptables rules on startup or terminating The Contrast Coordinator is a singleton and can't be scaled to more than one instance. When this instance's pod is restarted, for example for node maintenance, it needs to be recovered manually. In a future release, we plan to support distributed Coordinator instances that can recover automatically. + +## Overriding Kata configuration + +Kata Containers supports [overriding certain configuration values via Kubernetes annotations](https://github.com/kata-containers/kata-containers/blob/b4da4b5e3b9b21048af9333b071235a57a3e9493/docs/how-to/how-to-set-sandbox-config-kata.md). + +It needs to be noted that setting these values is unsupported, and doing so may lead to unexpected +behaviour, as Contrast isn't tested against all possible configuration combinations. diff --git a/nodeinstaller/internal/constants/configuration-qemu-tdx.toml b/nodeinstaller/internal/constants/configuration-qemu-tdx.toml index c37fd9b4ed..7dd1b0b590 100644 --- a/nodeinstaller/internal/constants/configuration-qemu-tdx.toml +++ b/nodeinstaller/internal/constants/configuration-qemu-tdx.toml @@ -18,6 +18,9 @@ cpu_features="-vmx-rdseed-exit,pmu=off" default_vcpus = 1 default_maxvcpus = 0 default_bridges = 1 +# On TDX, when lowering this, the patch: +# packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch +# needs to be updated accordingly. 
default_memory = 2048 default_maxmemory = 0 disable_block_device_use = false diff --git a/nodeinstaller/internal/constants/constants.go b/nodeinstaller/internal/constants/constants.go index 89628d918d..be75b60c20 100644 --- a/nodeinstaller/internal/constants/constants.go +++ b/nodeinstaller/internal/constants/constants.go @@ -64,17 +64,17 @@ func KataRuntimeConfig(baseDir string, platform platforms.Platform, qemuExtraKer config.Hypervisor["qemu"]["path"] = filepath.Join(baseDir, "tdx", "bin", "qemu-system-x86_64") config.Hypervisor["qemu"]["firmware"] = filepath.Join(baseDir, "tdx", "share", "OVMF.fd") config.Hypervisor["qemu"]["image"] = filepath.Join(baseDir, "share", "kata-containers.img") - config.Hypervisor["qemu"]["kernel"] = filepath.Join(baseDir, "share", "kata-kernel") config.Hypervisor["qemu"]["valid_hypervisor_paths"] = []string{filepath.Join(baseDir, "tdx", "bin", "qemu-system-x86_64")} config.Hypervisor["qemu"]["block_device_aio"] = "threads" config.Hypervisor["qemu"]["shared_fs"] = "none" - kernelParams := qemuExtraKernelParams + config.Hypervisor["qemu"]["initrd"] = filepath.Join(baseDir, "share", "kata-initrd.zst") + config.Hypervisor["qemu"]["kernel"] = filepath.Join(baseDir, "share", "kata-kernel") + // Replace the kernel params entirely (and don't append) since that's + // also what we do when calculating the launch measurement. + config.Hypervisor["qemu"]["kernel_params"] = qemuExtraKernelParams if debug { config.Hypervisor["qemu"]["enable_debug"] = true } - // Replace the kernel params entirely (and don't append) since that's - // also what we do when calculating the launch measurement. 
- config.Hypervisor["qemu"]["kernel_params"] = kernelParams case platforms.K3sQEMUSNP: if err := toml.Unmarshal([]byte(kataBareMetalQEMUSNPBaseConfig), &config); err != nil { return nil, fmt.Errorf("failed to unmarshal kata runtime configuration: %w", err) @@ -82,19 +82,18 @@ func KataRuntimeConfig(baseDir string, platform platforms.Platform, qemuExtraKer config.Hypervisor["qemu"]["path"] = filepath.Join(baseDir, "snp", "bin", "qemu-system-x86_64") config.Hypervisor["qemu"]["firmware"] = filepath.Join(baseDir, "snp", "share", "OVMF.fd") config.Hypervisor["qemu"]["image"] = filepath.Join(baseDir, "share", "kata-containers.img") - config.Hypervisor["qemu"]["kernel"] = filepath.Join(baseDir, "share", "kata-kernel") - delete(config.Hypervisor["qemu"], "initrd") config.Hypervisor["qemu"]["block_device_aio"] = "threads" config.Hypervisor["qemu"]["shared_fs"] = "none" config.Hypervisor["qemu"]["valid_hypervisor_paths"] = []string{filepath.Join(baseDir, "snp", "bin", "qemu-system-x86_64")} config.Hypervisor["qemu"]["rootfs_type"] = "erofs" - kernelParams := qemuExtraKernelParams + config.Hypervisor["qemu"]["initrd"] = filepath.Join(baseDir, "share", "kata-initrd.zst") + config.Hypervisor["qemu"]["kernel"] = filepath.Join(baseDir, "share", "kata-kernel") + // Replace the kernel params entirely (and don't append) since that's + // also what we do when calculating the launch measurement. + config.Hypervisor["qemu"]["kernel_params"] = qemuExtraKernelParams if debug { config.Hypervisor["qemu"]["enable_debug"] = true } - // Replace the kernel params entirely (and don't append) since that's - // also what we do when calculating the launch measurement. 
- config.Hypervisor["qemu"]["kernel_params"] = kernelParams default: return nil, fmt.Errorf("unsupported platform: %s", platform) } diff --git a/packages/by-name/OVMF-TDX/package.nix b/packages/by-name/OVMF-TDX/package.nix index be4941eed1..03a6930bb6 100644 --- a/packages/by-name/OVMF-TDX/package.nix +++ b/packages/by-name/OVMF-TDX/package.nix @@ -9,7 +9,7 @@ debug ? false, }: -edk2.mkDerivation "OvmfPkg/IntelTdx/IntelTdxX64.dsc" rec { +edk2.mkDerivation "OvmfPkg/IntelTdx/IntelTdxX64.dsc" { name = "OVMF-TDX"; buildFlags = lib.optionals debug [ "-D DEBUG_ON_SERIAL_PORT=TRUE" ]; diff --git a/packages/by-name/image-podvm/package.nix b/packages/by-name/image-podvm/package.nix index 26729aa10f..bf97f45e3d 100644 --- a/packages/by-name/image-podvm/package.nix +++ b/packages/by-name/image-podvm/package.nix @@ -15,5 +15,6 @@ buildVerityUKI (mkNixosConfig { debug.enable = withDebug; gpu.enable = withGPU; azure.enable = withCSP == "azure"; + peerpods.enable = true; }; }) diff --git a/packages/by-name/kata/contrast-node-installer-image/package.nix b/packages/by-name/kata/contrast-node-installer-image/package.nix index be31bc4d8b..b9f346a94f 100644 --- a/packages/by-name/kata/contrast-node-installer-image/package.nix +++ b/packages/by-name/kata/contrast-node-installer-image/package.nix @@ -47,6 +47,10 @@ let url = "file:///opt/edgeless/share/kata-kernel"; path = "/opt/edgeless/@@runtimeName@@/share/kata-kernel"; } + { + url = "file:///opt/edgeless/share/kata-initrd.zst"; + path = "/opt/edgeless/@@runtimeName@@/share/kata-initrd.zst"; + } { url = "file:///opt/edgeless/snp/bin/qemu-system-x86_64"; path = "/opt/edgeless/@@runtimeName@@/snp/bin/qemu-system-x86_64"; @@ -106,7 +110,7 @@ let } ]; inherit debugRuntime; - qemuExtraKernelParams = kata.snp-launch-digest.dmVerityArgs; + qemuExtraKernelParams = kata.kata-image.cmdline; }; destination = "/config/contrast-node-install.json"; } @@ -116,13 +120,17 @@ let kata-container-img = ociLayerTar { files = [ { - source = 
kata.kata-image; + source = "${kata.kata-image}/${kata.kata-image.imageFileName}"; destination = "/opt/edgeless/share/kata-containers.img"; } { - source = "${kata.kata-kernel-uvm}/bzImage"; + source = "${kata.kata-image}/bzImage"; destination = "/opt/edgeless/share/kata-kernel"; } + { + source = "${kata.kata-image}/initrd.zst"; + destination = "/opt/edgeless/share/kata-initrd.zst"; + } ]; }; diff --git a/packages/by-name/kata/kata-image/buildimage.sh b/packages/by-name/kata/kata-image/buildimage.sh deleted file mode 100644 index 18fe86df5a..0000000000 --- a/packages/by-name/kata/kata-image/buildimage.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2024 Edgeless Systems GmbH -# SPDX-License-Identifier: AGPL-3.0-only - -set -euo pipefail -shopt -s inherit_errexit - -# Image layout: -# -# +---------------------------------+-------------------+-------------------------+ -# | 512B DOS MBR (padded to 1 MiB) | p0 rootfs | p1 hashtree | -# +---------------------------------+-------------------+-------------------------+ -# | | | | -# 0 1MiB 1MiB + rootfs_size 1MiB + rootfs_size + hashtree_size - -# rootfs: erofs filesystem mounted at / (read-only) -# hashtree: dm-verity hashtree without superblock - -readonly MIB=1048576 - -in=$1 -out=$2 -tmpdir=$(mktemp -d) -trap 'rm -rf $tmpdir' EXIT -rootfs=$tmpdir/01_rootfs -hashtree=$tmpdir/02_verity_hashtree -dm_verity_file=$out/dm_verity.txt -roothash=$out/roothash -raw=$out/raw.img -uuid=c1b9d5a2-f162-11cf-9ece-0020afc76f16 -salt=0102030405060708090a0b0c0d0e0f - -if [ -z "${SOURCE_DATE_EPOCH}" ]; then - echo "SOURCE_DATE_EPOCH is not set" >&2 - exit 1 -fi - -mkdir -p "$out" - -# create the rootfs and pad it to 1MiB -mkfs.erofs \ - -z lz4 \ - -b 4096 \ - -T "$SOURCE_DATE_EPOCH" \ - -U "$uuid" \ - --tar=f \ - "$rootfs" \ - "$in" -truncate -s '%1MiB' "$rootfs" - -# create the dm-verity hashtree -verity_out=$( - veritysetup format \ - "$rootfs" \ - "$hashtree" \ - --data-block-size 4096 \ - --hash-block-size 4096 \ 
- --no-superblock \ - --uuid "$uuid" \ - --salt "$salt" | tee "$dm_verity_file" -) -# pad the hashtree to multiple of 1MiB -truncate -s '%1MiB' "$hashtree" -# extract dm-verity parameters from text output to individual files -sed -i 1d "$dm_verity_file" -root_hash=$(echo "$verity_out" | grep -oP 'Root hash:\s+\K\w+' | tr -d "[:space:]") -echo -n "$root_hash" >"$roothash" -hash_type=$(echo "$verity_out" | grep -oP 'Hash type:\s+\K\w+' | tr -d "[:space:]") -echo -n "$hash_type" >"$out/hash_type" -data_blocks=$(echo "$verity_out" | grep -oP 'Data blocks:\s+\K\w+' | tr -d "[:space:]") -echo -n "$data_blocks" >"$out/data_blocks" -data_block_size=$(echo "$verity_out" | grep -oP 'Data block size:\s+\K\w+' | tr -d "[:space:]") -echo -n "$data_block_size" >"$out/data_block_size" -hash_blocks=$(echo "$verity_out" | grep -oP 'Hash blocks:\s+\K\w+' | tr -d "[:space:]") -echo -n "$hash_blocks" >"$out/hash_blocks" -hash_block_size=$(echo "$verity_out" | grep -oP 'Hash block size:\s+\K\w+' | tr -d "[:space:]") -echo -n "$hash_block_size" >"$out/hash_block_size" -hash_algorithm=$(echo "$verity_out" | grep -oP 'Hash algorithm:\s+\K\w+' | tr -d "[:space:]") -echo -n "$hash_algorithm" >"$out/hash_algorithm" -echo -n "$salt" >"$out/salt" - -rootfs_size_mib=$(($(stat -c %s "$rootfs") / "$MIB")) -# full image size is dos header + rootfs + hashtree -hashtree_size_bytes=$(stat -c %s "$hashtree") -hashtree_size_mib=$(($(stat -c %s "$hashtree") / "$MIB")) -# img_size is the size of the full image in bytes -# DOS MBR (padded to 1MiB) + rootfs + hashtree -img_size_bytes=$(("$MIB" + "$rootfs_size_mib" * "$MIB" + "$hashtree_size_bytes")) - -# Where the rootfs starts in MiB -readonly rootfs_start=1 -# hash_start is the start of the hashtree in MiB -hash_start=$((rootfs_start + rootfs_size_mib)) -hash_end=$((hash_start + hashtree_size_mib)) - -rs=$(printf "%4dMiB" "$rootfs_start") -hs=$(printf "%4dMiB" "$hash_start") -he=$(printf "%4dMiB" "$hash_end") -cat < 
/build/rootfs/etc/kata-opa/default-policy.rego < $out/milan.hex ${lib.getExe python3Packages.sev-snp-measure} \ --mode snp \ @@ -43,11 +48,8 @@ stdenvNoCC.mkDerivation { --vcpus 1 \ --vcpu-type EPYC-Genoa \ --kernel ${kernel} \ - --append '${cmdline}' \ + --initrd ${initrd} \ + --append "${cmdline}" \ --output-format hex > $out/genoa.hex ''; - - passthru = { - inherit dmVerityArgs; - }; } diff --git a/packages/by-name/kata/tdx-launch-digests/package.nix b/packages/by-name/kata/tdx-launch-digests/package.nix index 9e3bb8dbf1..1916462fc7 100644 --- a/packages/by-name/kata/tdx-launch-digests/package.nix +++ b/packages/by-name/kata/tdx-launch-digests/package.nix @@ -11,25 +11,31 @@ debug ? false, }: let - image = kata.kata-image; - inherit (image) dmVerityArgs; - cmdlineBase = "tsc=reliable no_timer_check rcupdate.rcu_expedited=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1 noreplace-smp reboot=k cryptomgr.notests net.ifnames=0 pci=lastbus=0 root=/dev/vda1 rootflags=ro rootfstype=erofs console=hvc0 console=hvc1 quiet systemd.show_status=false panic=1 nr_cpus=1 selinux=0 systemd.unit=kata-containers.target systemd.mask=systemd-networkd.service systemd.mask=systemd-networkd.socket scsi_mod.scan=none"; - cmdlineBaseDebug = "tsc=reliable no_timer_check rcupdate.rcu_expedited=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1 noreplace-smp reboot=k cryptomgr.notests net.ifnames=0 pci=lastbus=0 root=/dev/vda1 rootflags=ro rootfstype=erofs console=hvc0 console=hvc1 debug systemd.show_status=true systemd.log_level=debug panic=1 nr_cpus=1 selinux=0 systemd.unit=kata-containers.target systemd.mask=systemd-networkd.service systemd.mask=systemd-networkd.socket scsi_mod.scan=none agent.log=debug agent.debug_console agent.debug_console_vport=1026"; - cmdline = "${if debug then cmdlineBaseDebug else cmdlineBase} ${dmVerityArgs}"; + ovmf-tdx = "${OVMF-TDX}/FV/OVMF.fd"; + kernel = "${kata.kata-image}/bzImage"; + initrd = "${kata.kata-image}/initrd"; + + # Kata 
uses a base command line and then appends the command line from the kata config (i.e. also our node-installer config). + # Thus, we need to perform the same steps when calculating the digest. + baseCmdline = if debug then kata.kata-runtime.cmdline.debug else kata.kata-runtime.cmdline.default; + cmdline = lib.strings.concatStringsSep " " [ + baseCmdline + kata.kata-image.cmdline + ]; in stdenvNoCC.mkDerivation { name = "tdx-launch-digests"; - inherit (image) version; + inherit (kata.kata-image) version; dontUnpack = true; buildPhase = '' mkdir $out - ${lib.getExe tdx-measure} mrtd -f ${OVMF-TDX}/FV/OVMF.fd > $out/mrtd.hex - ${lib.getExe tdx-measure} rtmr -f ${OVMF-TDX}/FV/OVMF.fd -k ${kata.kata-kernel-uvm}/bzImage -c '${cmdline}' 0 > $out/rtmr0.hex - ${lib.getExe tdx-measure} rtmr -f ${OVMF-TDX}/FV/OVMF.fd -k ${kata.kata-kernel-uvm}/bzImage -c '${cmdline}' 1 > $out/rtmr1.hex - ${lib.getExe tdx-measure} rtmr -f ${OVMF-TDX}/FV/OVMF.fd -k ${kata.kata-kernel-uvm}/bzImage -c '${cmdline}' 2 > $out/rtmr2.hex - ${lib.getExe tdx-measure} rtmr -f ${OVMF-TDX}/FV/OVMF.fd -k ${kata.kata-kernel-uvm}/bzImage -c '${cmdline}' 3 > $out/rtmr3.hex + ${lib.getExe tdx-measure} mrtd -f ${ovmf-tdx} > $out/mrtd.hex + ${lib.getExe tdx-measure} rtmr -f ${ovmf-tdx} -k ${kernel} -i ${initrd} -c '${cmdline}' 0 > $out/rtmr0.hex + ${lib.getExe tdx-measure} rtmr -f ${ovmf-tdx} -k ${kernel} -i ${initrd} -c '${cmdline}' 1 > $out/rtmr1.hex + ${lib.getExe tdx-measure} rtmr -f ${ovmf-tdx} -k ${kernel} -i ${initrd} -c '${cmdline}' 2 > $out/rtmr2.hex + ${lib.getExe tdx-measure} rtmr -f ${ovmf-tdx} -k ${kernel} -i ${initrd} -c '${cmdline}' 3 > $out/rtmr3.hex ''; } diff --git a/packages/by-name/mkNixosConfig/package.nix b/packages/by-name/mkNixosConfig/package.nix index 767761accb..e5b90e109d 100644 --- a/packages/by-name/mkNixosConfig/package.nix +++ b/packages/by-name/mkNixosConfig/package.nix @@ -45,7 +45,11 @@ lib.makeOverridable ( nvidia-ctk-with-config tdx-tools ; - inherit (outerPkgs.kata) 
kata-agent; + inherit (outerPkgs.kata) + kata-agent + kata-runtime + kata-kernel-uvm + ; }) ]; diff --git a/packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch b/packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch new file mode 100644 index 0000000000..597e756972 --- /dev/null +++ b/packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch @@ -0,0 +1,39 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Moritz Sanft <58110325+msanft@users.noreply.github.com> +Date: Thu, 21 Nov 2024 14:36:23 +0100 +Subject: [PATCH] hw/x86: load initrd to static address + +For TDX RTMRs to be predictable regardless of VM memory size, we need to +load the initrd to a static address, so no dynamic value ends up in the +mapped kernel image. (as the initrd address does) +As we control the minimum VM memory size in Contrast, we just load the initrd +to the address it gets loaded to for Contrast's minimum VM memory, regardless +of if the VM has more memory, as it would otherwise happen with QEMU's 4Gi +constraint. + +Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> +--- + hw/i386/x86.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/hw/i386/x86.c b/hw/i386/x86.c +index 504575abfa98bc25e498e219a2d58d8d31e5feaa..963e940ff580e7774faaad73410a5d5b69cfb4cc 100644 +--- a/hw/i386/x86.c ++++ b/hw/i386/x86.c +@@ -953,6 +953,16 @@ void x86_load_linux(X86MachineState *x86ms, + initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1; + } + ++ // For TDX RTMRs to be predictable regardless of VM memory size, we need to ++ // load the initrd to a static address, so no dynamic value ends up in the ++ // mapped kernel image. 
(as the initrd address does) ++ // As we control the minimum VM memory size in Contrast, we just load the initrd ++ // to the address it gets loaded to for Contrast's minimum VM memory, regardless ++ // of if the VM has more memory, as it would otherwise happen with QEMU's 4Gi ++ // constraint. (See the above if-clause) ++ uint32_t contrast_min_memory = 0x80000000; ++ initrd_max = contrast_min_memory - acpi_data_size - 1; ++ + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(kernel_cmdline) + 1); + fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline); diff --git a/packages/by-name/qemu-tdx-static/package.nix b/packages/by-name/qemu-tdx-static/package.nix index 521cf11487..58af1704fa 100644 --- a/packages/by-name/qemu-tdx-static/package.nix +++ b/packages/by-name/qemu-tdx-static/package.nix @@ -61,5 +61,7 @@ in # Make the generated ACPI tables more deterministic, so that we get a # fixed hash for attestation. ./0003-i386-omit-some-unneeded-ACPI-tables.patch + # Load the initrd to a static address to make RTMRs predictable. 
+ ./0004-hw-x86-load-initrd-to-static-address.patch ]; }) diff --git a/packages/nixos/azure.nix b/packages/nixos/azure.nix index 612b48e767..ec1e3a4c20 100644 --- a/packages/nixos/azure.nix +++ b/packages/nixos/azure.nix @@ -55,6 +55,8 @@ in }; config = lib.mkIf cfg.enable { + boot.kernelPackages = pkgs.recurseIntoAttrs (pkgs.linuxPackagesFor pkgs.kernel-podvm-azure); + boot.initrd = { kernelModules = [ "hv_storvsc" diff --git a/packages/nixos/debug.nix b/packages/nixos/debug.nix index 38958f87f8..e26d7d093a 100644 --- a/packages/nixos/debug.nix +++ b/packages/nixos/debug.nix @@ -32,7 +32,9 @@ in services.getty.autologinUser = "root"; - boot.kernelParams = [ "console=ttyS0" ]; + # required for local booting, but no boot logs in kata with this + boot.kernelParams = lib.optionals config.contrast.azure.enable [ "console=ttyS0" ]; + boot.initrd.systemd.emergencyAccess = true; systemd.enableEmergencyMode = true; }; diff --git a/packages/nixos/image.nix b/packages/nixos/image.nix index 192102e5cc..caf834473a 100644 --- a/packages/nixos/image.nix +++ b/packages/nixos/image.nix @@ -1,62 +1,80 @@ # Copyright 2024 Edgeless Systems GmbH # SPDX-License-Identifier: AGPL-3.0-only -{ config, pkgs, ... }: +{ + config, + pkgs, + lib, + ... +}: + +let + cfg = config.contrast.image; +in { - # We build the image with systemd-repart, which integrates well - # with the systemd utilities we use for dm-verity, UKI, etc. - # However, we do not use the repart unit, as we don't want - # dynamic repartitioning at run- / boot-time. - image.repart = { - name = "image-podvm-gpu"; - version = "1-rc1"; + options.contrast.image = { + microVM = lib.mkEnableOption "Build a micro VM image"; + }; - # This defines the actual partition layout. - partitions = { - # EFI System Partition, holds the UKI. 
- "00-esp" = { - contents = { - "/".source = pkgs.runCommand "esp-contents" { } '' - mkdir -p $out/EFI/BOOT - cp ${config.system.build.uki}/${config.system.boot.loader.ukiFile} $out/EFI/BOOT/BOOTX64.EFI - ''; - }; - repartConfig = { - Type = "esp"; - Format = "vfat"; - SizeMinBytes = "64M"; - UUID = "null"; # Fix partition UUID for reproducibility. - }; - }; + config = { + system.image.version = "1-rc1"; + + # We build the image with systemd-repart, which integrates well + # with the systemd utilities we use for dm-verity, UKI, etc. + # However, we do not use the repart unit, as we don't want + # dynamic repartitioning at run- / boot-time. + image.repart = { + name = "image-podvm-gpu"; + inherit (config.system.image) version; - # Root filesystem. - "10-root" = { - contents = { - "/pause_bundle".source = "${pkgs.pause-bundle}/pause_bundle"; + # This defines the actual partition layout. + partitions = { + # EFI System Partition, holds the UKI. + # Only build this partition if we need a bootable image (i.e. not a micro VM). + "00-esp" = lib.mkIf (!cfg.microVM) { + contents = { + "/".source = pkgs.runCommand "esp-contents" { } '' + mkdir -p $out/EFI/BOOT + cp ${config.system.build.uki}/${config.system.boot.loader.ukiFile} $out/EFI/BOOT/BOOTX64.EFI + ''; + }; + repartConfig = { + Type = "esp"; + Format = "vfat"; + SizeMinBytes = "64M"; + UUID = "null"; # Fix partition UUID for reproducibility. + }; }; - storePaths = [ config.system.build.toplevel ]; - repartConfig = { - Type = "root"; - Format = "erofs"; - Label = "root"; - Verity = "data"; - VerityMatchKey = "root"; - Minimize = "best"; - # We need to ensure that mountpoints are available. - # TODO (Maybe): This could be done more elegantly with CopyFiles and a skeleton tree in the vcs. - MakeDirectories = "/bin /boot /dev /etc /home /lib /lib64 /mnt /nix /opt /proc /root /run /srv /sys /tmp /usr/bin /var"; + + # Root filesystem. 
+ "10-root" = { + contents = { + "/pause_bundle".source = "${pkgs.pause-bundle}/pause_bundle"; + }; + storePaths = [ config.system.build.toplevel ]; + repartConfig = { + Type = "root"; + Format = "erofs"; + Label = "root"; + Verity = "data"; + VerityMatchKey = "root"; + Minimize = "best"; + # We need to ensure that mountpoints are available. + # TODO (Maybe): This could be done more elegantly with CopyFiles and a skeleton tree in the vcs. + MakeDirectories = "/bin /boot /dev /etc /home /lib /lib64 /mnt /nix /opt /proc /root /run /srv /sys /tmp /usr/bin /var"; + }; }; - }; - # Verity hashes for the root filesystem. - "20-root-verity" = { - repartConfig = { - Type = "root-verity"; - Label = "root-verity"; - Verity = "hash"; - VerityMatchKey = "root"; - Minimize = "best"; + # Verity hashes for the root filesystem. + "20-root-verity" = { + repartConfig = { + Type = "root-verity"; + Label = "root-verity"; + Verity = "hash"; + VerityMatchKey = "root"; + Minimize = "best"; + }; }; }; }; diff --git a/packages/nixos/kata.nix b/packages/nixos/kata.nix index d31681f850..fa85326bff 100644 --- a/packages/nixos/kata.nix +++ b/packages/nixos/kata.nix @@ -1,90 +1,98 @@ # Copyright 2024 Edgeless Systems GmbH # SPDX-License-Identifier: AGPL-3.0-only -{ lib, pkgs, ... }: - { - systemd.services.kata-agent = { - description = "Kata Containers Agent"; - documentation = [ - "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/kata-agent.service" - ]; - bindsTo = [ "netns@podns.service" ]; - wants = [ "process-user-data.service" ]; - after = [ - "netns@podns.service" - "process-user-data.service" - ]; - wantedBy = [ "multi-user.target" ]; - serviceConfig = { - Type = "exec"; # Not upstream. 
- ExecStartPre = [ "${pkgs.coreutils}/bin/mkdir -p /run/kata-containers" ]; - ExecStart = "${lib.getExe pkgs.kata-agent} --config /run/peerpod/agent-config.toml"; - ExecStopPost = "${lib.getExe pkgs.cloud-api-adaptor.kata-agent-clean} --config /run/peerpod/agent-config.toml"; - SyslogIdentifier = "kata-agent"; - }; - environment = { - KATA_AGENT_LOG_LEVEL = "debug"; - OCICRYPT_KEYPROVIDER_CONFIG = builtins.toFile "policy.json" ( - lib.strings.toJSON { default = [ { type = "insecureAcceptAnything"; } ]; } - ); - }; + config, + lib, + pkgs, + ... +}: +let + cfg = config.contrast.kata; +in +{ + options.contrast.kata = { + enable = lib.mkEnableOption "Enable Kata (non-peerpod) support"; }; - systemd.services.agent-protocol-forwarder = { - description = "Agent Protocol Forwarder"; - documentation = [ - "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/agent-protocol-forwarder.service" - ]; - wants = [ "kata-agent.service" ]; - after = [ "kata-agent.service" ]; - wantedBy = [ "multi-user.target" ]; - unitConfig = { - DefaultDependencies = false; - }; - serviceConfig = { - Type = "notify"; - ExecStart = lib.strings.concatStringsSep " " [ - "${pkgs.cloud-api-adaptor}/bin/agent-protocol-forwarder" - "-kata-agent-namespace /run/netns/podns" - "-kata-agent-socket /run/kata-containers/agent.sock" + config = lib.mkIf cfg.enable { + # https://github.com/kata-containers/kata-containers/blob/3.10.1/src/agent/kata-containers.target + systemd.targets.kata-containers = { + description = "Kata Containers Agent Target"; + requires = [ + "basic.target" + "tmp.mount" + "kata-agent.service" ]; - Restart = "on-failure"; - RestartSec = "5s"; + wantedBy = [ "basic.target" ]; + wants = [ + "chronyd.service" + # https://github.com/kata-containers/kata-containers/blob/5869046d04553c3bd2f16fa1cfb714133050e537/tools/osbuilder/rootfs-builder/rootfs.sh#L712 + "dbus.socket" + ]; + conflicts = [ + "rescue.service" + 
"rescue.target" + ]; + after = [ + "basic.target" + "rescue.service" + "rescue.target" + ]; + unitConfig.AllowIsolate = true; }; - }; - systemd.services.process-user-data = { - description = "Pull configuration from metadata service"; - documentation = [ - "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/process-user-data.service" - ]; - wants = [ "network-online.target" ]; - after = [ "network-online.target" ]; - wantedBy = [ "multi-user.target" ]; - unitConfig = { - DefaultDependencies = false; + # https://github.com/kata-containers/kata-containers/blob/3.10.1/src/agent/kata-agent.service.in + systemd.services.kata-agent = { + description = "Kata Containers Agent"; + documentation = [ "https://github.com/kata-containers/kata-containers" ]; + wants = [ "kata-containers.target" ]; + after = [ "systemd-tmpfiles-setup.service" ]; # Not upstream, but required for /etc/resolv.conf bind mount. + serviceConfig = { + Type = "exec"; # Not upstream. 
+ StandardOutput = "tty"; + ExecStart = "${lib.getExe pkgs.kata-agent}"; + LimitNOFILE = 1048576; + ExecStop = "${pkgs.coreutils}/bin/sync ; ${config.systemd.package}/bin/systemctl --force poweroff"; + FailureAction = "poweroff"; + OOMScoreAdjust = -997; + }; + # Not upstream + environment = { + KATA_AGENT_LOG_LEVEL = "debug"; + OCICRYPT_KEYPROVIDER_CONFIG = builtins.toFile "policy.json" ( + lib.strings.toJSON { default = [ { type = "insecureAcceptAnything"; } ]; } + ); + }; }; - serviceConfig = { - Type = "oneshot"; - ExecStart = "${pkgs.cloud-api-adaptor}/bin/process-user-data provision-files"; - RemainAfterExit = true; + + fileSystems."/run" = { + fsType = "tmpfs"; + options = [ + "nodev" + "nosuid" + "size=50%" + ]; + neededForBoot = true; }; - }; - systemd.services."netns@" = { - description = "Create a network namespace for pod networking"; - documentation = [ - "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/netns%40.service" - ]; - serviceConfig = { - Type = "oneshot"; - RemainAfterExit = true; - ExecStartPre = "${pkgs.iproute2}/bin/ip netns add %I"; - ExecStart = "${pkgs.iproute2}/bin/ip netns exec %I ${pkgs.iproute2}/bin/ip link set lo up"; - ExecStop = "${pkgs.iproute2}/bin/ip netns del %I"; + # Not used directly, but required for kernel-specific driver builds. + boot.kernelPackages = pkgs.recurseIntoAttrs (pkgs.linuxPackagesFor pkgs.kata-kernel-uvm); + + boot.initrd = { + # Don't require TPM2 support. (additional modules) + systemd.tpm2.enable = false; + # Don't require any of the hardware modules NixOS includes by default. 
+ includeDefaultModules = false; + }; + + networking.resolvconf.enable = false; + systemd.tmpfiles.settings."10-etc-resolvconf"."/etc/resolv.conf".f = { + group = "root"; + mode = "0755"; + user = "root"; }; - }; - environment.etc."kata-opa/default-policy.rego".source = pkgs.cloud-api-adaptor.default-policy; + environment.etc."kata-opa/default-policy.rego".source = "${pkgs.kata-runtime.src}/src/kata-opa/allow-set-policy.rego"; + }; } diff --git a/packages/nixos/peerpods.nix b/packages/nixos/peerpods.nix new file mode 100644 index 0000000000..116768e519 --- /dev/null +++ b/packages/nixos/peerpods.nix @@ -0,0 +1,103 @@ +# Copyright 2024 Edgeless Systems GmbH +# SPDX-License-Identifier: AGPL-3.0-only + +{ + config, + lib, + pkgs, + ... +}: +let + cfg = config.contrast.peerpods; +in +{ + options.contrast.peerpods = { + enable = lib.mkEnableOption "Enable peer pods support"; + }; + + config = lib.mkIf cfg.enable { + systemd.services.kata-agent = { + description = "Kata Containers Agent"; + documentation = [ + "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/kata-agent.service" + ]; + bindsTo = [ "netns@podns.service" ]; + wants = [ "process-user-data.service" ]; + after = [ + "netns@podns.service" + "process-user-data.service" + ]; + wantedBy = [ "multi-user.target" ]; + serviceConfig = { + Type = "exec"; # Not upstream. 
+ ExecStartPre = [ "${pkgs.coreutils}/bin/mkdir -p /run/kata-containers" ]; + ExecStart = "${lib.getExe pkgs.kata-agent} --config /run/peerpod/agent-config.toml"; + ExecStopPost = "${lib.getExe pkgs.cloud-api-adaptor.kata-agent-clean} --config /run/peerpod/agent-config.toml"; + SyslogIdentifier = "kata-agent"; + }; + environment = { + KATA_AGENT_LOG_LEVEL = "debug"; + OCICRYPT_KEYPROVIDER_CONFIG = builtins.toFile "policy.json" ( + lib.strings.toJSON { default = [ { type = "insecureAcceptAnything"; } ]; } + ); + }; + }; + + systemd.services.agent-protocol-forwarder = { + description = "Agent Protocol Forwarder"; + documentation = [ + "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/agent-protocol-forwarder.service" + ]; + wants = [ "kata-agent.service" ]; + after = [ "kata-agent.service" ]; + wantedBy = [ "multi-user.target" ]; + unitConfig = { + DefaultDependencies = false; + }; + serviceConfig = { + Type = "notify"; + ExecStart = lib.strings.concatStringsSep " " [ + "${pkgs.cloud-api-adaptor}/bin/agent-protocol-forwarder" + "-kata-agent-namespace /run/netns/podns" + "-kata-agent-socket /run/kata-containers/agent.sock" + ]; + Restart = "on-failure"; + RestartSec = "5s"; + }; + }; + + systemd.services.process-user-data = { + description = "Pull configuration from metadata service"; + documentation = [ + "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/process-user-data.service" + ]; + wants = [ "network-online.target" ]; + after = [ "network-online.target" ]; + wantedBy = [ "multi-user.target" ]; + unitConfig = { + DefaultDependencies = false; + }; + serviceConfig = { + Type = "oneshot"; + ExecStart = "${pkgs.cloud-api-adaptor}/bin/process-user-data provision-files"; + RemainAfterExit = true; + }; + }; + + systemd.services."netns@" = { + description = "Create a network namespace for pod networking"; + documentation = 
[ + "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/netns%40.service" + ]; + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + ExecStartPre = "${pkgs.iproute2}/bin/ip netns add %I"; + ExecStart = "${pkgs.iproute2}/bin/ip netns exec %I ${pkgs.iproute2}/bin/ip link set lo up"; + ExecStop = "${pkgs.iproute2}/bin/ip netns del %I"; + }; + }; + + environment.etc."kata-opa/default-policy.rego".source = pkgs.cloud-api-adaptor.default-policy; + }; +} diff --git a/packages/nixos/system.nix b/packages/nixos/system.nix index d11a336acc..553accc304 100644 --- a/packages/nixos/system.nix +++ b/packages/nixos/system.nix @@ -4,20 +4,24 @@ { config, lib, - pkgs, ... }: { boot.loader.grub.enable = false; - boot.kernelPackages = pkgs.recurseIntoAttrs (pkgs.linuxPackagesFor pkgs.kernel-podvm-azure); boot.kernelParams = [ "systemd.verity=yes" "selinux=0" ]; - boot.supportedFilesystems = [ "erofs" ]; + boot.supportedFilesystems = [ + "erofs" + "vfat" + ]; boot.initrd = { - supportedFilesystems = [ "erofs" ]; + supportedFilesystems = [ + "erofs" + "vfat" + ]; availableKernelModules = [ "dm_mod" "dm_verity" diff --git a/tools/tdx-measure/main.go b/tools/tdx-measure/main.go index b43f4db9d9..50f3bdb6a5 100644 --- a/tools/tdx-measure/main.go +++ b/tools/tdx-measure/main.go @@ -106,6 +106,10 @@ func newRtMrCmd() *cobra.Command { if err := cmd.MarkFlagFilename("kernel"); err != nil { panic(err) } + cmd.Flags().StringP("initrd", "i", "initrd.zst", "path to initrd file") + if err := cmd.MarkFlagFilename("initrd"); err != nil { + panic(err) + } cmd.Flags().StringP("cmdline", "c", "", "kernel command line") return cmd } @@ -136,8 +140,15 @@ func runRtMr(cmd *cobra.Command, args []string) error { if err != nil { return fmt.Errorf("can't read kernel file: %w", err) } - - digest, err = rtmr.CalcRtmr1(kernel) + initrdPath, err := cmd.Flags().GetString("initrd") + if err != nil { + return err + } + initrd, err 
:= os.ReadFile(initrdPath) + if err != nil { + return fmt.Errorf("can't read initrd file: %w", err) + } + digest, err = rtmr.CalcRtmr1(kernel, initrd) if err != nil { return fmt.Errorf("can't calculate RTMR 1: %w", err) } @@ -146,7 +157,15 @@ func runRtMr(cmd *cobra.Command, args []string) error { if err != nil { return err } - digest, err = rtmr.CalcRtmr2(cmdLine) + initrdPath, err := cmd.Flags().GetString("initrd") + if err != nil { + return err + } + initrd, err := os.ReadFile(initrdPath) + if err != nil { + return fmt.Errorf("can't read initrd file: %w", err) + } + digest, err = rtmr.CalcRtmr2(cmdLine, initrd) if err != nil { return fmt.Errorf("can't calculate RTMR 2: %w", err) } diff --git a/tools/tdx-measure/rtmr/rtmr.go b/tools/tdx-measure/rtmr/rtmr.go index a123f29d9a..a6a3fb71a9 100644 --- a/tools/tdx-measure/rtmr/rtmr.go +++ b/tools/tdx-measure/rtmr/rtmr.go @@ -243,23 +243,33 @@ func CalcRtmr0(firmware []byte) ([48]byte, error) { } // CalcRtmr1 calculates RTMR[1] for the given kernel. 
-func CalcRtmr1(kernelFile []byte) ([48]byte, error) { +func CalcRtmr1(kernelFile, initrdFile []byte) ([48]byte, error) { var rtmr Rtmr - kernelHashContent, err := hashKernel(kernelFile) + + kernelHashContent, err := hashKernel(kernelFile, initrdFile) if err != nil { return [48]byte{}, fmt.Errorf("can't hash kernel: %w", err) } rtmr.hashAndExtend(kernelHashContent) + + // https://github.com/tianocore/edk2/blob/0f3867fa6ef0553e26c42f7d71ff6bdb98429742/OvmfPkg/Tcg/TdTcg2Dxe/TdTcg2Dxe.c#L2155 rtmr.hashAndExtend([]byte("Calling EFI Application from Boot Option")) + // https://github.com/tianocore/edk2/blob/0f3867fa6ef0553e26c42f7d71ff6bdb98429742/OvmfPkg/Tcg/TdTcg2Dxe/TdTcg2Dxe.c#L2243 rtmr.hashAndExtend([]byte("Exit Boot Services Invocation")) + // https://github.com/tianocore/edk2/blob/0f3867fa6ef0553e26c42f7d71ff6bdb98429742/OvmfPkg/Tcg/TdTcg2Dxe/TdTcg2Dxe.c#L2254 rtmr.hashAndExtend([]byte("Exit Boot Services Returned with Success")) return rtmr.Get(), nil } -// CalcRtmr2 calculates RTMR[2] for the given kernel command line. -func CalcRtmr2(cmdLine string) ([48]byte, error) { +// CalcRtmr2 calculates RTMR[2] for the given kernel command line and initrd. +func CalcRtmr2(cmdLine string, initrdFile []byte) ([48]byte, error) { var rtmr Rtmr + // TODO(msanft): find out which component silently adds this string to the commandline. 
+ // Suspects: QEMU-TDX, OVMF-TDX, Linux EFI Stub + cmdLine += " initrd=initrd" + + // https://elixir.bootlin.com/linux/v6.11.8/source/drivers/firmware/efi/libstub/efi-stub-helper.c#L342 codepoints := utf16.Encode([]rune(cmdLine)) bytes := make([]byte, (len(codepoints)+1)*2) for i, codepoint := range codepoints { @@ -267,11 +277,14 @@ func CalcRtmr2(cmdLine string) ([48]byte, error) { } rtmr.hashAndExtend(bytes) + // https://elixir.bootlin.com/linux/v6.11.8/source/drivers/firmware/efi/libstub/efi-stub-helper.c#L625 + rtmr.hashAndExtend(initrdFile) + return rtmr.Get(), nil } -func hashKernel(kernelFile []byte) ([]byte, error) { - patchKernel(kernelFile) +func hashKernel(kernelFile, initrdFile []byte) ([]byte, error) { + patchKernel(kernelFile, initrdFile) kernel, err := authenticode.Parse(bytes.NewReader(kernelFile)) if err != nil { @@ -281,7 +294,7 @@ func hashKernel(kernelFile []byte) ([]byte, error) { return kernel.HashContent.Bytes(), nil } -func patchKernel(kernelFile []byte) { +func patchKernel(kernelFile, initrdFile []byte) { // QEMU patches some header bytes in the kernel before loading it into memory. // Sources: // - https://gitlab.com/qemu-project/qemu/-/blob/28ae3179fc52d2e4d870b635c4a412aab99759e7/hw/i386/x86-common.c#L837 @@ -299,4 +312,19 @@ func patchKernel(kernelFile []byte) { kernelFile[0x229] = 0x00 kernelFile[0x22A] = 0x02 kernelFile[0x22B] = 0x00 + + // https://github.com/qemu/qemu/blob/f48c205fb42be48e2e47b7e1cd9a2802e5ca17b0/hw/i386/x86.c#L1036 + // Highest address the initrd may occupy, as calculated by QEMU. Normally, this would depend on the VM + // memory size, but we have a QEMU patch that fixes this value to make RTMR1 reproducible. + // Our QEMU patch has a commented-out line to print this value upon start, so it's easy to find + // when updating QEMU, as the value might change on QEMU updates.
+ initrdMax := 0x7ffd7fff + initrdSize := len(initrdFile) + initrdAddr := (initrdMax - initrdSize) & ^4095 + + // https://github.com/qemu/qemu/blob/f48c205fb42be48e2e47b7e1cd9a2802e5ca17b0/hw/i386/x86.c#L1044 + binary.LittleEndian.PutUint32(kernelFile[0x218:][:4], uint32(initrdAddr)) + + // https://github.com/qemu/qemu/blob/f48c205fb42be48e2e47b7e1cd9a2802e5ca17b0/hw/i386/x86.c#L1045 + binary.LittleEndian.PutUint32(kernelFile[0x21C:][:4], uint32(initrdSize)) }