diff --git a/.gitignore b/.gitignore index 8291286..e600fa6 100644 --- a/.gitignore +++ b/.gitignore @@ -8,5 +8,5 @@ config.yaml content-*/* *.arg .idea - -.DS_Store \ No newline at end of file +hack/*.img +.DS_Store diff --git a/hack/Earthfile b/hack/Earthfile new file mode 100644 index 0000000..c3118fa --- /dev/null +++ b/hack/Earthfile @@ -0,0 +1,16 @@ +VERSION 0.6 + +ARG OSBUILDER_VERSION=v0.7.11 +ARG OSBUILDER_IMAGE=quay.io/kairos/osbuilder-tools:$OSBUILDER_VERSION +ARG ISO_NAME=debug + +# replace with your CanvOS provider image +ARG PROVIDER_IMAGE=oci:tylergillson/ubuntu:k3s-1.26.4-v4.0.4-071c2c23 + +build: + FROM $OSBUILDER_IMAGE + WORKDIR /build + COPY . ./ + + RUN /entrypoint.sh --name $ISO_NAME --debug build-iso --squash-no-compression --date=false $PROVIDER_IMAGE --output /build/ + SAVE ARTIFACT /build/$ISO_NAME.iso kairos.iso AS LOCAL build/$ISO_NAME.iso diff --git a/hack/README.md b/hack/README.md new file mode 100644 index 0000000..e01a8e8 --- /dev/null +++ b/hack/README.md @@ -0,0 +1,17 @@ +# Debugging Kairos + +If you're facing hard-to-diagnose issues with your custom provider image, you can use the scripts in this directory to obtain verbose Kairos output. + +## Steps +1. Use earthly to generate an ISO from your CanvOS provider image: + ``` + earthly +build --PROVIDER_IMAGE=<your-provider-image> # e.g., oci:tylergillson/ubuntu:k3s-1.26.4-v4.0.4-071c2c23 + ``` + If successful, `build/debug.iso` will be created. + +2. Launch a local VM based on the debug ISO using QEMU and pipe all output to a log file: + ``` + ./launch-qemu.sh build/debug.iso | tee out.log + ``` + +3. Once the VM boots, use `reboot` to return to the GRUB menu, then select your desired entry and hit `e` to edit it. Add `rd.debug rd.immucore.debug` to the end of the `linux` line for your selected GRUB menu entry, then hit `CTRL+x` to boot with your edits. You should see verbose Kairos debug logs and they will be persisted to `out.log`.
diff --git a/hack/build/.keep b/hack/build/.keep new file mode 100644 index 0000000..e69de29 diff --git a/hack/launch-qemu.sh b/hack/launch-qemu.sh new file mode 100755 index 0000000..9aaba67 --- /dev/null +++ b/hack/launch-qemu.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Screenshot capability: +# https://unix.stackexchange.com/a/476617 + +if [ ! -e disk.img ]; then + qemu-img create -f qcow2 disk.img 60g +fi + +# -nic bridge,br=br0,model=virtio-net-pci \ +qemu-system-x86_64 \ + -enable-kvm \ + -cpu "${CPU:=host}" \ + -nographic \ + -spice port=9000,addr=127.0.0.1,disable-ticketing=yes \ + -m ${MEMORY:=10096} \ + -smp ${CORES:=5} \ + -monitor unix:/tmp/qemu-monitor.sock,server=on,wait=off \ + -serial mon:stdio \ + -rtc base=utc,clock=rt \ + -chardev socket,path=qga.sock,server=on,wait=off,id=qga0 \ + -device virtio-serial \ + -device virtserialport,chardev=qga0,name=org.qemu.guest_agent.0 \ + -drive if=virtio,media=disk,file=disk.img \ + -drive if=ide,media=cdrom,file="${1}" diff --git a/test/templates/two-node-cluster-profile.json.tmpl b/test/templates/two-node-cluster-profile.json.tmpl new file mode 100644 index 0000000..41542e2 --- /dev/null +++ b/test/templates/two-node-cluster-profile.json.tmpl @@ -0,0 +1,64 @@ +{ + "metadata": { + "name": "_____place_holder_____", + "description": "", + "labels": {} + }, + "spec": { + "version": "1.0.0", + "template": { + "type": "infra", + "cloudType": "edge-native", + "packs": [ + { + "name": "edge-native-byoi", + "type": "spectro", + "layer": "os", + "version": "1.0.0", + "tag": "1.0.0", + "values": "pack:\n content:\n images:\n - image: \"{{.spectro.pack.edge-native-byoi.options.system.uri}}\"\n # Below config is default value, please uncomment if you want to modify default values\n #drain:\n #cordon: true\n #timeout: 60 # The length of time to wait before giving up, zero means infinite\n #gracePeriod: 60 # Period of time in seconds given to each pod to terminate gracefully. 
If negative, the default value specified in the pod will be used\n #ignoreDaemonSets: true\n #deleteLocalData: true # Continue even if there are pods using emptyDir (local data that will be deleted when the node is drained)\n #force: true # Continue even if there are pods that do not declare a controller\n #disableEviction: false # Force drain to use delete, even if eviction is supported. This will bypass checking PodDisruptionBudgets, use with caution\n #skipWaitForDeleteTimeout: 60 # If pod DeletionTimestamp older than N seconds, skip waiting for the pod. Seconds must be greater than 0 to skip.\nstylusPackage: container://OCI_REGISTRY/stylus-linux-amd64:v0.0.0-STYLUS_HASH\noptions:\n system.uri: \"OCI_REGISTRY/ubuntu:k3s-1.26.4-v4.0.4-STYLUS_HASH\"", + "registry": { + "metadata": { + "uid": "_____place_holder_____", + "name": "Public Repo", + "kind": "pack", + "isPrivate": false + } + } + }, + { + "name": "edge-k3s", + "type": "spectro", + "layer": "k8s", + "version": "1.26.4", + "tag": "1.26.4", + "values": "cluster:\n config: |\n flannel-backend: host-gw\n disable-network-policy: false\n disable:\n - traefik\n - local-storage\n - servicelb\n - metrics-server\n\n # configure the pod cidr range\n cluster-cidr: \"192.170.0.0/16\"\n\n # configure service cidr range\n service-cidr: \"192.169.0.0/16\"\n\n # kubeconfig must be in run for the stylus operator to manage the cluster\n write-kubeconfig: /run/kubeconfig\n write-kubeconfig-mode: 600\n\n # additional component settings to harden installation\n kube-apiserver-arg:\n - anonymous-auth=true\n - profiling=false\n - disable-admission-plugins=AlwaysAdmit\n - default-not-ready-toleration-seconds=60\n - default-unreachable-toleration-seconds=60\n - enable-admission-plugins=AlwaysPullImages,NamespaceLifecycle,ServiceAccount,NodeRestriction\n - audit-log-path=/var/log/apiserver/audit.log\n - audit-policy-file=/etc/kubernetes/audit-policy.yaml\n - audit-log-maxage=30\n - audit-log-maxbackup=10\n - audit-log-maxsize=100\n 
- authorization-mode=RBAC,Node\n - tls-cipher-suites=TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_GCM_SHA256\n kube-controller-manager-arg:\n - profiling=false\n - terminated-pod-gc-threshold=25\n - use-service-account-credentials=true\n - feature-gates=RotateKubeletServerCertificate=true\n - node-monitor-period=5s\n - node-monitor-grace-period=20s\n - pod-eviction-timeout=20s\n kube-scheduler-arg:\n - profiling=false\n kubelet-arg:\n - read-only-port=0\n - event-qps=0\n - feature-gates=RotateKubeletServerCertificate=true\n - protect-kernel-defaults=true\n - tls-cipher-suites=TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_GCM_SHA256\n - rotate-server-certificates=true\nstages:\n initramfs:\n - sysctl:\n vm.overcommit_memory: 1\n kernel.panic: 10\n kernel.panic_on_oops: 1\n kernel.printk: \"0 4 0 7\"\n - directories:\n - path: \"/var/log/apiserver\"\n permissions: 0644\n files:\n - path: /etc/hosts\n permission: \"0644\"\n content: |\n 127.0.0.1 localhost\n - path: \"/etc/kubernetes/audit-policy.yaml\"\n owner_string: \"root\"\n permission: 0600\n content: |\n apiVersion: audit.k8s.io/v1\n kind: Policy\n rules:\n - level: None\n users: [\"system:kube-proxy\"]\n verbs: [\"watch\"]\n resources:\n - group: \"\" # core\n resources: [\"endpoints\", \"services\", \"services/status\"]\n - level: None\n users: [\"system:unsecured\"]\n namespaces: [\"kube-system\"]\n verbs: [\"get\"]\n resources:\n - group: \"\" # core\n resources: [\"configmaps\"]\n - level: None\n users: [\"kubelet\"] # legacy kubelet 
identity\n verbs: [\"get\"]\n resources:\n - group: \"\" # core\n resources: [\"nodes\", \"nodes/status\"]\n - level: None\n userGroups: [\"system:nodes\"]\n verbs: [\"get\"]\n resources:\n - group: \"\" # core\n resources: [\"nodes\", \"nodes/status\"]\n - level: None\n users:\n - system:kube-controller-manager\n - system:kube-scheduler\n - system:serviceaccount:kube-system:endpoint-controller\n verbs: [\"get\", \"update\"]\n namespaces: [\"kube-system\"]\n resources:\n - group: \"\" # core\n resources: [\"endpoints\"]\n - level: None\n users: [\"system:apiserver\"]\n verbs: [\"get\"]\n resources:\n - group: \"\" # core\n resources: [\"namespaces\", \"namespaces/status\", \"namespaces/finalize\"]\n - level: None\n users: [\"cluster-autoscaler\"]\n verbs: [\"get\", \"update\"]\n namespaces: [\"kube-system\"]\n resources:\n - group: \"\" # core\n resources: [\"configmaps\", \"endpoints\"]\n # Don't log HPA fetching metrics.\n - level: None\n users:\n - system:kube-controller-manager\n verbs: [\"get\", \"list\"]\n resources:\n - group: \"metrics.k8s.io\"\n # Don't log these read-only URLs.\n - level: None\n nonResourceURLs:\n - /healthz*\n - /version\n - /swagger*\n # Don't log events requests.\n - level: None\n resources:\n - group: \"\" # core\n resources: [\"events\"]\n # node and pod status calls from nodes are high-volume and can be large, don't log responses for expected updates from nodes\n - level: Request\n users: [\"kubelet\", \"system:node-problem-detector\", \"system:serviceaccount:kube-system:node-problem-detector\"]\n verbs: [\"update\",\"patch\"]\n resources:\n - group: \"\" # core\n resources: [\"nodes/status\", \"pods/status\"]\n omitStages:\n - \"RequestReceived\"\n - level: Request\n userGroups: [\"system:nodes\"]\n verbs: [\"update\",\"patch\"]\n resources:\n - group: \"\" # core\n resources: [\"nodes/status\", \"pods/status\"]\n omitStages:\n - \"RequestReceived\"\n # deletecollection calls can be large, don't log responses for expected namespace 
deletions\n - level: Request\n users: [\"system:serviceaccount:kube-system:namespace-controller\"]\n verbs: [\"deletecollection\"]\n omitStages:\n - \"RequestReceived\"\n # Secrets, ConfigMaps, and TokenReviews can contain sensitive \u0026 binary data,\n # so only log at the Metadata level.\n - level: Metadata\n resources:\n - group: \"\" # core\n resources: [\"secrets\", \"configmaps\"]\n - group: authentication.k8s.io\n resources: [\"tokenreviews\"]\n omitStages:\n - \"RequestReceived\"\n # Get repsonses can be large; skip them.\n - level: Request\n verbs: [\"get\", \"list\", \"watch\"]\n resources:\n - group: \"\" # core\n - group: \"admissionregistration.k8s.io\"\n - group: \"apiextensions.k8s.io\"\n - group: \"apiregistration.k8s.io\"\n - group: \"apps\"\n - group: \"authentication.k8s.io\"\n - group: \"authorization.k8s.io\"\n - group: \"autoscaling\"\n - group: \"batch\"\n - group: \"certificates.k8s.io\"\n - group: \"extensions\"\n - group: \"metrics.k8s.io\"\n - group: \"networking.k8s.io\"\n - group: \"policy\"\n - group: \"rbac.authorization.k8s.io\"\n - group: \"settings.k8s.io\"\n - group: \"storage.k8s.io\"\n omitStages:\n - \"RequestReceived\"\n # Default level for known APIs\n - level: RequestResponse\n resources:\n - group: \"\" # core\n - group: \"admissionregistration.k8s.io\"\n - group: \"apiextensions.k8s.io\"\n - group: \"apiregistration.k8s.io\"\n - group: \"apps\"\n - group: \"authentication.k8s.io\"\n - group: \"authorization.k8s.io\"\n - group: \"autoscaling\"\n - group: \"batch\"\n - group: \"certificates.k8s.io\"\n - group: \"extensions\"\n - group: \"metrics.k8s.io\"\n - group: \"networking.k8s.io\"\n - group: \"policy\"\n - group: \"rbac.authorization.k8s.io\"\n - group: \"settings.k8s.io\"\n - group: \"storage.k8s.io\"\n omitStages:\n - \"RequestReceived\"\n # Default level for all other requests.\n - level: Metadata\n omitStages:\n - \"RequestReceived\"\npack:\n palette:\n config:\n oidc:\n identityProvider: noauth", + "registry": { 
+ "metadata": { + "uid": "_____place_holder_____", + "name": "Public Repo", + "kind": "pack", + "isPrivate": false + } + } + }, + { + "name": "cni-custom", + "type": "spectro", + "layer": "cni", + "version": "0.1.0", + "tag": "0.1.0", + "values": "manifests:\n byo-cni:\n contents: |\n apiVersion: v1\n kind: ConfigMap\n metadata:\n name: custom-cni\n data:\n # property-like keys; each key maps to a simple value\n custom-cni: \"byo-cni\"", + "registry": { + "metadata": { + "uid": "_____place_holder_____", + "name": "Public Repo", + "kind": "pack", + "isPrivate": false + } + } + } + ] + } + } +} diff --git a/test/templates/two-node-create.json.tmpl b/test/templates/two-node-create.json.tmpl new file mode 100644 index 0000000..a607d0d --- /dev/null +++ b/test/templates/two-node-create.json.tmpl @@ -0,0 +1,131 @@ +{ + "metadata": { + "annotations": {}, + "name": "_____place_holder_____", + "labels": {} + }, + "spec": { + "cloudConfig": { + "controlPlaneEndpoint": { + "host": "_____place_holder_____", + "type": "IP" + }, + "sshKeys": [ + "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDDYZpId/d19xuzNnbjkWxlTvctATcwYz+Fre3qOUkkrFJljx39pduukR38Pms8oeEPk6B+GBzwARk9xkEK2SUW+B6ZzCVaBXMHzLiuyzYK9mcHLEnSaYIT7njdAqcFzpBamkPkhUAfsWDcsjgnz0Q7Ilmdi42MW1mqR9M+FibB89Qg/EdFxD0J+VtD/MOZfSDPMEQ+azZMgcWRICn1N9Ods3uH8FCF+PAwVEBo19x34P5xqIyZ7QJjFvNoV96Sr8JuUJWXzMJ6R+7HbH5BMceRsDVd+ZUSX5tQDG4nPrWRVdJN3stLtLNADprXV5BSrDaMOqWK034Or4AI+sqTvmHIBy/b0U4dWAQiJWD6QkLG673UG2qwyZ4GJI4D0KkR7Frj2zwtcufnwHop69R36uJn5xkjJUG92B5GbfolbSjzo0PsQ+Q5NKRJDZZ7conw5RkRb4DYrt17D6BZKbw0X5Gd22MdgPPcnjs4JiZTeKXGkM0kDlTD5jjA4nCs6IEQhI1QLiicHLO5algTf1JHyRUgdMbJA0zlVITDtid3cvRup3JpZW9cdxu3NTqsRRauZj33mfpeRLnuJ2y+cLaWBkkAPpjO87/caUezJJ0r3qzXkIXLu4zCe1RRoZfERUlGvLK+LRUC8IadFTGJl6UhJBApe1UydydOakK45uUBAkDYfw== spectro2023" + ], + "staticIp": false + }, + "machinePoolConfig": [ + { + "cloudConfig": { + "edgeHosts": [ + { + "hostUid": "_____place_holder_____", + "nicName": "_____place_holder_____", + "staticIP": "", + 
"isTwoNodeCandidate": true + } + ] + }, + "poolConfig": { + "name": "master-pool", + "labels": [ + "master" + ], + "isControlPlane": true, + "useControlPlaneAsWorker": true, + "taints": [], + "additionalLabels": {}, + "nodeRepaveInterval": 0, + "updateStrategy": { + "type": "RollingUpdateScaleOut" + }, + "machinePoolProperties": { + "archType": "amd64" + }, + "size": 1, + "maxSize": 3, + "minSize": 1 + } + }, + { + "cloudConfig": { + "edgeHosts": [ + { + "hostUid": "_____place_holder_____", + "nicName": "_____place_holder_____", + "staticIP": "", + "isTwoNodeCandidate": true + } + ] + }, + "poolConfig": { + "name": "worker-pool", + "labels": [ + "worker" + ], + "taints": [], + "additionalLabels": {}, + "nodeRepaveInterval": 0, + "updateStrategy": { + "type": "RollingUpdateScaleOut" + }, + "machinePoolProperties": { + "archType": "amd64" + }, + "size": 1, + "maxSize": 3, + "minSize": 1 + } + } + ], + "cloudAccountUid": null, + "edgeHostUid": "", + "profiles": [ + { + "uid": "_____place_holder_____", + "packValues": [ + { + "tag": "1.0.0", + "name": "edge-native-byoi", + "type": "spectro", + "values": "pack:\n content:\n images:\n - image: \"{{.spectro.pack.edge-native-byoi.options.system.uri}}\"\n # Below config is default value, please uncomment if you want to modify default values\n #drain:\n #cordon: true\n #timeout: 60 # The length of time to wait before giving up, zero means infinite\n #gracePeriod: 60 # Period of time in seconds given to each pod to terminate gracefully. If negative, the default value specified in the pod will be used\n #ignoreDaemonSets: true\n #deleteLocalData: true # Continue even if there are pods using emptyDir (local data that will be deleted when the node is drained)\n #force: true # Continue even if there are pods that do not declare a controller\n #disableEviction: false # Force drain to use delete, even if eviction is supported. 
This will bypass checking PodDisruptionBudgets, use with caution\n #skipWaitForDeleteTimeout: 60 # If pod DeletionTimestamp older than N seconds, skip waiting for the pod. Seconds must be greater than 0 to skip.\nstylusPackage: container://OCI_REGISTRY/stylus-linux-amd64:v0.0.0-STYLUS_HASH\noptions:\n system.uri: \"OCI_REGISTRY/ubuntu:k3s-1.26.4-v4.0.4-STYLUS_HASH\"", + "manifests": [] + }, + { + "tag": "1.26.4", + "name": "edge-k3s", + "type": "spectro", + "values": "cluster:\n config: |\n flannel-backend: host-gw\n disable-network-policy: false\n disable:\n - traefik\n - local-storage\n - servicelb\n - metrics-server\n\n # configure the pod cidr range\n cluster-cidr: \"192.170.0.0/16\"\n\n # configure service cidr range\n service-cidr: \"192.169.0.0/16\"\n\n # kubeconfig must be in run for the stylus operator to manage the cluster\n write-kubeconfig: /run/kubeconfig\n write-kubeconfig-mode: 600\n\n # additional component settings to harden installation\n kube-apiserver-arg:\n - anonymous-auth=true\n - profiling=false\n - disable-admission-plugins=AlwaysAdmit\n - default-not-ready-toleration-seconds=60\n - default-unreachable-toleration-seconds=60\n - enable-admission-plugins=AlwaysPullImages,NamespaceLifecycle,ServiceAccount,NodeRestriction\n - audit-log-path=/var/log/apiserver/audit.log\n - audit-policy-file=/etc/kubernetes/audit-policy.yaml\n - audit-log-maxage=30\n - audit-log-maxbackup=10\n - audit-log-maxsize=100\n - authorization-mode=RBAC,Node\n - tls-cipher-suites=TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_GCM_SHA256\n kube-controller-manager-arg:\n - profiling=false\n - terminated-pod-gc-threshold=25\n - use-service-account-credentials=true\n - feature-gates=RotateKubeletServerCertificate=true\n - 
node-monitor-period=5s\n - node-monitor-grace-period=20s\n - pod-eviction-timeout=20s\n kube-scheduler-arg:\n - profiling=false\n kubelet-arg:\n - read-only-port=0\n - event-qps=0\n - feature-gates=RotateKubeletServerCertificate=true\n - protect-kernel-defaults=true\n - tls-cipher-suites=TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_GCM_SHA256\n - rotate-server-certificates=true\nstages:\n initramfs:\n - sysctl:\n vm.overcommit_memory: 1\n kernel.panic: 10\n kernel.panic_on_oops: 1\n kernel.printk: \"0 4 0 7\"\n - directories:\n - path: \"/var/log/apiserver\"\n permissions: 0644\n files:\n - path: /etc/hosts\n permission: \"0644\"\n content: |\n 127.0.0.1 localhost\n - path: \"/etc/kubernetes/audit-policy.yaml\"\n owner_string: \"root\"\n permission: 0600\n content: |\n apiVersion: audit.k8s.io/v1\n kind: Policy\n rules:\n - level: None\n users: [\"system:kube-proxy\"]\n verbs: [\"watch\"]\n resources:\n - group: \"\" # core\n resources: [\"endpoints\", \"services\", \"services/status\"]\n - level: None\n users: [\"system:unsecured\"]\n namespaces: [\"kube-system\"]\n verbs: [\"get\"]\n resources:\n - group: \"\" # core\n resources: [\"configmaps\"]\n - level: None\n users: [\"kubelet\"] # legacy kubelet identity\n verbs: [\"get\"]\n resources:\n - group: \"\" # core\n resources: [\"nodes\", \"nodes/status\"]\n - level: None\n userGroups: [\"system:nodes\"]\n verbs: [\"get\"]\n resources:\n - group: \"\" # core\n resources: [\"nodes\", \"nodes/status\"]\n - level: None\n users:\n - system:kube-controller-manager\n - system:kube-scheduler\n - system:serviceaccount:kube-system:endpoint-controller\n verbs: [\"get\", \"update\"]\n namespaces: [\"kube-system\"]\n resources:\n - group: \"\" # core\n resources: [\"endpoints\"]\n - level: 
None\n users: [\"system:apiserver\"]\n verbs: [\"get\"]\n resources:\n - group: \"\" # core\n resources: [\"namespaces\", \"namespaces/status\", \"namespaces/finalize\"]\n - level: None\n users: [\"cluster-autoscaler\"]\n verbs: [\"get\", \"update\"]\n namespaces: [\"kube-system\"]\n resources:\n - group: \"\" # core\n resources: [\"configmaps\", \"endpoints\"]\n # Don't log HPA fetching metrics.\n - level: None\n users:\n - system:kube-controller-manager\n verbs: [\"get\", \"list\"]\n resources:\n - group: \"metrics.k8s.io\"\n # Don't log these read-only URLs.\n - level: None\n nonResourceURLs:\n - /healthz*\n - /version\n - /swagger*\n # Don't log events requests.\n - level: None\n resources:\n - group: \"\" # core\n resources: [\"events\"]\n # node and pod status calls from nodes are high-volume and can be large, don't log responses for expected updates from nodes\n - level: Request\n users: [\"kubelet\", \"system:node-problem-detector\", \"system:serviceaccount:kube-system:node-problem-detector\"]\n verbs: [\"update\",\"patch\"]\n resources:\n - group: \"\" # core\n resources: [\"nodes/status\", \"pods/status\"]\n omitStages:\n - \"RequestReceived\"\n - level: Request\n userGroups: [\"system:nodes\"]\n verbs: [\"update\",\"patch\"]\n resources:\n - group: \"\" # core\n resources: [\"nodes/status\", \"pods/status\"]\n omitStages:\n - \"RequestReceived\"\n # deletecollection calls can be large, don't log responses for expected namespace deletions\n - level: Request\n users: [\"system:serviceaccount:kube-system:namespace-controller\"]\n verbs: [\"deletecollection\"]\n omitStages:\n - \"RequestReceived\"\n # Secrets, ConfigMaps, and TokenReviews can contain sensitive & binary data,\n # so only log at the Metadata level.\n - level: Metadata\n resources:\n - group: \"\" # core\n resources: [\"secrets\", \"configmaps\"]\n - group: authentication.k8s.io\n resources: [\"tokenreviews\"]\n omitStages:\n - \"RequestReceived\"\n # Get repsonses can be large; skip them.\n - 
level: Request\n verbs: [\"get\", \"list\", \"watch\"]\n resources:\n - group: \"\" # core\n - group: \"admissionregistration.k8s.io\"\n - group: \"apiextensions.k8s.io\"\n - group: \"apiregistration.k8s.io\"\n - group: \"apps\"\n - group: \"authentication.k8s.io\"\n - group: \"authorization.k8s.io\"\n - group: \"autoscaling\"\n - group: \"batch\"\n - group: \"certificates.k8s.io\"\n - group: \"extensions\"\n - group: \"metrics.k8s.io\"\n - group: \"networking.k8s.io\"\n - group: \"policy\"\n - group: \"rbac.authorization.k8s.io\"\n - group: \"settings.k8s.io\"\n - group: \"storage.k8s.io\"\n omitStages:\n - \"RequestReceived\"\n # Default level for known APIs\n - level: RequestResponse\n resources:\n - group: \"\" # core\n - group: \"admissionregistration.k8s.io\"\n - group: \"apiextensions.k8s.io\"\n - group: \"apiregistration.k8s.io\"\n - group: \"apps\"\n - group: \"authentication.k8s.io\"\n - group: \"authorization.k8s.io\"\n - group: \"autoscaling\"\n - group: \"batch\"\n - group: \"certificates.k8s.io\"\n - group: \"extensions\"\n - group: \"metrics.k8s.io\"\n - group: \"networking.k8s.io\"\n - group: \"policy\"\n - group: \"rbac.authorization.k8s.io\"\n - group: \"settings.k8s.io\"\n - group: \"storage.k8s.io\"\n omitStages:\n - \"RequestReceived\"\n # Default level for all other requests.\n - level: Metadata\n omitStages:\n - \"RequestReceived\"\npack:\n palette:\n config:\n oidc:\n identityProvider: noauth", + "manifests": [] + }, + { + "tag": "0.1.0", + "name": "cni-custom", + "type": "spectro", + "values": "manifests:\n byo-cni:\n contents: |\n apiVersion: v1\n kind: ConfigMap\n metadata:\n name: custom-cni\n data:\n # property-like keys; each key maps to a simple value\n custom-cni: \"byo-cni\"", + "manifests": [] + } + ] + } + ], + "policies": { + "scanPolicy": {} + }, + "clusterConfig": { + "machineManagementConfig": { + "osPatchConfig": { + "schedule": "", + "patchOnBoot": false, + "rebootIfRequired": false + } + }, + "updateWorkerPoolsInParallel": 
true, + "resources": { + "namespaces": [], + "rbacs": [] + }, + "location": null + } + } +} \ No newline at end of file diff --git a/test/test-two-node.sh b/test/test-two-node.sh new file mode 100755 index 0000000..ff50714 --- /dev/null +++ b/test/test-two-node.sh @@ -0,0 +1,486 @@ +#!/bin/bash + +set -e + +# Usage +# ----- +# +# 1. Install prerequisites: +# - docker (https://docs.docker.com/engine/install/) +# - earthly (https://earthly.dev/get-earthly) +# - git (https://github.com/git-guides/install-git) +# - govc (https://github.com/vmware/govmomi/blob/main/govc/README.md#installation) +# - jq (https://jqlang.github.io/jq/download/) +# - mkisofs (https://command-not-found.com/mkisofs) +# +# 2. Clone CanvOS and check out this branch. +# +# 3. Create a .netrc file in the CanvOS repo root with GitHub +# credentials capable of cloning Spectro Cloud internal repos +# (required for building stylus). +# +# 4. Edit the global variables below as needed. +# +# 5. Source and execute this script: +# +# source ./test/test-two-node.sh +# ./test/test-two-node.sh + +# Edit these variables + +# govc vars +export GOVC_USERNAME=@vsphere.local +export GOVC_PASSWORD= +export GOVC_URL=10.10.128.10 +export GOVC_INSECURE=true +export GOVC_DATACENTER=Datacenter +export GOVC_DATASTORE=vsanDatastore2 +export GOVC_NETWORK=VM-NETWORK +export GOVC_RESOURCE_POOL= +export GOVC_FOLDER= + +# vSphere vars +export HOST_SUFFIX= # required to ensure unique edge host IDs +export ISO_FOLDER= # e.g.
"ISO/01-tyler" +export STYLUS_ISO="${ISO_FOLDER}/stylus-dev-amd64.iso" +export NIC_NAME=ens160 + +# palette vars +export API_KEY= +export PROJECT_UID= +export EDGE_REGISTRATION_TOKEN= +export DOMAIN=dev.spectrocloud.com +export PUBLIC_PACK_REPO_UID= # this varies per Palette tenant +export CLUSTER_NAME=two-node +export CLUSTER_PROFILE_UID= # if left blank, a cluster profile will be created +export CLUSTER_VIP= # choose an unassigned VIP + +# image vars +export OCI_REGISTRY=ttl.sh + +# Do not edit anything below + +declare -a vm_array=("two-node-one-$HOST_SUFFIX" "two-node-two-$HOST_SUFFIX") +export HOST_1="${vm_array[0]}-$HOST_SUFFIX" +export HOST_2="${vm_array[1]}-$HOST_SUFFIX" + +function create_canvos_args() { +cat <<EOF > .arg +CUSTOM_TAG=twonode +IMAGE_REGISTRY=$OCI_REGISTRY +OS_DISTRIBUTION=ubuntu +IMAGE_REPO=ubuntu +OS_VERSION=22 +K8S_DISTRIBUTION=k3s +ISO_NAME=palette-edge-installer +ARCH=amd64 +HTTPS_PROXY= +HTTP_PROXY= +PROXY_CERT_PATH= +UPDATE_KERNEL=false +EOF +} + +function create_userdata() { +cat <<EOF > build/user-data +#cloud-config +cluster: + providerConfig: + cluster-init: "no" +stylus: + site: + edgeHostToken: "$EDGE_REGISTRATION_TOKEN" + name: "$1-$HOST_SUFFIX" + paletteEndpoint: "$DOMAIN" + debug: true + twoNode: + enabled: true + livenessSeconds: 15 +install: + poweroff: true +users: + - name: kairos + passwd: kairos +EOF +echo "created build/user-data" +} + +function create_iso() { + touch meta-data + mkisofs -output build/user-data-$2.iso -volid cidata -joliet -rock $1 meta-data + rm -f meta-data +} + +function create_userdata_isos() { + echo Creating user-data ISOs... + for vm in "${vm_array[@]}"; do + create_userdata $vm + create_iso build/user-data $vm + done +} + +function upload_userdata_isos() { + echo Uploading user-data ISOs...
+ for vm in "${vm_array[@]}"; do + govc datastore.upload --ds=$GOVC_DATASTORE --dc=$GOVC_DATACENTER "build/user-data-${vm}.iso" "${ISO_FOLDER}/user-data-${vm}.iso" + done +} + +function upload_stylus_iso() { + iso=palette-edge-installer-stylus-${STYLUS_HASH}-k3s-${PROVIDER_K3S_HASH}.iso + echo Uploading installer ISO $iso... + govc datastore.upload --ds=$GOVC_DATASTORE --dc=$GOVC_DATACENTER build/$iso $STYLUS_ISO +} + +function create_vms() { + echo Creating VMs... + for vm in "${vm_array[@]}"; do + govc vm.create -m 8192 -c 4 -disk 100GB -net.adapter vmxnet3 -iso=$STYLUS_ISO -on=false -pool=$GOVC_RESOURCE_POOL $vm + dev=$(govc device.cdrom.add -vm $vm) + govc device.cdrom.insert -vm=$vm -device=$dev "${ISO_FOLDER}/user-data-${vm}.iso" + govc vm.power -on $vm + done +} + +function destroy_vms() { + for vm in "${vm_array[@]}"; do + govc vm.destroy $vm + done +} + +function wait_for_vms_to_power_off() { + echo Waiting for both VMs to be flashed and power off... + while true; do + powerState1=$(govc vm.info -json=true "${vm_array[0]}" | jq -r .[][0].runtime.powerState) + powerState2=$(govc vm.info -json=true "${vm_array[1]}" | jq -r .[][0].runtime.powerState) + if [ "$powerState1" = "poweredOff" ] && [ "$powerState2" = "poweredOff" ]; then + echo VMs powered off! + break + fi + echo "VMs not powered off, sleeping for 5s..." + sleep 5 + done +} + +function reboot_vms() { + echo "Ejecting installer ISO & rebooting VMs..." + for vm in "${vm_array[@]}"; do + govc device.ls -vm=$vm + govc vm.power -off -force $vm + govc device.cdrom.eject -vm=$vm -device=cdrom-3000 + govc vm.power -on $vm + done +} + +function wait_until_edge_hosts_ready() { + echo Waiting for both Edge Hosts to register and become healthy... 
+    while true; do
+        set +e
+        ready=$(curl -s -X POST https://$DOMAIN/v1/dashboard/edgehosts/search \
+            -H "ApiKey: $API_KEY" \
+            -H "Content-Type: application/json" \
+            -H "ProjectUid: $PROJECT_UID" \
+            -d \
+            '
+            {
+                "filter": {
+                    "conjuction": "and",
+                    "filterGroups": [
+                        {
+                            "conjunction": "and",
+                            "filters": [
+                                {
+                                    "property": "state",
+                                    "type": "string",
+                                    "condition": {
+                                        "string": {
+                                            "operator": "eq",
+                                            "negation": false,
+                                            "match": {
+                                                "conjunction": "or",
+                                                "values": [
+                                                    "ready",
+                                                    "unpaired"
+                                                ]
+                                            },
+                                            "ignoreCase": false
+                                        }
+                                    }
+                                }
+                            ]
+                        }
+                    ]
+                },
+                "sort": []
+            }
+            ' | jq -e 'select(.items != []).items | map(. | select(.status.health.state == "healthy")) | length')
+        set -e
+        if [ -z ${ready} ]; then
+            ready=0
+        fi
+        if [ $ready = 2 ]; then
+            echo Both Edge Hosts are healthy!
+            break
+        fi
+        echo "Only $ready/2 Edge Hosts are healthy, sleeping for 5s..."
+        sleep 5
+    done
+}
+
+# Delete every Edge Host returned by the edgehosts search below, i.e. those
+# whose "state" property equals "ready" or "unpaired".
+#
+# Globals read:    DOMAIN, API_KEY, PROJECT_UID
+# Globals written: edgeHosts (array of the UIDs found), host
+# Output:          the body of each DELETE response, followed by one
+#                  "Deleted Edge Host <uid>" line per request
+#
+# curl runs without --fail, so the "Deleted" line is printed even when the API
+# answers with an HTTP error status.
+# NOTE(review): the top-level "conjuction" key is spelled exactly as in the
+# original request; presumably that is the field name the API expects -
+# confirm before "correcting" it.
+function destroy_edge_hosts() {
+    # '.items[]?' and '// empty' keep a response without items (or with a
+    # null uid) from producing a jq error or a literal "null" host id.
+    readarray -t edgeHosts < <(curl -s -X POST "https://$DOMAIN/v1/dashboard/edgehosts/search" \
+        -H "ApiKey: $API_KEY" \
+        -H "Content-Type: application/json" \
+        -H "ProjectUid: $PROJECT_UID" \
+        -d \
+        '
+        {
+            "filter": {
+                "conjuction": "and",
+                "filterGroups": [
+                    {
+                        "conjunction": "and",
+                        "filters": [
+                            {
+                                "property": "state",
+                                "type": "string",
+                                "condition": {
+                                    "string": {
+                                        "operator": "eq",
+                                        "negation": false,
+                                        "match": {
+                                            "conjunction": "or",
+                                            "values": [
+                                                "ready",
+                                                "unpaired"
+                                            ]
+                                        },
+                                        "ignoreCase": false
+                                    }
+                                }
+                            }
+                        ]
+                    }
+                ]
+            },
+            "sort": []
+        }
+        ' | jq -r '.items[]?.metadata.uid // empty')
+    for host in "${edgeHosts[@]}"; do
+        curl -s -X DELETE "https://$DOMAIN/v1/edgehosts/$host" \
+            -H "ApiKey: $API_KEY" \
+            -H "Content-Type: application/json" \
+            -H "ProjectUid: $PROJECT_UID"
+        echo "Deleted Edge Host $host"
+    done
+}
+
+# Render test/templates/two-node-cluster-profile.json.tmpl into
+# ./two-node-cluster-profile.json.
+#
+# jq sets the profile name, sets the registry UID of packs 0-2 to
+# PUBLIC_PACK_REPO_UID and replaces the literal placeholders OCI_REGISTRY and
+# STYLUS_HASH inside the values of pack 0.
+#
+# Globals read: CLUSTER_NAME, PUBLIC_PACK_REPO_UID, OCI_REGISTRY, STYLUS_HASH.
+#               jq reads them through `env`, so they must be exported.
+# Exits 1 when STYLUS_HASH is empty.
+function prepare_cluster_profile() {
+    if [ -z "${STYLUS_HASH:-}" ]; then
+        echo STYLUS_HASH is unset. Please execute build_all and retry.
+        exit 1
+    fi
+    jq '
+        .metadata.name = env.CLUSTER_NAME |
+        .spec.template.packs[0].registry.metadata.uid = env.PUBLIC_PACK_REPO_UID |
+        .spec.template.packs[1].registry.metadata.uid = env.PUBLIC_PACK_REPO_UID |
+        .spec.template.packs[2].registry.metadata.uid = env.PUBLIC_PACK_REPO_UID |
+        .spec.template.packs[0].values |= gsub("OCI_REGISTRY"; env.OCI_REGISTRY) |
+        .spec.template.packs[0].values |= gsub("STYLUS_HASH"; env.STYLUS_HASH)
+    ' test/templates/two-node-cluster-profile.json.tmpl > two-node-cluster-profile.json
+}
+
+# POST ./two-node-cluster-profile.json to /v1/clusterprofiles/import?publish=true
+# and export the ".uid" of the response as CLUSTER_PROFILE_UID.
+#
+# Globals read:    DOMAIN, API_KEY, PROJECT_UID
+# Globals written: CLUSTER_PROFILE_UID (exported)
+# The rendered JSON file is removed whether or not the import succeeded.
+# Exits 1 when the response yields no UID (empty output or "null").
+function create_cluster_profile() {
+    # Assign first and export afterwards so a failing curl/jq is not hidden
+    # behind the exit status of `export`; the URL is quoted because it
+    # contains the glob character '?'.
+    CLUSTER_PROFILE_UID=$(curl -s -X POST "https://$DOMAIN/v1/clusterprofiles/import?publish=true" \
+        -H "ApiKey: $API_KEY" \
+        -H "Content-Type: application/json" \
+        -H "ProjectUid: $PROJECT_UID" \
+        -d @two-node-cluster-profile.json | jq -r .uid) || CLUSTER_PROFILE_UID=""
+    export CLUSTER_PROFILE_UID
+    rm -f two-node-cluster-profile.json
+    if [ -z "$CLUSTER_PROFILE_UID" ]; then
+        echo "Cluster Profile creation failed: no UID could be read from the API response."
+        exit 1
+    fi
+    if [ "$CLUSTER_PROFILE_UID" = "null" ]; then
+        echo Cluster Profile creation failed as it already exists. Please delete it and retry.
+        exit 1
+    fi
+    echo "Cluster Profile $CLUSTER_PROFILE_UID created"
+}
+
+# Send DELETE /v1/clusterprofiles/$CLUSTER_PROFILE_UID.
+#
+# Globals read: DOMAIN, API_KEY, PROJECT_UID, CLUSTER_PROFILE_UID
+# Returns 1 without calling the API when CLUSTER_PROFILE_UID is empty, since
+# the request would otherwise be sent to the collection URL.
+function destroy_cluster_profile() {
+    if [ -z "${CLUSTER_PROFILE_UID:-}" ]; then
+        echo "CLUSTER_PROFILE_UID is unset. Nothing to delete."
+        return 1
+    fi
+    curl -s -X DELETE "https://$DOMAIN/v1/clusterprofiles/$CLUSTER_PROFILE_UID" \
+        -H "ApiKey: $API_KEY" \
+        -H "Content-Type: application/json" \
+        -H "ProjectUid: $PROJECT_UID"
+    echo "Cluster Profile $CLUSTER_PROFILE_UID deleted"
+}
+
+# Render test/templates/two-node-create.json.tmpl into ./two-node-create.json.
+#
+# jq fills in the cluster name, the control plane endpoint host (CLUSTER_VIP),
+# the host UID and NIC name of the first edge host in machine pools 0 and 1,
+# the profile UID, and replaces the literal placeholders OCI_REGISTRY and
+# STYLUS_HASH inside the first pack's values.
+#
+# Globals read: STYLUS_HASH, CLUSTER_VIP, CLUSTER_NAME, HOST_1, HOST_2,
+#               NIC_NAME, CLUSTER_PROFILE_UID, OCI_REGISTRY.
+#               jq reads them through `env`, so they must be exported.
+# Exits 1 when STYLUS_HASH or CLUSTER_VIP is empty, or when nslookup succeeds
+# for CLUSTER_VIP.
+function prepare_cluster() {
+    if [ -z "${STYLUS_HASH:-}" ]; then
+        echo STYLUS_HASH is unset. Please execute build_all and retry.
+        exit 1
+    fi
+    # Without an argument nslookup starts its interactive mode and waits on
+    # stdin, so an empty CLUSTER_VIP must be rejected before the lookup.
+    if [ -z "${CLUSTER_VIP:-}" ]; then
+        echo "CLUSTER_VIP is unset. Please set it to an unallocated VIP and retry."
+        exit 1
+    fi
+    if nslookup "$CLUSTER_VIP" >/dev/null; then
+        echo "CLUSTER_VIP: $CLUSTER_VIP is allocated. Please retry with an unallocated VIP."
+        exit 1
+    fi
+    jq '
+        .metadata.name = env.CLUSTER_NAME |
+        .spec.cloudConfig.controlPlaneEndpoint.host = env.CLUSTER_VIP |
+        .spec.machinePoolConfig[0].cloudConfig.edgeHosts[0].hostUid = env.HOST_1 |
+        .spec.machinePoolConfig[0].cloudConfig.edgeHosts[0].nicName = env.NIC_NAME |
+        .spec.machinePoolConfig[1].cloudConfig.edgeHosts[0].hostUid = env.HOST_2 |
+        .spec.machinePoolConfig[1].cloudConfig.edgeHosts[0].nicName = env.NIC_NAME |
+        .spec.profiles[0].uid = env.CLUSTER_PROFILE_UID |
+        .spec.profiles[0].packValues[0].values |= gsub("OCI_REGISTRY"; env.OCI_REGISTRY) |
+        .spec.profiles[0].packValues[0].values |= gsub("STYLUS_HASH"; env.STYLUS_HASH)
+    ' test/templates/two-node-create.json.tmpl > two-node-create.json
+}
+
+# POST ./two-node-create.json to /v1/spectroclusters/edge-native and print the
+# ".uid" of the response.
+#
+# Globals read:    DOMAIN, API_KEY, PROJECT_UID
+# Globals written: uid (left global, as in the original)
+# The rendered JSON file is removed whether or not the request succeeded.
+# Returns 1 when the response yields no UID (empty output or "null").
+function create_cluster() {
+    # The URL is quoted because it contains the glob character '?'.
+    uid=$(curl -s -X POST "https://$DOMAIN/v1/spectroclusters/edge-native?ProjectUid=$PROJECT_UID" \
+        -H "ApiKey: $API_KEY" \
+        -H "Content-Type: application/json" \
+        -H "ProjectUid: $PROJECT_UID" \
+        -d @two-node-create.json | jq -r .uid) || uid=""
+    rm -f two-node-create.json
+    if [ -z "$uid" ] || [ "$uid" = "null" ]; then
+        echo "Cluster creation failed: no UID could be read from the API response."
+        return 1
+    fi
+    echo "Cluster $uid created"
+}
+
+# PATCH /v1/spectroclusters/<uid>/status/conditions with a single
+# "CloudInfrastructureCleanedUp" condition whose status is "True".
+#
+# Arguments:       $1 - UID of the cluster
+# Globals read:    DOMAIN, API_KEY, PROJECT_UID
+# Globals written: clusterUid
+# Returns 1 without calling the API when no UID is given.
+# NOTE(review): the function only sends the PATCH and prints the "deleted"
+# message; presumably the condition is what lets the cluster deletion
+# complete on the server side - confirm.
+function destroy_cluster() {
+    clusterUid=${1:-}
+    if [ -z "$clusterUid" ]; then
+        echo "Usage: destroy_cluster <cluster-uid>"
+        return 1
+    fi
+    curl -s -X PATCH "https://$DOMAIN/v1/spectroclusters/$clusterUid/status/conditions" \
+        -H "ApiKey: $API_KEY" \
+        -H "Content-Type: application/json" \
+        -H "ProjectUid: $PROJECT_UID" \
+        -d \
+        '
+        [
+            {
+                "message": "cleaned up",
+                "reason": "CloudInfrastructureCleanedUp",
+                "status": "True",
+                "type": "CloudInfrastructureCleanedUp"
+            }
+        ]
+        '
+    echo "Cluster $clusterUid deleted"
+}
+
+# Run the +build-provider-package earthly target in the current directory and
+# push ${OCI_REGISTRY}/provider-k3s:${PROVIDER_K3S_HASH}.
+#
+# Globals read: OCI_REGISTRY, PROVIDER_K3S_HASH
+function build_provider_k3s() {
+    echo "Building provider-k3s image..."
+    earthly +build-provider-package \
+        --platform=linux/amd64 \
+        --IMAGE_REPOSITORY="${OCI_REGISTRY}" \
+        --VERSION="${PROVIDER_K3S_HASH}"
+    docker push "${OCI_REGISTRY}/provider-k3s:${PROVIDER_K3S_HASH}"
+}
+
+# Run the +package earthly target in the current directory and push the
+# stylus-linux-amd64 and stylus-framework-linux-amd64 images, both tagged
+# v0.0.0-${STYLUS_HASH}.
+#
+# Globals read: OCI_REGISTRY, STYLUS_HASH
+function build_stylus_package_and_framework() {
+    echo "Building stylus image and stylus framework image..."
+    earthly --allow-privileged +package \
+        --platform=linux/amd64 \
+        --IMAGE_REPOSITORY="${OCI_REGISTRY}" \
+        --BASE_IMAGE=quay.io/kairos/core-opensuse-leap:v2.3.2 \
+        --VERSION="v0.0.0-${STYLUS_HASH}"
+    docker push "${OCI_REGISTRY}/stylus-linux-amd64:v0.0.0-${STYLUS_HASH}"
+    docker push "${OCI_REGISTRY}/stylus-framework-linux-amd64:v0.0.0-${STYLUS_HASH}"
+}
+
+# Run the +build-all-images earthly target in the current directory with
+# TWO_NODE=true, using the provider-k3s and stylus-framework images named
+# after PROVIDER_K3S_HASH / STYLUS_HASH, then push the two ubuntu:k3s-* tags.
+#
+# Globals read: OCI_REGISTRY, PROVIDER_K3S_HASH, STYLUS_HASH
+function build_canvos() {
+    echo "Building provider image & installer ISO..."
+    earthly +build-all-images \
+        --ARCH=amd64 \
+        --PROVIDER_BASE="${OCI_REGISTRY}/provider-k3s:${PROVIDER_K3S_HASH}" \
+        --STYLUS_BASE="${OCI_REGISTRY}/stylus-framework-linux-amd64:v0.0.0-${STYLUS_HASH}" \
+        --ISO_NAME="palette-edge-installer-stylus-${STYLUS_HASH}-k3s-${PROVIDER_K3S_HASH}" \
+        --IMAGE_REGISTRY="${OCI_REGISTRY}" \
+        --TWO_NODE=true \
+        --CUSTOM_TAG="${STYLUS_HASH}"
+    docker push "${OCI_REGISTRY}/ubuntu:k3s-1.26.4-v4.0.4-${STYLUS_HASH}"
+    docker push "${OCI_REGISTRY}/ubuntu:k3s-1.27.2-v4.0.4-${STYLUS_HASH}"
+}
+
+# Build whichever of the three artifact sets is missing locally.
+#
+# For provider-k3s and stylus: clone the repository next to the current
+# directory if it is absent, cd into it, export `git describe --always` as
+# PROVIDER_K3S_HASH / STYLUS_HASH, and build only when `docker image ls` does
+# not list the image tagged with that hash. Then cd to ../CanvOS and build
+# when either the installer ISO or the ubuntu:k3s-1.26.4 image is missing.
+#
+# Globals read:    OCI_REGISTRY
+# Globals written: PROVIDER_K3S_HASH, STYLUS_HASH (both exported)
+# Side effect:     the working directory is ../CanvOS (relative to the stylus
+#                  checkout) when the function returns.
+# Returns 1 when a cd or `git describe` fails, so that a hash is never taken
+# from the wrong repository.
+function build_all() {
+
+    # optionally build/rebuild provider-k3s
+    test -d ../provider-k3s || ( cd .. && git clone https://github.com/kairos-io/provider-k3s -b two-node )
+    cd ../provider-k3s || return 1
+    PROVIDER_K3S_HASH=$(git describe --always) || return 1
+    export PROVIDER_K3S_HASH
+    # -F: the image reference contains dots and must not be read as a regex
+    if ! docker image ls --format "{{.Repository}}:{{.Tag}}" | \
+        grep -qF -- "${OCI_REGISTRY}/provider-k3s:${PROVIDER_K3S_HASH}"; then
+        build_provider_k3s
+    fi
+
+    # optionally build/rebuild stylus images
+    test -d ../stylus || ( cd .. && git clone https://github.com/spectrocloud/stylus -b 2-node-health-checks )
+    cd ../stylus || return 1
+    STYLUS_HASH=$(git describe --always) || return 1
+    export STYLUS_HASH
+    if ! docker image ls --format "{{.Repository}}:{{.Tag}}" | \
+        grep -qF -- "${OCI_REGISTRY}/stylus-linux-amd64:v0.0.0-${STYLUS_HASH}"; then
+        build_stylus_package_and_framework
+    fi
+
+    # optionally build/rebuild provider image & installer ISO
+    cd ../CanvOS || return 1
+    if [ ! -f "build/palette-edge-installer-stylus-${STYLUS_HASH}-k3s-${PROVIDER_K3S_HASH}.iso" ] || \
+        ! docker image ls --format "{{.Repository}}:{{.Tag}}" | \
+            grep -qF -- "${OCI_REGISTRY}/ubuntu:k3s-1.26.4-v4.0.4-${STYLUS_HASH}"; then
+        build_canvos
+    fi
+}
+
+# Force-remove the docker images whose `docker images` line contains
+# $OCI_REGISTRY or "palette-installer", kill the earthly-buildkitd container,
+# remove the earthly-cache volume and prune containers, volumes and the
+# docker system.
+#
+# Globals read: OCI_REGISTRY
+# Returns 1 without touching anything when OCI_REGISTRY is empty: an empty
+# pattern would match every image.
+function clean_all() {
+    if [ -z "${OCI_REGISTRY:-}" ]; then
+        echo "OCI_REGISTRY is unset. Refusing to remove images."
+        return 1
+    fi
+    # xargs -r: do not run `docker rmi` at all when nothing matched
+    docker images | grep -F -- "$OCI_REGISTRY" | awk '{print $3;}' | xargs -r docker rmi --force
+    docker images | grep palette-installer | awk '{print $3;}' | xargs -r docker rmi --force
+    docker kill earthly-buildkitd
+    docker container prune --force
+    docker volume rm earthly-cache
+    docker volume prune --force
+    docker system prune --force
+}
+
+# End-to-end flow: build the artifacts, upload the ISOs, create and reboot the
+# VMs, wait for the Edge Hosts, then create the Cluster Profile (unless
+# CLUSTER_PROFILE_UID is already set) and the cluster.
+# The helpers called here that are not defined in this part of the script
+# (upload_stylus_iso, create_userdata_isos, upload_userdata_isos, create_vms,
+# wait_for_vms_to_power_off, reboot_vms) are expected to be defined elsewhere
+# in the file.
+function main() {
+
+    # build all required edge artifacts
+    build_all
+
+    # upload installer ISO to vSphere
+    upload_stylus_iso
+
+    # create & upload user-data ISOs, configured to enable two node mode
+    create_userdata_isos
+    upload_userdata_isos
+
+    # create VMs in vSphere, wait for the installation phase to complete,
+    # then power them off, remove the installer ISO, and reboot them
+    create_vms
+    wait_for_vms_to_power_off
+    reboot_vms
+
+    # wait for the VMs to register with Palette and appear as Edge Hosts
+    wait_until_edge_hosts_ready
+
+    # optionally create a two node Cluster Profile using the latest artifact
+    # versions - can be skipped by specifying the UID
+    if [ -z "${CLUSTER_PROFILE_UID}" ]; then
+        prepare_cluster_profile
+        create_cluster_profile
+    fi
+
+    # create a new Edge Native cluster in Palette using the Edge Hosts
+    # provisioned above, plus the two node Cluster Profile
+    prepare_cluster
+    create_cluster
+}
+
+# This line and the if condition below allow sourcing
the script without executing +# the main function +(return 0 2>/dev/null) && sourced=1 || sourced=0 + +if [[ $sourced == 1 ]]; then + set +e + echo "You can now use any of these functions:" + echo "" + grep ^function ${BASH_SOURCE[0]} | grep -v main | awk '{gsub(/function /,""); gsub(/\(\) \{/,""); print;}' + echo +else + main +fi diff --git a/user-data.template b/user-data.template index d963177..bb2d503 100644 --- a/user-data.template +++ b/user-data.template @@ -1,12 +1,4 @@ #cloud-config - -# This is currently required to enable a two node cluster -# should be removed in the future and only the twoNode section -# in stylus should control this. -cluster: - env: - two-node: "true" - stylus: site: # host for hubble api to register device. @@ -22,11 +14,6 @@ stylus: key1: value1 key2: value2 key3: value3 - # enable two node cluster and add a custom health check script - # see also Dockerfile on how to add this script to the image - twoNode: - enabled: true - health-check-script: /opt/spectrocloud/bin/custom-health-check.sh # name of the device, this may also be referred to as the edge id or edge host id. If no edge host name is specified # one will be generated from the device serial number. If stylus cannot the device serial number a random id will