From 9dda19c47731f6ce371d2377b879e12db31531aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20=C5=9Awi=C4=85tek?= Date: Thu, 26 Oct 2023 15:09:23 +0200 Subject: [PATCH] feat(operator): add proper health checks Use dedicated controller-runtime facilities to create health and readiness checks for the webhook server. Unfortunately, it seems like this doesn't solve the webhook unavailability issue right after starting the operator. I've added an explicit 10-second wait to the E2E tests for now; we'll have to see if that's enough to fix the problem. --- Makefile | 7 ++++++ .../templates/resources.yaml | 25 +++++++------------ kuttl-test-helm-certmanager.yaml | 3 +-- kuttl-test-helm-custom-configuration.yaml | 3 +-- kuttl-test-helm.yaml | 4 +-- kuttl-test.yaml | 2 +- .../config/default/manager_webhook_patch.yaml | 25 +++++++------------ operator/main.go | 24 ++++++++++++++---- 8 files changed, 49 insertions(+), 44 deletions(-) diff --git a/Makefile b/Makefile index a1d48c14..b06e35c3 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,13 @@ e2e-helm-certmanager: e2e e2e-helm-custom-configuration: KUTTL_CONFIG = kuttl-test-helm-custom-configuration.yaml e2e-helm-custom-configuration: e2e +# We sleep for 10 seconds here because webhooks can mysteriously be unavailable even though the readiness check passes +.PHONY: e2e-wait-until-operator-ready +e2e-wait-until-operator-ready: + kubectl wait --for=condition=available --timeout 300s deploy --all -n tailing-sidecar-system + kubectl wait --for=condition=ready --timeout 300s pod --all -n tailing-sidecar-system + sleep 10 + build-push-deploy: build-push-sidecar build-push-deploy-operator build-push-sidecar: diff --git a/helm/tailing-sidecar-operator/templates/resources.yaml b/helm/tailing-sidecar-operator/templates/resources.yaml index 365cdb10..98ccd5d0 100644 --- a/helm/tailing-sidecar-operator/templates/resources.yaml +++ b/helm/tailing-sidecar-operator/templates/resources.yaml @@ -474,29 +474,22 @@ spec: protocol: TCP
startupProbe: httpGet: - scheme: HTTPS - path: /add-tailing-sidecars-v1-pod - port: 9443 - httpHeaders: - - name: Accept - value: application/json - - name: Content-Type - value: application/json + path: /readyz + port: 8081 {{- if .Values.operator.startupProbe}} {{ toYaml .Values.operator.startupProbe | indent 10 }} {{ else }} periodSeconds: 3 {{ end }} + readinessProbe: + httpGet: + path: /readyz + port: 8081 + periodSeconds: 10 livenessProbe: httpGet: - scheme: HTTPS - path: /add-tailing-sidecars-v1-pod - port: 9443 - httpHeaders: - - name: Accept - value: application/json - - name: Content-Type - value: application/json + path: /healthz + port: 8081 {{- if .Values.operator.livenessProbe}} {{ toYaml .Values.operator.livenessProbe | indent 10 }} {{ else }} diff --git a/kuttl-test-helm-certmanager.yaml b/kuttl-test-helm-certmanager.yaml index 1020295c..e5de68bd 100644 --- a/kuttl-test-helm-certmanager.yaml +++ b/kuttl-test-helm-certmanager.yaml @@ -14,5 +14,4 @@ kindContainers: commands: - command: make -C ./operator deploy-cert-manager - command: helm upgrade --install test-release ./helm/tailing-sidecar-operator -f ./helm/tests/values.withCertManager.yaml -n tailing-sidecar-system --create-namespace - - command: kubectl wait --for=condition=available --timeout 300s deploy -l app.kubernetes.io/name=tailing-sidecar-operator -n tailing-sidecar-system - - command: kubectl wait --for=condition=ready --timeout 300s pod -l app.kubernetes.io/name=tailing-sidecar-operator -n tailing-sidecar-system + - command: make e2e-wait-until-operator-ready diff --git a/kuttl-test-helm-custom-configuration.yaml b/kuttl-test-helm-custom-configuration.yaml index 1e4c6bb0..b50a76c7 100644 --- a/kuttl-test-helm-custom-configuration.yaml +++ b/kuttl-test-helm-custom-configuration.yaml @@ -13,5 +13,4 @@ kindContainers: - registry.localhost:5000/sumologic/tailing-sidecar:test commands: - command: helm upgrade --install test-release ./helm/tailing-sidecar-operator -f 
./helm/tests/values.withCustomConfiguration.yaml -n tailing-sidecar-system --create-namespace - - command: kubectl wait --for=condition=available --timeout 300s deploy -l app.kubernetes.io/name=tailing-sidecar-operator -n tailing-sidecar-system - - command: kubectl wait --for=condition=ready --timeout 300s pod -l app.kubernetes.io/name=tailing-sidecar-operator -n tailing-sidecar-system + - command: make e2e-wait-until-operator-ready diff --git a/kuttl-test-helm.yaml b/kuttl-test-helm.yaml index 8ee0edd5..61906f32 100644 --- a/kuttl-test-helm.yaml +++ b/kuttl-test-helm.yaml @@ -13,5 +13,5 @@ kindContainers: - registry.localhost:5000/sumologic/tailing-sidecar:test commands: - command: helm upgrade --install test-release ./helm/tailing-sidecar-operator -f ./helm/tests/values.yaml -n tailing-sidecar-system --create-namespace - - command: kubectl wait --for=condition=available --timeout 300s deploy -l app.kubernetes.io/name=tailing-sidecar-operator -n tailing-sidecar-system - - command: kubectl wait --for=condition=ready --timeout 300s pod -l app.kubernetes.io/name=tailing-sidecar-operator -n tailing-sidecar-system + + - command: make e2e-wait-until-operator-ready diff --git a/kuttl-test.yaml b/kuttl-test.yaml index 21fdb98d..8ed89cef 100644 --- a/kuttl-test.yaml +++ b/kuttl-test.yaml @@ -14,4 +14,4 @@ kindContainers: commands: - command: make -C ./operator deploy-cert-manager - command: make -C ./operator deploy IMG="registry.localhost:5000/sumologic/tailing-sidecar-operator:test" TAILING_SIDECAR_IMG="registry.localhost:5000/sumologic/tailing-sidecar:test" - - command: kubectl wait --for=condition=ready --timeout 300s pod -l control-plane=tailing-sidecar-operator -n tailing-sidecar-system + - command: make e2e-wait-until-operator-ready diff --git a/operator/config/default/manager_webhook_patch.yaml b/operator/config/default/manager_webhook_patch.yaml index 2d3fb476..e59a0593 100644 --- a/operator/config/default/manager_webhook_patch.yaml +++ 
b/operator/config/default/manager_webhook_patch.yaml @@ -14,25 +14,18 @@ spec: protocol: TCP startupProbe: httpGet: - scheme: HTTPS - path: /add-tailing-sidecars-v1-pod - port: 9443 - httpHeaders: - - name: Accept - value: application/json - - name: Content-Type - value: application/json + path: /readyz + port: 8081 periodSeconds: 3 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + periodSeconds: 10 livenessProbe: httpGet: - scheme: HTTPS - path: /add-tailing-sidecars-v1-pod - port: 9443 - httpHeaders: - - name: Accept - value: application/json - - name: Content-Type - value: application/json + path: /healthz + port: 8081 initialDelaySeconds: 1 periodSeconds: 10 volumeMounts: diff --git a/operator/main.go b/operator/main.go index dc921507..e92bf516 100644 --- a/operator/main.go +++ b/operator/main.go @@ -26,6 +26,7 @@ import ( clientgoscheme "k8s.io/client-go/kubernetes/scheme" _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" @@ -53,6 +54,7 @@ func init() { func main() { var metricsAddr string + var healthAddr string var enableLeaderElection bool var tailingSidecarImage string var configPath string @@ -60,6 +62,7 @@ func main() { var err error flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.") + flag.StringVar(&healthAddr, "health-addr", ":8081", "The address the health check endpoint binds to.") flag.BoolVar(&enableLeaderElection, "enable-leader-election", false, "Enable leader election for controller manager. 
"+ "Enabling this will ensure there is only one active controller manager.") @@ -92,11 +95,12 @@ func main() { Metrics: metricsserver.Options{ BindAddress: metricsAddr, }, - LeaderElection: enableLeaderElection, - LeaderElectionID: "7b555970.sumologic.com", - LeaseDuration: (*time.Duration)(&config.LeaderElection.LeaseDuration), - RenewDeadline: (*time.Duration)(&config.LeaderElection.RenewDeadline), - RetryPeriod: (*time.Duration)(&config.LeaderElection.RetryPeriod), + HealthProbeBindAddress: healthAddr, + LeaderElection: enableLeaderElection, + LeaderElectionID: "7b555970.sumologic.com", + LeaseDuration: (*time.Duration)(&config.LeaderElection.LeaseDuration), + RenewDeadline: (*time.Duration)(&config.LeaderElection.RenewDeadline), + RetryPeriod: (*time.Duration)(&config.LeaderElection.RetryPeriod), }) if err != nil { setupLog.Error(err, "unable to start manager") @@ -129,6 +133,16 @@ func main() { }) mgr.Add(webhookServer) + if err = mgr.AddReadyzCheck("readyz", webhookServer.StartedChecker()); err != nil { + setupLog.Error(err, "unable to set up readiness check") + os.Exit(1) + } + + if err = mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up health check") + os.Exit(1) + } + setupLog.Info("starting manager") if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { setupLog.Error(err, "problem running manager")