Skip to content

Commit

Permalink
feat(operator): add proper health checks
Browse files Browse the repository at this point in the history
Use dedicated controller-runtime facilities to create health and
readiness checks for the webhook server.

Unfortunately, this doesn't seem to solve the problem of the webhook
being unavailable right after the operator starts. I've added an
explicit 10 second wait to the E2E tests for now; we'll have to see if
that's enough to fix the problem.
  • Loading branch information
swiatekm committed Oct 26, 2023
1 parent 013c0ae commit 9dda19c
Show file tree
Hide file tree
Showing 8 changed files with 49 additions and 44 deletions.
7 changes: 7 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ e2e-helm-certmanager: e2e
e2e-helm-custom-configuration: KUTTL_CONFIG = kuttl-test-helm-custom-configuration.yaml
e2e-helm-custom-configuration: e2e

# Block until the operator deployed for the E2E tests is ready to be used:
#   1. wait (up to 300s) for every Deployment in the tailing-sidecar-system namespace to report Available,
#   2. wait (up to 300s) for every Pod in that namespace to report Ready,
#   3. sleep for a further 10 seconds.
# The final sleep is a workaround: webhooks can mysteriously be unavailable even though the readiness check passes.
.PHONY: e2e-wait-until-operator-ready
e2e-wait-until-operator-ready:
kubectl wait --for=condition=available --timeout 300s deploy --all -n tailing-sidecar-system
kubectl wait --for=condition=ready --timeout 300s pod --all -n tailing-sidecar-system
sleep 10

build-push-deploy: build-push-sidecar build-push-deploy-operator

build-push-sidecar:
Expand Down
25 changes: 9 additions & 16 deletions helm/tailing-sidecar-operator/templates/resources.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -474,29 +474,22 @@ spec:
protocol: TCP
startupProbe:
httpGet:
scheme: HTTPS
path: /add-tailing-sidecars-v1-pod
port: 9443
httpHeaders:
- name: Accept
value: application/json
- name: Content-Type
value: application/json
path: /readyz
port: 8081
{{- if .Values.operator.startupProbe}}
{{ toYaml .Values.operator.startupProbe | indent 10 }}
{{ else }}
periodSeconds: 3
{{ end }}
readinessProbe:
httpGet:
path: /readyz
port: 8081
periodSeconds: 10
livenessProbe:
httpGet:
scheme: HTTPS
path: /add-tailing-sidecars-v1-pod
port: 9443
httpHeaders:
- name: Accept
value: application/json
- name: Content-Type
value: application/json
path: /healthz
port: 8081
{{- if .Values.operator.livenessProbe}}
{{ toYaml .Values.operator.livenessProbe | indent 10 }}
{{ else }}
Expand Down
3 changes: 1 addition & 2 deletions kuttl-test-helm-certmanager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,4 @@ kindContainers:
commands:
- command: make -C ./operator deploy-cert-manager
- command: helm upgrade --install test-release ./helm/tailing-sidecar-operator -f ./helm/tests/values.withCertManager.yaml -n tailing-sidecar-system --create-namespace
- command: kubectl wait --for=condition=available --timeout 300s deploy -l app.kubernetes.io/name=tailing-sidecar-operator -n tailing-sidecar-system
- command: kubectl wait --for=condition=ready --timeout 300s pod -l app.kubernetes.io/name=tailing-sidecar-operator -n tailing-sidecar-system
- command: make e2e-wait-until-operator-ready
3 changes: 1 addition & 2 deletions kuttl-test-helm-custom-configuration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,4 @@ kindContainers:
- registry.localhost:5000/sumologic/tailing-sidecar:test
commands:
- command: helm upgrade --install test-release ./helm/tailing-sidecar-operator -f ./helm/tests/values.withCustomConfiguration.yaml -n tailing-sidecar-system --create-namespace
- command: kubectl wait --for=condition=available --timeout 300s deploy -l app.kubernetes.io/name=tailing-sidecar-operator -n tailing-sidecar-system
- command: kubectl wait --for=condition=ready --timeout 300s pod -l app.kubernetes.io/name=tailing-sidecar-operator -n tailing-sidecar-system
- command: make e2e-wait-until-operator-ready
4 changes: 2 additions & 2 deletions kuttl-test-helm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ kindContainers:
- registry.localhost:5000/sumologic/tailing-sidecar:test
commands:
- command: helm upgrade --install test-release ./helm/tailing-sidecar-operator -f ./helm/tests/values.yaml -n tailing-sidecar-system --create-namespace
- command: kubectl wait --for=condition=available --timeout 300s deploy -l app.kubernetes.io/name=tailing-sidecar-operator -n tailing-sidecar-system
- command: kubectl wait --for=condition=ready --timeout 300s pod -l app.kubernetes.io/name=tailing-sidecar-operator -n tailing-sidecar-system

- command: make e2e-wait-until-operator-ready
2 changes: 1 addition & 1 deletion kuttl-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ kindContainers:
commands:
- command: make -C ./operator deploy-cert-manager
- command: make -C ./operator deploy IMG="registry.localhost:5000/sumologic/tailing-sidecar-operator:test" TAILING_SIDECAR_IMG="registry.localhost:5000/sumologic/tailing-sidecar:test"
- command: kubectl wait --for=condition=ready --timeout 300s pod -l control-plane=tailing-sidecar-operator -n tailing-sidecar-system
- command: make e2e-wait-until-operator-ready
25 changes: 9 additions & 16 deletions operator/config/default/manager_webhook_patch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,18 @@ spec:
protocol: TCP
startupProbe:
httpGet:
scheme: HTTPS
path: /add-tailing-sidecars-v1-pod
port: 9443
httpHeaders:
- name: Accept
value: application/json
- name: Content-Type
value: application/json
path: /readyz
port: 8081
periodSeconds: 3
readinessProbe:
httpGet:
path: /readyz
port: 8081
periodSeconds: 10
livenessProbe:
httpGet:
scheme: HTTPS
path: /add-tailing-sidecars-v1-pod
port: 9443
httpHeaders:
- name: Accept
value: application/json
- name: Content-Type
value: application/json
path: /healthz
port: 8081
initialDelaySeconds: 1
periodSeconds: 10
volumeMounts:
Expand Down
24 changes: 19 additions & 5 deletions operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
"sigs.k8s.io/controller-runtime/pkg/webhook"
Expand Down Expand Up @@ -53,13 +54,15 @@ func init() {

func main() {
var metricsAddr string
var healthAddr string
var enableLeaderElection bool
var tailingSidecarImage string
var configPath string
var config Config
var err error

flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&healthAddr, "health-addr", ":8081", "The address the health check endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "enable-leader-election", false,
"Enable leader election for controller manager. "+
"Enabling this will ensure there is only one active controller manager.")
Expand Down Expand Up @@ -92,11 +95,12 @@ func main() {
Metrics: metricsserver.Options{
BindAddress: metricsAddr,
},
LeaderElection: enableLeaderElection,
LeaderElectionID: "7b555970.sumologic.com",
LeaseDuration: (*time.Duration)(&config.LeaderElection.LeaseDuration),
RenewDeadline: (*time.Duration)(&config.LeaderElection.RenewDeadline),
RetryPeriod: (*time.Duration)(&config.LeaderElection.RetryPeriod),
HealthProbeBindAddress: healthAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "7b555970.sumologic.com",
LeaseDuration: (*time.Duration)(&config.LeaderElection.LeaseDuration),
RenewDeadline: (*time.Duration)(&config.LeaderElection.RenewDeadline),
RetryPeriod: (*time.Duration)(&config.LeaderElection.RetryPeriod),
})
if err != nil {
setupLog.Error(err, "unable to start manager")
Expand Down Expand Up @@ -129,6 +133,16 @@ func main() {
})
mgr.Add(webhookServer)

if err = mgr.AddReadyzCheck("readyz", webhookServer.StartedChecker()); err != nil {
setupLog.Error(err, "unable to set up readiness check")
os.Exit(1)
}

if err = mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
os.Exit(1)
}

setupLog.Info("starting manager")
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
setupLog.Error(err, "problem running manager")
Expand Down

0 comments on commit 9dda19c

Please sign in to comment.