From 4beccf3974f60516b2d8f868826960c9683e39dc Mon Sep 17 00:00:00 2001 From: Damien Ciabrini Date: Mon, 5 Aug 2024 15:53:42 +0200 Subject: [PATCH] Do not start galera as joiner with 1-replica cluster The mariadb operator checks for available pods in the galera statefulset to determine whether to start mysqld as a bootstrap or a joiner node on all the pods that remain to be started. When galera is deployed as a 1-replica cluster (e.g. in CI), there is a small time window after the statefulset has been probed and galera marked as 'bootstrapped', where the single pod can crash before being probed. If so, the operator will try to restart the pod as a 'joiner', which is invalid. Add a specific check for 1-replica deployments, so that the operator bails out and requeues the event when a pod is identified as a joiner. This allows the operator to reprobe the galera state and restart the pod correctly, in order to avoid an unnecessary error in the logs. Jira: OSPRH-7821 --- controllers/galera_controller.go | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/controllers/galera_controller.go b/controllers/galera_controller.go index abac7c0c..4ced8c23 100644 --- a/controllers/galera_controller.go +++ b/controllers/galera_controller.go @@ -691,7 +691,7 @@ func (r *GaleraReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res // Note: // . A pod is available in the statefulset if the pod's readiness // probe returns true (i.e. galera is running in the pod and clustered) - // . Cluster is bootstrapped if as soon as one pod is available + // . 
Cluster is bootstrapped as soon as one pod is available instance.Status.Bootstrapped = statefulset.Status.AvailableReplicas > 0 if instance.Status.Bootstrapped { @@ -708,8 +708,17 @@ func (r *GaleraReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res } } + runningPods := getRunningPodsMissingGcomm(ctx, podList.Items, instance, helper, r.config) + // Special case for 1-node deployment: if the statefulset reports 1 node is available + // but the pod shows up in runningPods (i.e. NotReady), do not consider it a joiner. + // Wait for the two statuses to re-sync after another k8s probe is run. + if *instance.Spec.Replicas == 1 && len(runningPods) == 1 { + log.Info("Galera node no longer running. Requeuing") + return ctrl.Result{RequeueAfter: time.Duration(3) * time.Second}, nil + } + // The other 'Running' pods can join the existing cluster. - for _, pod := range getRunningPodsMissingGcomm(ctx, podList.Items, instance, helper, r.config) { + for _, pod := range runningPods { name := pod.Name joinerURI := buildGcommURI(instance) log.Info("Pushing gcomm URI to joiner", "pod", name)