Skip to content

Commit

Permalink
Fix clean up of old attributes when containers are not restarting
Browse files Browse the repository at this point in the history
When a pod crashes while in the process of starting, the operator
cleans up outdated attributes in the galera CR status.

The operator wrongly assumes that it can probe a container's state
as soon as it gets a pod object from the API server, which is not always
true (e.g., when the pod is in the "Pending" state).

Fix the attribute cleanup by always checking the state of the pod's
container before inspecting its container ID.

Jira: OSPRH-9411
  • Loading branch information
dciabrin committed Sep 27, 2024
1 parent 61d230f commit b8de389
Showing 1 changed file with 13 additions and 3 deletions.
16 changes: 13 additions & 3 deletions controllers/galera_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,16 @@ func getRunningPodsMissingGcomm(ctx context.Context, pods []corev1.Pod, instance
return
}

// getGaleraContainerID looks up the status entry of the container named
// "galera" in the pod's container statuses. It reports whether such an entry
// exists and, when it does, returns the ContainerID recorded in that entry
// as-is (the ID is not checked for being non-empty).
func getGaleraContainerID(pod *corev1.Pod) (found bool, CID string) {
	statuses := pod.Status.ContainerStatuses
	for i := range statuses {
		// Skip every container status that is not the galera one.
		if statuses[i].Name != "galera" {
			continue
		}
		return true, statuses[i].ContainerID
	}
	// No status entry for a container named "galera" was reported.
	return false, ""
}

// isGaleraContainerStartedAndWaiting checks whether the galera container is waiting for a gcomm_uri file
func isGaleraContainerStartedAndWaiting(ctx context.Context, pod *corev1.Pod, instance *mariadbv1.Galera, h *helper.Helper, config *rest.Config) bool {
waiting := false
Expand Down Expand Up @@ -282,14 +292,14 @@ func assertPodsAttributesValidity(helper *helper.Helper, instance *mariadbv1.Gal
// A node can have various attributes depending on its known state.
// A ContainerID attribute is only present if the node is being started.
attrCID := instance.Status.Attributes[pod.Name].ContainerID
podCID := pod.Status.ContainerStatuses[0].ContainerID
if attrCID != "" && attrCID != podCID {
containerFound, podCID := getGaleraContainerID(&pod)
if !containerFound || (attrCID != "" && attrCID != podCID) {
// This gcomm URI was pushed in a pod which was restarted
// before the attribute got cleared, which means the pod
// failed to start galera. Clear the attribute here, and
// reprobe the pod's state in the next reconcile loop
clearPodAttributes(instance, pod.Name)
util.LogForObject(helper, "Pod restarted while galera was starting", instance, "pod", pod.Name, "current pod ID", podCID, "recorded ID", attrCID)
util.LogForObject(helper, "Pod restarted while galera was starting", instance, "pod", pod.Name, "recorded ID", attrCID)
}
}
}
Expand Down

0 comments on commit b8de389

Please sign in to comment.