From a4dd8b13e8818207174abe004be9453ca87b434b Mon Sep 17 00:00:00 2001 From: Damien Ciabrini Date: Tue, 23 Jul 2024 08:08:46 -0400 Subject: [PATCH] Fix non-Primary checks in liveness probe The liveness probe did not parse the non-Primary condition out of the mysql CLI command. Consequently, the liveness probe did not fail when a galera node was disconnected from the primary partition and the galera pod could not restart automatically, leading to long delays before restart or sometimes full cluster disruption. Fix the way probes are handled and refactor bits to allow more precise conditions in startup/readiness/liveness probes. Jira: OSPRH-8862 --- templates/galera/bin/mysql_probe.sh | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/templates/galera/bin/mysql_probe.sh b/templates/galera/bin/mysql_probe.sh index 4b190b1a..a1b262f3 100755 --- a/templates/galera/bin/mysql_probe.sh +++ b/templates/galera/bin/mysql_probe.sh @@ -1,27 +1,37 @@ #!/bin/bash -set -eu +set -u # This secret is mounted by k8s and always up to date read -s -u 3 3< /var/lib/secrets/dbpassword MYSQL_PWD || true export MYSQL_PWD PROBE_USER=root +function mysql_status_check { + local status=$1 + local expect=$2 + set -x + mysql -u${PROBE_USER} -sNEe "show status like '${status}';" | tail -1 | grep -w -e "${expect}" +} # Consider the pod has "started" once mysql is reachable +# and is part of the primary partition if [ "$1" = "startup" ]; then - mysql -u${PROBE_USER} -sNe "select(1);" + mysql_status_check wsrep_cluster_status Primary exit $? fi -set -x +# readiness and liveness probes are run by k8s only after start probe succeeded case "$1" in readiness) # If the node is e.g. 
a donor, it cannot serve traffic - mysql -u${PROBE_USER} -sNe "show status like 'wsrep_local_state_comment';" | grep -w -e Synced;; + mysql_status_check wsrep_local_state_comment Synced + ;; liveness) - # If the node is not in the primary partition, restart it - mysql -u${PROBE_USER} -sNe "show status like 'wsrep_cluster_status';" | grep -w -e Primary;; + # If the node is not in the primary partition, the failed liveness probe + # will make k8s restart this pod + mysql_status_check wsrep_cluster_status Primary + ;; *) echo "Invalid probe option '$1'" exit 1;;