Skip to content

Commit

Permalink
Merge pull request #272 from dciabrin/safe_to_bootstrap
Browse files Browse the repository at this point in the history
Fast bootstrap using safe_to_bootstrap flag
  • Loading branch information
openshift-merge-bot[bot] authored Oct 9, 2024
2 parents 6f4b7a6 + 67d53bd commit 200b937
Show file tree
Hide file tree
Showing 8 changed files with 201 additions and 45 deletions.
11 changes: 11 additions & 0 deletions api/bases/mariadb.openstack.org_galeras.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,21 @@ spec:
gcomm:
description: Gcomm URI used to connect to the galera cluster
type: string
no_grastate:
description: This galera node has its state recovered from the
DB
type: boolean
safe_to_bootstrap:
description: This galera node can bootstrap a galera cluster
type: boolean
seqno:
description: Last recorded replication sequence number in the
DB
type: string
uuid:
description: UUID of the partition that is seen by the galera
node
type: string
required:
- seqno
type: object
Expand Down
6 changes: 6 additions & 0 deletions api/v1beta1/galera_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,14 @@ type GaleraSpecCore struct {

// GaleraAttributes holds startup information for a Galera host
type GaleraAttributes struct {
// UUID of the partition that is seen by the galera node
UUID string `json:"uuid,omitempty"`
// Last recorded replication sequence number in the DB
Seqno string `json:"seqno"`
// This galera node can bootstrap a galera cluster
SafeToBootstrap bool `json:"safe_to_bootstrap,omitempty"`
// This galera node has its state recovered from the DB
NoGrastate bool `json:"no_grastate,omitempty"`
// Gcomm URI used to connect to the galera cluster
Gcomm string `json:"gcomm,omitempty"`
// Identifier of the container at the time the gcomm URI was injected
Expand Down
11 changes: 11 additions & 0 deletions config/crd/bases/mariadb.openstack.org_galeras.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,21 @@ spec:
gcomm:
description: Gcomm URI used to connect to the galera cluster
type: string
no_grastate:
description: This galera node has its state recovered from the
DB
type: boolean
safe_to_bootstrap:
description: This galera node can bootstrap a galera cluster
type: boolean
seqno:
description: Last recorded replication sequence number in the
DB
type: string
uuid:
description: UUID of the partition that is seen by the galera
node
type: string
required:
- seqno
type: object
Expand Down
67 changes: 49 additions & 18 deletions controllers/galera_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package controllers
import (
"bytes"
"context"
"encoding/json"
"fmt"
"sort"
"strconv"
Expand Down Expand Up @@ -94,20 +95,31 @@ func GetLog(ctx context.Context, controller string) logr.Logger {
//

// findBestCandidate picks the galera node that should bootstrap the cluster.
// Nodes are visited in sorted name order. The first node whose attributes
// report SafeToBootstrap is returned immediately. Otherwise the node with the
// highest seqno is kept (the comparison is >=, so on equal seqno the node that
// sorts last wins), and that node is only reported as found when attributes
// are known for exactly Spec.Replicas nodes.
// NOTE(review): this listing looks like a rendered diff hunk that shows both
// the previous form (takes *GaleraStatus, returns a string) and the new form
// (takes *Galera, returns node and found) - confirm against the actual file.
func findBestCandidate(status *mariadbv1.GaleraStatus) string {
sortednodes := maps.Keys(status.Attributes)
func findBestCandidate(g *mariadbv1.Galera) (node string, found bool) {
sortednodes := maps.Keys(g.Status.Attributes)
// Sort the names so that the choice does not depend on map iteration order
sort.Strings(sortednodes)
bestnode := ""
bestseqno := -1
for _, node := range sortednodes {
seqno := status.Attributes[node].Seqno
// On clean shutdown, galera sets the last
// stopped node as 'safe to bootstrap', so use
// this hint when we can
if g.Status.Attributes[node].SafeToBootstrap {
return node, true
}
seqno := g.Status.Attributes[node].Seqno
// The conversion error is discarded: a malformed seqno is compared as 0,
// which still beats the initial bestseqno of -1
intseqno, _ := strconv.Atoi(seqno)
if intseqno >= bestseqno {
bestnode = node
bestseqno = intseqno
}
}
return bestnode //"galera-0"
// if we pass here, a candidate is only valid if we
// inspected all the expected replicas (e.g. typically 3)
// NOTE(review): Spec.Replicas is dereferenced without a nil check;
// presumably defaulting guarantees it is set - TODO confirm
if len(g.Status.Attributes) != int(*g.Spec.Replicas) {
return "", false
}
return bestnode, true //"galera-0"
}

// buildGcommURI builds a gcomm URI for a galera instance
Expand Down Expand Up @@ -240,18 +252,22 @@ func injectGcommURI(ctx context.Context, h *helper.Helper, config *rest.Config,
}

// retrieveSequenceNumber runs detect_last_commit.sh in the "galera" container
// of the given pod, decodes the JSON document printed on stdout into a
// GaleraAttributes value and stores it in instance.Status.Attributes under the
// pod's name. Whatever the script wrote on stderr is returned in errStr, one
// entry per line, so the caller can log it. err is the value returned by
// mariadb.ExecInPod (presumably including an error returned by the callback,
// e.g. a JSON decoding failure - verify against ExecInPod).
// NOTE(review): this listing looks like a rendered diff hunk that shows both
// the previous form (returns only an error, stores the raw stdout as Seqno)
// and the new form - confirm against the actual file.
func retrieveSequenceNumber(ctx context.Context, helper *helper.Helper, config *rest.Config, instance *mariadbv1.Galera, pod *corev1.Pod) error {
err := mariadb.ExecInPod(ctx, helper, config, instance.Namespace, pod.Name, "galera",
func retrieveSequenceNumber(ctx context.Context, helper *helper.Helper, config *rest.Config, instance *mariadbv1.Galera, pod *corev1.Pod) (errStr []string, err error) {
errStr = nil
err = mariadb.ExecInPod(ctx, helper, config, instance.Namespace, pod.Name, "galera",
[]string{"/bin/bash", "/var/lib/operator-scripts/detect_last_commit.sh"},
func(stdout *bytes.Buffer, _ *bytes.Buffer) error {
seqno := strings.TrimSuffix(stdout.String(), "\n")
attr := mariadbv1.GaleraAttributes{
Seqno: seqno,
func(stdout *bytes.Buffer, stderr *bytes.Buffer) error {
var attr mariadbv1.GaleraAttributes
// stdout must be a JSON object matching the json tags of GaleraAttributes
if err := json.Unmarshal(stdout.Bytes(), &attr); err != nil {
return err
}
// stderr is only captured once stdout decoded successfully; the trailing
// newline is dropped before splitting to avoid an empty last entry
if stderr.Len() > 0 {
errStr = strings.Split(strings.TrimSuffix(stderr.String(), "\n"), "\n")
}
instance.Status.Attributes[pod.Name] = attr
return nil
})
return err
// naked return: hands back the named results errStr and err
return
}

// clearPodAttributes clears information known by the operator about a pod
Expand Down Expand Up @@ -737,7 +753,7 @@ func (r *GaleraReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res
for _, pod := range getReadyPods(podList.Items) {
name := pod.Name
if _, found := instance.Status.Attributes[name]; found {
log.Info("Galera started on", "pod", pod.Name)
log.Info("Galera started", "pod", name)
clearPodAttributes(instance, name)
}
}
Expand Down Expand Up @@ -777,21 +793,36 @@ func (r *GaleraReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res
// . any other status means the pod is starting/restarting. We can't
// exec into the pod yet, so we will probe it in another reconcile loop.
if !instance.Status.Bootstrapped && !isBootstrapInProgress(instance) {
var node string
found := false
for _, pod := range getRunningPodsMissingAttributes(ctx, podList.Items, instance, helper, r.config) {
name := pod.Name
util.LogForObject(helper, fmt.Sprintf("Pod %s running, retrieve seqno", name), instance)
err := retrieveSequenceNumber(ctx, helper, r.config, instance, &pod)
warn, err := retrieveSequenceNumber(ctx, helper, r.config, instance, &pod)
if len(warn) > 0 {
util.LogForObject(helper, fmt.Sprintf("Warning: %q", warn), instance)
}
if err != nil {
log.Error(err, "Failed to retrieve seqno for ", "name", name)
log.Error(err, fmt.Sprintf("Failed to retrieve seqno for %s", name))
return ctrl.Result{}, err
}
log.Info("", "Pod", name, "seqno:", instance.Status.Attributes[name].Seqno)
log.Info(fmt.Sprintf("Attributes retrieved for %s", name),
"UUID", instance.Status.Attributes[name].UUID,
"Seqno", instance.Status.Attributes[name].Seqno,
"SafeToBootstrap", instance.Status.Attributes[name].SafeToBootstrap,
)
if instance.Status.Attributes[name].SafeToBootstrap {
node = name
found = true
break
}
}

// Check if we have enough info to bootstrap the cluster now
if (len(instance.Status.Attributes) > 0) &&
(len(instance.Status.Attributes) == len(podList.Items)) {
node := findBestCandidate(&instance.Status)
if !found {
node, found = findBestCandidate(instance)
}
if found {
pod := getPodFromName(podList.Items, node)
log.Info("Pushing gcomm URI to bootstrap", "pod", node)
// Setting the gcomm attribute marks this pod as 'currently bootstrapping the cluster'
Expand Down
9 changes: 8 additions & 1 deletion pkg/mariadb/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ func StatefulSet(g *mariadbv1.Galera, configHash string) *appsv1.StatefulSet {
},
corev1.LabelHostname,
)
if g.Spec.NodeSelector != nil && len(g.Spec.NodeSelector) > 0 {
if len(g.Spec.NodeSelector) > 0 {
sts.Spec.Template.Spec.NodeSelector = g.Spec.NodeSelector
}

Expand Down Expand Up @@ -164,6 +164,13 @@ func getGaleraContainers(g *mariadbv1.Galera, configHash string) []corev1.Contai
},
},
},
Lifecycle: &corev1.Lifecycle{
PreStop: &corev1.LifecycleHandler{
Exec: &corev1.ExecAction{
Command: []string{"/bin/bash", "/var/lib/operator-scripts/mysql_shutdown.sh"},
},
},
},
}}
logSideCar := corev1.Container{
Image: g.Spec.ContainerImage,
Expand Down
4 changes: 4 additions & 0 deletions pkg/mariadb/volumes.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ func getGaleraVolumes(g *mariadbv1.Galera) []corev1.Volume {
Key: "mysql_probe.sh",
Path: "mysql_probe.sh",
},
{
Key: "mysql_shutdown.sh",
Path: "mysql_shutdown.sh",
},
{
Key: "detect_last_commit.sh",
Path: "detect_last_commit.sh",
Expand Down
90 changes: 64 additions & 26 deletions templates/galera/bin/detect_last_commit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,74 @@ recover_args="--datadir=/var/lib/mysql \
--skip-networking \
--wsrep-cluster-address=gcomm://localhost"
recovery_file_regex='s/.*WSREP\:.*position\s*recovery.*--log_error='\''\([^'\'']*\)'\''.*/\1/p'
recovered_position_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p'
recovered_position_uuid_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position\:\ \(.*\)\:.*$/\1/p'
recovered_position_seqno_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p'

grastate_file=/var/lib/mysql/grastate.dat
gvwstate_file=/var/lib/mysql/gvwstate.dat

uuid=""
seqno=""
safe_to_bootstrap=0
no_grastate=0

# json_summary prints the state collected by this script as a one-line
# JSON object on stdout. It is installed as the EXIT trap, so it runs on
# every exit path, including the failure ones.
# Reads the globals uuid, seqno, safe_to_bootstrap and no_grastate:
#   - uuid and seqno are emitted as JSON strings, only when non-empty
#   - safe_to_bootstrap and no_grastate are emitted as `true`, only when
#     they hold a non-zero integer
# Values are not JSON-escaped; they are expected to be plain uuid/integer
# text extracted from galera's state files or recovery log.
function json_summary {
    local -a out=()
    if [ -n "$uuid" ]; then out+=( "\"uuid\":\"$uuid\"" ); fi
    if [ -n "$seqno" ]; then out+=( "\"seqno\":\"$seqno\"" ); fi
    # safe_to_bootstrap is empty when it was read from a grastate.dat that
    # lacks the field. Quote it and default it to 0 so that `[` does not
    # print a "unary operator expected" error on stderr, which the caller
    # of this script reports as a warning.
    if [ "${safe_to_bootstrap:-0}" -ne 0 ]; then out+=( '"safe_to_bootstrap":true' ); fi
    if [ "${no_grastate:-0}" -ne 0 ]; then out+=( '"no_grastate":true' ); fi
    # "${out[*]}" joins the fields with the first character of IFS; keep
    # the IFS change local to this function
    local IFS=,
    echo "{${out[*]}}"
}

trap json_summary EXIT

# codership/galera#354
# Some ungraceful shutdowns can leave an empty gvwstate.dat on
# disk. This will prevent galera from joining the cluster if it is
# configured to attempt PC recovery. Removing that file makes the
# node fall back to the normal, unoptimized joining process.
if [ -f /var/lib/mysql/gvwstate.dat ] && \
[ ! -s /var/lib/mysql/gvwstate.dat ]; then
echo "empty /var/lib/mysql/gvwstate.dat detected, removing it to prevent PC recovery failure at next restart" >&2
rm -f /var/lib/mysql/gvwstate.dat
if [ -f $gvwstate_file ] && \
[ ! -s $gvwstate_file ]; then
echo "empty $gvwstate_file detected, removing it to prevent PC recovery failure at next restart" >&2
rm -f $gvwstate_file
fi

# Attempt to retrieve the seqno information and safe_to_bootstrap hint
# from the saved state file on disk

if [ -f $grastate_file ]; then
uuid="$(cat $grastate_file | sed -n 's/^uuid.\s*\(.*\)\s*$/\1/p')"
seqno="$(cat $grastate_file | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')"
safe_to_bootstrap="$(cat $grastate_file | sed -n 's/^safe_to_bootstrap.\s*\(.*\)\s*$/\1/p')"

if [ -z "$uuid" ] || \
[ "$uuid" = "00000000-0000-0000-0000-000000000000" ]; then
safe_to_bootstrap=0
fi
if [ "$safe_to_bootstrap" = "1" ]; then
if [ -z "$seqno" ] || [ "$seqno" = "-1" ]; then
safe_to_bootstrap=0
fi
fi
fi

echo "attempting to detect last commit version by reading grastate.dat" >&2
last_commit="$(cat /var/lib/mysql/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')"
if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then
# If the seqno could not be retrieved, inspect the mysql database

if [ -z "$seqno" ] || [ "$seqno" = "-1" ]; then
tmp=$(mktemp)
chown mysql:mysql $tmp

# if we pass here because grastate.dat doesn't exist,
# try not to bootstrap from this node if possible
# if [ ! -f /var/lib/mysql/grastate.dat ]; then
# set_no_grastate
# fi

echo "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'" >&2
# if we pass here because grastate.dat doesn't exist, report it
if [ ! -f /var/lib/mysql/grastate.dat ]; then
no_grastate=1
fi

mysqld_safe --wsrep-recover $recover_args --log-error=$tmp 1>&2
mysqld_safe --wsrep-recover $recover_args --log-error=$tmp >/dev/null

last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)"
if [ -z "$last_commit" ]; then
seqno="$(cat $tmp | sed -n "$recovered_position_seqno_regex" | tail -1)"
uuid="$(cat $tmp | sed -n "$recovered_position_uuid_regex" | tail -1)"
if [ -z "$seqno" ]; then
# Galera uses InnoDB's 2pc transactions internally. If
# server was stopped in the middle of a replication, the
# recovery may find a "prepared" XA transaction in the
Expand All @@ -52,25 +89,26 @@ if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then
# since the DB will get resynchronized anyway
echo "local node was not shutdown properly. Rollback stuck transaction with --tc-heuristic-recover" >&2
mysqld_safe --wsrep-recover $recover_args \
--tc-heuristic-recover=rollback --log-error=$tmp 2>/dev/null
--tc-heuristic-recover=rollback --log-error=$tmp >/dev/null 2>&1

last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)"
if [ ! -z "$last_commit" ]; then
seqno="$(cat $tmp | sed -n "$recovered_position_seqno_regex" | tail -1)"
uuid="$(cat $tmp | sed -n "$recovered_position_uuid_regex" | tail -1)"
if [ ! -z "$seqno" ]; then
echo "State recovered. force SST at next restart for full resynchronization" >&2
rm -f /var/lib/mysql/grastate.dat
# try not to bootstrap from this node if possible
# set_no_grastate
no_grastate=1
fi
fi
fi
fi
rm -f $tmp
fi

if [ ! -z "$last_commit" ]; then
echo "$last_commit"
exit 0
else

if [ -z "$seqno" ]; then
echo "Unable to detect last known write sequence number" >&2
exit 1
fi

# json data is printed on exit
48 changes: 48 additions & 0 deletions templates/galera/bin/mysql_shutdown.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/bin/bash

# Shuts down the local galera node; run as the galera container's PreStop
# hook. When the Galera resource itself is being deleted, the shutdown is
# delayed until the pods with a higher ordinal have left the cluster, so
# that nodes stop one after the other.
#
# Environment:
#   HOSTNAME          name of the pod, expected to end with -<ordinal>
#   DB_ROOT_PASSWORD  password of the mysql root user

# NOTE(dciabrin) we might use downward API to populate those in the future
PODNAME=$HOSTNAME
SERVICE=${PODNAME/-galera-[0-9]*/}

# API server config
APISERVER=https://kubernetes.default.svc
SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount
NAMESPACE=$(cat "${SERVICEACCOUNT}/namespace")
TOKEN=$(cat "${SERVICEACCOUNT}/token")
CACERT=${SERVICEACCOUNT}/ca.crt

# log prints its arguments prefixed with a timestamp and the script name
function log() {
    echo "$(date +%F_%H_%M_%S) $(basename "$0") $*"
}

# Log in mariadb's log file if configured, so the output of this script
# is captured when logToDisk is enabled in the galera CR.
# Only the last log-error option is kept, and everything after the first
# '=' is taken so that a path containing '=' is not truncated.
LOGFILE=$(my_print_defaults mysqld | grep -e '^--log-error=' | tail -1 | cut -d= -f2-)
if [ -f "$LOGFILE" ]; then
    exec &> >(cat >> "$LOGFILE")
else
    # no log file: write to the stdout of PID 1
    exec &> >(cat >> /proc/1/fd/1)
fi

# On update, k8s performs a rolling restart, but on resource deletion,
# all pods are deleted concurrently due to the fact that we require
# PodManagementPolicy: appsv1.ParallelPodManagement for bootstrapping
# the cluster. So try to stop the nodes sequentially so that
# the last galera node stopped can set a "safe_to_bootstrap" flag.

# A 401 answer from the API server to a request made with this pod's
# service account token is taken as the sign that the Galera resource is
# being deleted. The pod is looked up in its own namespace, as read from
# the service account, rather than in a hard-coded one.
if curl -s --cacert "${CACERT}" \
        --header "Content-Type:application/json" \
        --header "Authorization: Bearer ${TOKEN}" \
        -X GET "${APISERVER}/api/v1/namespaces/${NAMESPACE}/pods/${PODNAME}" \
        | grep -q '"code": *401'; then
    log "Galera resource is being deleted"
    # Pod ordinal + 1: this node waits while the cluster is larger than
    # that, i.e. while pods with a higher ordinal are still members.
    nth=$(( ${PODNAME##*-} + 1 ))
    while : ; do
        size=$(mysql -uroot -p"${DB_ROOT_PASSWORD}" -sNEe "show status like 'wsrep_cluster_size';" | tail -1)
        # An empty size (e.g. the query failed) counts as 0 and ends the wait
        if [ "${size:-0}" -gt "$nth" ]; then
            log "Waiting for cluster to scale down"
            sleep 2
        else
            break
        fi
    done
fi

log "Shutting down local galera node"
mysqladmin -uroot -p"${DB_ROOT_PASSWORD}" shutdown

0 comments on commit 200b937

Please sign in to comment.