SENTINEL FAILOVER COORDINATED actually does a leader election

Doing a proper leader election allows to increase the time available for the actual FAILOVER command, as other Sentinels will not attempt to initiate another failover during the failover timeout. Interestingly, Sentinel does not take the reported primary up/down status into account when counting replies to leader election. Thus, Sentinel will simply proceed with the failover once we reach the quorum even if we don't are in O_DOWN state. Signed-off-by: Simon Baatz <[email protected]>
valkey-io · Dec 1, 2024 · 55473d8 · 55473d8
1 parent 439f367
commit 55473d8
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 13 deletions.
diff --git a/src/commands/sentinel-failover.json b/src/commands/sentinel-failover.json
@@ -19,7 +19,7 @@
         ],
         "reply_schema": {
             "const": "OK",
-            "description": "Force a fail over as if the primary was not reachable, and without asking for agreement to other Sentinels."
+            "description": "Force a fail over of the primary. Without options, the fail over is executed immediately as if the primary was not reachable. Using `COORDINATED`, fail over seeking agreement from other Sentinels and using coordinated fail over."
         },
         "arguments": [
             {

diff --git a/src/sentinel.c b/src/sentinel.c
@@ -144,6 +144,9 @@ static mstime_t sentinel_default_failover_timeout = 60 * 3 * 1000;
 #define SENTINEL_VALKEY_NO_FAILOVER 1 /* Failover supported, no failover ongoing */
 #define SENTINEL_VALKEY_FAILOVER 2    /* Failover supported, failover ongoing */
 
+/* sentinelAskPrimaryStateToOtherSentinels flags */
+#define SENTINEL_ASK_FORCED (1 << 0)
+
 /* The link to a sentinelValkeyInstance. When we have the same set of Sentinels
  * monitoring many primaries, we have different instances representing the
  * same Sentinels, one per primary, and we need to share the hiredis connections
@@ -421,6 +424,7 @@ int sentinelSendPing(sentinelValkeyInstance *ri);
 int sentinelForceHelloUpdateForPrimary(sentinelValkeyInstance *primary);
 sentinelValkeyInstance *getSentinelValkeyInstanceByAddrAndRunID(dict *instances, char *ip, int port, char *runid);
 void sentinelSimFailureCrash(void);
+void sentinelAskPrimaryStateToOtherSentinels(sentinelValkeyInstance *primary, int flags);
 
 /* ========================= Dictionary types =============================== */
 
@@ -3890,7 +3894,7 @@ void sentinelCommand(client *c) {
             return;
         if (c->argc == 4) {
             if (!strcasecmp(c->argv[3]->ptr, "coordinated")) {
-                coordinated = SRI_COORD_FAILOVER;
+                coordinated = 1;
                 if (ri->valkey_failover_state == SENTINEL_VALKEY_FAILOVER_NS) {
                     addReplyError(c, "-NOGOODPRIMARY Primary does not support FAILOVER command");
                     return;
@@ -3909,8 +3913,19 @@ void sentinelCommand(client *c) {
             return;
         }
         serverLog(LL_NOTICE, "Executing user requested FAILOVER of '%s'", ri->name);
+        ri->s_down_since_time = mstime();
+        ri->flags |= SRI_S_DOWN;
         sentinelStartFailover(ri);
-        ri->flags |= SRI_FORCE_FAILOVER | coordinated;
+        if (coordinated) {
+            ri->flags |= SRI_COORD_FAILOVER;
+            /* Initiate a leader election, The SENTINEL_FAILOVER_STATE_WAIT_START
+               state will wait until we are elected. */
+            sentinelAskPrimaryStateToOtherSentinels(ri, SENTINEL_ASK_FORCED);
+        } else {
+            /* SRI_FORCE_FAILOVER will cause the SENTINEL_FAILOVER_STATE_WAIT_START
+               state to regard us as leader (without election). */
+            ri->flags |= SRI_FORCE_FAILOVER;
+        }
         addReply(c, shared.ok);
     } else if (!strcasecmp(c->argv[1]->ptr, "pending-scripts")) {
         /* SENTINEL PENDING-SCRIPTS */
@@ -4523,7 +4538,6 @@ void sentinelReceiveIsPrimaryDownReply(redisAsyncContext *c, void *reply, void *
  * SENTINEL IS-PRIMARY-DOWN-BY-ADDR requests to other sentinels
  * in order to get the replies that allow to reach the quorum
  * needed to mark the primary in ODOWN state and trigger a failover. */
-#define SENTINEL_ASK_FORCED (1 << 0)
 void sentinelAskPrimaryStateToOtherSentinels(sentinelValkeyInstance *primary, int flags) {
     dictIterator *di;
     dictEntry *de;
@@ -5079,25 +5093,28 @@ void sentinelFailoverSelectReplica(sentinelValkeyInstance *ri) {
 
 void sentinelFailoverSendFailover(sentinelValkeyInstance *ri) {
     int retval;
+    mstime_t time_passed = mstime() - ri->failover_state_change_time;
 
+    /* If we don't have enough time left (1 second) for the FAILOVER command
+     * timeout, then abort the failover. */
+    if (ri->failover_timeout - time_passed < 1000) {
+        sentinelEvent(LL_WARNING, "-failover-abort-master-timeout", ri, "%@");
+        sentinelAbortFailover(ri);
+        return;
+    }
     /* We can't send the command to the master if it is now
-     * disconnected. Retry again and again with this state until the timeout
-     * is reached, then abort the failover. */
-    mstime_t time_passed = mstime() - ri->failover_state_change_time;
+     * disconnected. Retry again and again with this state (until the timeout
+     * is reached and we abort the failover.) */
     if (ri->link->disconnected) {
-        if (time_passed > ri->failover_timeout) {
-            sentinelEvent(LL_WARNING, "-failover-abort-master-timeout", ri, "%@");
-            sentinelAbortFailover(ri);
-        }
         return;
     }
 
-    /* Send FAILOVER command to switch the role of the master and the
+    /* Send FAILOVER command to switch the role of the primary and the
      * promoted replica. We actually register a generic callback for this
      * command as we don't really care about the reply. We check if it worked
      * indirectly observing if INFO returns a different role (master instead of
      * slave). */
-    retval = sentinelFailoverTo(ri, ri->promoted_replica->addr, ri->down_after_period);
+    retval = sentinelFailoverTo(ri, ri->promoted_replica->addr, ri->failover_timeout - time_passed);
     if (retval != C_OK) return;
     sentinelEvent(LL_NOTICE, "+failover-state-wait-promotion",
                   ri->promoted_replica, "%@");