Sentinel master reboot fix (#9438)

Add master-reboot-down-after-period as a configurable parameter, to make it possible to trigger a failover from a master that is responding with `-LOADING` for a long time after being restarted.
author: Wen Hui <wen.hui.ware@gmail.com> 2021-11-30 11:46:15 -0500
committer: GitHub <noreply@github.com> 2021-11-30 18:46:15 +0200
commit: 2afa41f62847b92ba66f5579b4731494fac3afc3 (patch)
tree: af2f348618efddd5b7a06ec5acfbd7070c92ed82
parent: af072c26bcf62dc4be1cfee7c2ca7242ac2d16aa (diff)
download: redis-2afa41f62847b92ba66f5579b4731494fac3afc3.tar.gz
3 files changed, 157 insertions, 3 deletions
diff --git a/sentinel.conf b/sentinel.conf
index c5341168e..b145ae518 100644
--- a/sentinel.conf
+++ b/sentinel.conf
@@ -339,3 +339,13 @@ SENTINEL resolve-hostnames no
 # to retain the hostnames when announced, enable announce-hostnames below.
 #
 SENTINEL announce-hostnames no
+
+# When master-reboot-down-after-period is set to 0, Sentinel does not fail over
+# when receiving a -LOADING response from a master. This was the only supported
+# behavior before version 7.0.
+#
+# Otherwise, Sentinel will use this value as the time (in ms) it is willing to
+# accept a -LOADING response after a master has been rebooted, before failing
+# over.
+
+SENTINEL master-reboot-down-after-period mymaster 0
diff --git a/src/sentinel.c b/src/sentinel.c
index 1db3bc261..297f0591f 100644
--- a/src/sentinel.c
+++ b/src/sentinel.c
@@ -76,6 +76,7 @@ typedef struct sentinelAddr {
 #define SRI_RECONF_DONE (1<<10)     /* Slave synchronized with new master. */
 #define SRI_FORCE_FAILOVER (1<<11)  /* Force failover with master up. */
 #define SRI_SCRIPT_KILL_SENT (1<<12) /* SCRIPT KILL already sent on -BUSY */
+#define SRI_MASTER_REBOOT  (1<<13)   /* Master was detected as rebooting */
 
 /* Note: times are in milliseconds. */
 #define SENTINEL_PING_PERIOD 1000
@@ -193,6 +194,8 @@ typedef struct sentinelRedisInstance {
     mstime_t s_down_since_time; /* Subjectively down since time. */
     mstime_t o_down_since_time; /* Objectively down since time. */
     mstime_t down_after_period; /* Consider it down after that period. */
+    mstime_t master_reboot_down_after_period; /* Consider a rebooted master down after that period (ms); 0 disables it. */
+    mstime_t master_reboot_since_time; /* Time at which the master reboot was detected. */
     mstime_t info_refresh;  /* Time at which we received INFO output from it. */
     dict *renamed_commands;     /* Commands renamed in this instance:
                                    Sentinel will use the alternative commands
@@ -1294,8 +1297,8 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char *
     ri->last_master_down_reply_time = mstime();
     ri->s_down_since_time = 0;
     ri->o_down_since_time = 0;
-    ri->down_after_period = master ? master->down_after_period :
-                            sentinel_default_down_after;
+    ri->down_after_period = master ? master->down_after_period : sentinel_default_down_after;
+    ri->master_reboot_down_after_period = 0;
     ri->master_link_down_time = 0;
     ri->auth_pass = NULL;
     ri->auth_user = NULL;
@@ -1971,6 +1974,13 @@ const char *sentinelHandleConfiguration(char **argv, int argc) {
         if ((sentinel.announce_hostnames = yesnotoi(argv[1])) == -1) {
             return "Please specify yes or no for the announce-hostnames option.";
         }
+    } else if (!strcasecmp(argv[0],"master-reboot-down-after-period") && argc == 3) {
+        /* master-reboot-down-after-period <name> <milliseconds> */
+        ri = sentinelGetMasterByName(argv[1]);
+        if (!ri) return "No such master with specified name.";
+        ri->master_reboot_down_after_period = atoi(argv[2]);
+        if (ri->master_reboot_down_after_period < 0)
+            return "negative time parameter.";
     } else {
         return "Unrecognized sentinel configuration statement.";
     }
@@ -2090,6 +2100,15 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) {
             /* rewriteConfigMarkAsProcessed is handled after the loop */
         }
 
+        /* sentinel master-reboot-down-after-period */
+        if (master->master_reboot_down_after_period != 0) {
+            line = sdscatprintf(sdsempty(),
+                "sentinel master-reboot-down-after-period %s %ld",
+                master->name, (long) master->master_reboot_down_after_period);
+            rewriteConfigRewriteLine(state,"sentinel master-reboot-down-after-period",line,1);
+            /* rewriteConfigMarkAsProcessed is handled after the loop */
+        }
+
         /* sentinel config-epoch */
         line = sdscatprintf(sdsempty(),
             "sentinel config-epoch %s %llu",
@@ -2214,6 +2233,7 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) {
     rewriteConfigMarkAsProcessed(state,"sentinel known-replica");
     rewriteConfigMarkAsProcessed(state,"sentinel known-sentinel");
     rewriteConfigMarkAsProcessed(state,"sentinel rename-command");
+    rewriteConfigMarkAsProcessed(state,"sentinel master-reboot-down-after-period");
 }
 
 /* This function uses the config rewriting Redis engine in order to persist
@@ -2456,6 +2476,12 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
             } else {
                 if (strncmp(ri->runid,l+7,40) != 0) {
                     sentinelEvent(LL_NOTICE,"+reboot",ri,"%@");
+
+                    if (ri->flags & SRI_MASTER && ri->master_reboot_down_after_period != 0) {
+                        ri->flags |= SRI_MASTER_REBOOT;
+                        ri->master_reboot_since_time = mstime();
+                    }
+
                     sdsfree(ri->runid);
                     ri->runid = sdsnewlen(l+7,40);
                 }
@@ -2723,6 +2749,10 @@ void sentinelPingReplyCallback(redisAsyncContext *c, void *reply, void *privdata
         {
             link->last_avail_time = mstime();
             link->act_ping_time = 0; /* Flag the pong as received. */
+
+            if (ri->flags & SRI_MASTER_REBOOT && strncmp(r->str,"PONG",4) == 0)
+                ri->flags &= ~SRI_MASTER_REBOOT;
+
         } else {
             /* Send a SCRIPT KILL command if the instance appears to be
              * down because of a busy script. */
@@ -4255,6 +4285,15 @@ void sentinelSetCommand(client *c) {
                 dictAdd(ri->renamed_commands,oldname,newname);
             }
             changes++;
+        } else if (!strcasecmp(option,"master-reboot-down-after-period") && moreargs > 0) {
+            /* master-reboot-down-after-period <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll < 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            ri->master_reboot_down_after_period = ll;
+            changes++;
         } else {
             addReplyErrorFormat(c,"Unknown option or number of arguments for "
                                   "SENTINEL SET '%s'", option);
@@ -4358,7 +4397,9 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) {
         (ri->flags & SRI_MASTER &&
          ri->role_reported == SRI_SLAVE &&
          mstime() - ri->role_reported_time >
-          (ri->down_after_period+sentinel_info_period*2)))
+          (ri->down_after_period+sentinel_info_period*2)) ||
+          (ri->flags & SRI_MASTER_REBOOT && 
+           mstime()-ri->master_reboot_since_time > ri->master_reboot_down_after_period))
     {
         /* Is subjectively down */
         if ((ri->flags & SRI_S_DOWN) == 0) {
diff --git a/tests/sentinel/tests/12-master-reboot.tcl b/tests/sentinel/tests/12-master-reboot.tcl
new file mode 100644
index 000000000..1fdd91d6a
--- /dev/null
+++ b/tests/sentinel/tests/12-master-reboot.tcl
@@ -0,0 +1,103 @@
+# Check that Sentinel fails over a master that was rebooted and keeps loading its dataset for longer than master-reboot-down-after-period.
+source "../tests/includes/init-tests.tcl"
+
+
+if {$::simulate_error} {
+    test "This test will fail" {
+        fail "Simulated error"
+    }
+}
+
+
+# Restart a previously killed instance right away, without checking whether it is still loading its dataset
+proc reboot_instance {type id} {
+    set dirname "${type}_${id}"
+    set cfgfile [file join $dirname $type.conf]
+    set port [get_instance_attrib $type $id port]
+
+    # Execute the instance with its old setup and record the new pid
+    # in ::pids for cleanup.
+    set pid [exec_instance $type $dirname $cfgfile]
+    set_instance_attrib $type $id pid $pid
+    lappend ::pids $pid
+
+    # Check that the instance is running
+    if {[server_is_up 127.0.0.1 $port 100] == 0} {
+        set logfile [file join $dirname log.txt]
+        puts [exec tail $logfile]
+        abort_sentinel_test "Problems starting $type #$id: ping timeout, maybe server start failed, check $logfile"
+    }
+
+    # Connect to it with a fresh link
+    set link [redis 127.0.0.1 $port 0 $::tls]
+    $link reconnect 1
+    set_instance_attrib $type $id link $link
+}
+
+
+test "Master reboot in very short time" {
+    set old_port [RPort $master_id]
+    set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+    assert {[lindex $addr 1] == $old_port}
+    
+    R $master_id debug populate 10000
+    R $master_id bgsave
+    R $master_id config set key-load-delay 1500
+    R $master_id config set loading-process-events-interval-bytes 1024
+    R $master_id config rewrite
+
+    foreach_sentinel_id id {
+        S $id SENTINEL SET mymaster master-reboot-down-after-period 5000
+        S $id sentinel debug ping-period 500
+        S $id sentinel debug ask-period 500 
+    }
+
+    kill_instance redis $master_id
+    reboot_instance redis $master_id
+    
+    foreach_sentinel_id id {        
+        wait_for_condition 1000 100 {
+            [lindex [S $id SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] 1] != $old_port
+        } else {
+            fail "At least one Sentinel did not receive failover info"
+        }
+    }
+
+    set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+    set master_id [get_instance_id_by_port redis [lindex $addr 1]]
+
+    # Make sure the instance has loaded the whole dataset
+    while 1 {
+        catch {[$link ping]} retval
+        if {[string match {*LOADING*} $retval]} {
+            after 100
+            continue
+        } else {
+            break
+        }
+    }
+}
+
+test "New master [join $addr {:}] role matches" {
+    assert {[RI $master_id role] eq {master}}
+}
+
+test "All the other slaves now point to the new master" {
+    foreach_redis_id id {
+        if {$id != $master_id && $id != 0} {
+            wait_for_condition 1000 50 {
+                [RI $id master_port] == [lindex $addr 1]
+            } else {
+                fail "Redis ID $id not configured to replicate with new master"
+            }
+        }
+    }
+}
+
+test "The old master eventually gets reconfigured as a slave" {
+    wait_for_condition 1000 50 {
+        [RI 0 master_port] == [lindex $addr 1]
+    } else {
+        fail "Old master not reconfigured as slave of new master"
+    }
+}
+\ No newline at end of file
author	Wen Hui <wen.hui.ware@gmail.com>	2021-11-30 11:46:15 -0500
committer	GitHub <noreply@github.com>	2021-11-30 18:46:15 +0200
commit	2afa41f62847b92ba66f5579b4731494fac3afc3 (patch)
tree	af2f348618efddd5b7a06ec5acfbd7070c92ed82
parent	af072c26bcf62dc4be1cfee7c2ca7242ac2d16aa (diff)
download	redis-2afa41f62847b92ba66f5579b4731494fac3afc3.tar.gz