Cluster: ability to prevent slaves from failing over their masters.

This commit, in some parts derived from PR #3041 which is no longer possible to merge (because the user deleted the original branch), implements the ability of slaves to have a special configuration that prevents them from trying to start a failover when the master is failing. There are multiple reasons for wanting this, and the feature was requested in issue #3021 some time ago. The differences between this patch and the original PR are the following: 1. The flag is saved/loaded on the nodes configuration. 2. The 'myself' node is now flag-aware, the flag is updated as needed when the configuration is changed via CONFIG SET. 3. The flag name uses NOFAILOVER instead of NO_FAILOVER to be consistent with existing NOADDR. 4. The redis.conf documentation was rewritten. Thanks to @deep011 for the original patch.
author: antirez <antirez@gmail.com> 2018-03-14 13:46:36 +0100
committer: antirez <antirez@gmail.com> 2018-03-14 16:31:46 +0100
commit: 70597a3011c6a392f81975bdda3a49f942d8e342 (patch)
tree: 3600ded787a9840a1eb85be83b736d4ec8b036e1
parent: 16cad10a0c845a7dbc6156b0ef144ab2470c3ce5 (diff)
download: redis-70597a3011c6a392f81975bdda3a49f942d8e342.tar.gz
6 files changed, 70 insertions, 2 deletions
diff --git a/redis.conf b/redis.conf
index 53984826d..43562c5bc 100644
--- a/redis.conf
+++ b/redis.conf
@@ -904,6 +904,16 @@ lua-time-limit 5000
 #
 # cluster-require-full-coverage yes
 
+# This option, when set to yes, prevents slaves from trying to failover their
+# master during master failures. However the slave can still perform a
+# manual failover, if forced to do so.
+#
+# This is useful in different scenarios, especially in the case of multiple
+# data center operations, where we want one side to never be promoted except
+# in the case of a total DC failure.
+#
+# cluster-slave-no-failover no
+
 # In order to setup your cluster make sure to read the documentation
 # available at http://redis.io web site.
 
diff --git a/src/cluster.c b/src/cluster.c
index b6fb19523..f853ff927 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -201,6 +201,8 @@ int clusterLoadConfig(char *filename) {
                 n->flags |= CLUSTER_NODE_HANDSHAKE;
             } else if (!strcasecmp(s,"noaddr")) {
                 n->flags |= CLUSTER_NODE_NOADDR;
+            } else if (!strcasecmp(s,"nofailover")) {
+                n->flags |= CLUSTER_NODE_NOFAILOVER;
             } else if (!strcasecmp(s,"noflags")) {
                 /* nothing to do */
             } else {
@@ -407,6 +409,22 @@ int clusterLockConfig(char *filename) {
     return C_OK;
 }
 
+/* Sync the config-driven flags of the "myself" node (currently just the
+ * NOFAILOVER flag) with the current configuration of the node, that may
+ * change at runtime via CONFIG SET. When myself->flags actually changes,
+ * clusterDoBeforeSleep() is called with the SAVE_CONFIG and UPDATE_STATE
+ * TODO flags. */
+void clusterUpdateMyselfFlags(void) {
+    int oldflags = myself->flags; /* Snapshot used to detect changes. */
+    int nofailover = server.cluster_slave_no_failover ?
+                     CLUSTER_NODE_NOFAILOVER : 0;
+    myself->flags &= ~CLUSTER_NODE_NOFAILOVER; /* Clear the bit... */
+    myself->flags |= nofailover; /* ...then set it again if configured. */
+    if (myself->flags != oldflags) {
+        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
+                             CLUSTER_TODO_UPDATE_STATE);
+    }
+
 void clusterInit(void) {
     int saveconf = 0;
 
@@ -497,6 +515,7 @@ void clusterInit(void) {
 
     server.cluster->mf_end = 0;
     resetManualFailover();
+    clusterUpdateMyselfFlags();
 }
 
 /* Reset a node performing a soft or hard reset:
@@ -1808,6 +1827,18 @@ int clusterProcessPacket(clusterLink *link) {
             }
         }
 
+        /* Copy the CLUSTER_NODE_NOFAILOVER flag from what the sender
+         * announced. This is a dynamic flag that we receive from the
+         * sender, and the latest status must be trusted. We need it to
+         * be propagated because the slave ranking used to understand the
+         * delay of each slave in the voting process, needs to know
+         * what are the instances really competing. */
+        if (sender) {
+            int nofailover = flags & CLUSTER_NODE_NOFAILOVER;
+            sender->flags &= ~CLUSTER_NODE_NOFAILOVER;
+            sender->flags |= nofailover;
+        }
+
         /* Update the node address if it changed. */
         if (sender && type == CLUSTERMSG_TYPE_PING &&
             !nodeInHandshake(sender) &&
@@ -2723,6 +2754,7 @@ int clusterGetSlaveRank(void) {
     myoffset = replicationGetSlaveOffset();
     for (j = 0; j < master->numslaves; j++)
         if (master->slaves[j] != myself &&
+            !nodeCantFailover(master->slaves[j]) &&
             master->slaves[j]->repl_offset > myoffset) rank++;
     return rank;
 }
@@ -2860,10 +2892,13 @@ void clusterHandleSlaveFailover(void) {
      * of an automatic or manual failover:
      * 1) We are a slave.
      * 2) Our master is flagged as FAIL, or this is a manual failover.
-     * 3) It is serving slots. */
+     * 3) We don't have the no failover configuration set, or this is
+     *    a manual failover.
+     * 4) It is serving slots. */
     if (nodeIsMaster(myself) ||
         myself->slaveof == NULL ||
         (!nodeFailed(myself->slaveof) && !manual_failover) ||
+        (server.cluster_slave_no_failover && !manual_failover) ||
         myself->slaveof->numslots == 0)
     {
         /* There are no reasons to failover, so we set the reason why we
@@ -3239,6 +3274,9 @@ void clusterCron(void) {
     handshake_timeout = server.cluster_node_timeout;
     if (handshake_timeout < 1000) handshake_timeout = 1000;
 
+    /* Update myself flags. */
+    clusterUpdateMyselfFlags();
+
     /* Check if we have disconnected nodes and re-establish the connection.
      * Also update a few stats while we are here, that can be used to make
      * better decisions in other part of the code. */
@@ -3837,7 +3875,8 @@ static struct redisNodeFlags redisNodeFlagsTable[] = {
     {CLUSTER_NODE_PFAIL,        "fail?,"},
     {CLUSTER_NODE_FAIL,         "fail,"},
     {CLUSTER_NODE_HANDSHAKE,    "handshake,"},
-    {CLUSTER_NODE_NOADDR,       "noaddr,"}
+    {CLUSTER_NODE_NOADDR,       "noaddr,"},
+    {CLUSTER_NODE_NOFAILOVER,   "nofailover,"}
 };
 
 /* Concatenate the comma separated list of node flags to the given SDS
diff --git a/src/cluster.h b/src/cluster.h
index af85841c9..f2b9a4ecf 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -16,6 +16,7 @@
 #define CLUSTER_DEFAULT_NODE_TIMEOUT 15000
 #define CLUSTER_DEFAULT_SLAVE_VALIDITY 10 /* Slave max data age factor. */
 #define CLUSTER_DEFAULT_REQUIRE_FULL_COVERAGE 1
+#define CLUSTER_DEFAULT_SLAVE_NO_FAILOVER 0 /* Failover by default. */
 #define CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */
 #define CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */
 #define CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */
@@ -55,6 +56,7 @@ typedef struct clusterLink {
 #define CLUSTER_NODE_NOADDR   64  /* We don't know the address of this node */
 #define CLUSTER_NODE_MEET 128     /* Send a MEET message to this node */
 #define CLUSTER_NODE_MIGRATE_TO 256 /* Master elegible for replica migration. */
+#define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. */
 #define CLUSTER_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
 
 #define nodeIsMaster(n) ((n)->flags & CLUSTER_NODE_MASTER)
@@ -64,6 +66,7 @@ typedef struct clusterLink {
 #define nodeWithoutAddr(n) ((n)->flags & CLUSTER_NODE_NOADDR)
 #define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL)
 #define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL)
+#define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER)
 
 /* Reasons why a slave is not able to failover. */
 #define CLUSTER_CANT_FAILOVER_NONE 0
diff --git a/src/config.c b/src/config.c
index 976f7e4bc..e283e29b7 100644
--- a/src/config.c
+++ b/src/config.c
@@ -635,6 +635,14 @@ void loadServerConfigFromString(char *config) {
                 err = "cluster slave validity factor must be zero or positive";
                 goto loaderr;
             }
+        } else if (!strcasecmp(argv[0],"cluster-slave-no-failover") &&
+                   argc == 2)
+        {
+            server.cluster_slave_no_failover = yesnotoi(argv[1]);
+            if (server.cluster_slave_no_failover == -1) {
+                err = "argument must be 'yes' or 'no'";
+                goto loaderr;
+            }
         } else if (!strcasecmp(argv[0],"lua-time-limit") && argc == 2) {
             server.lua_time_limit = strtoll(argv[1],NULL,10);
         } else if (!strcasecmp(argv[0],"slowlog-log-slower-than") &&
@@ -998,6 +1006,8 @@ void configSetCommand(client *c) {
     } config_set_bool_field(
       "cluster-require-full-coverage",server.cluster_require_full_coverage) {
     } config_set_bool_field(
+      "cluster-slave-no-failover",server.cluster_slave_no_failover) {
+    } config_set_bool_field(
       "aof-rewrite-incremental-fsync",server.aof_rewrite_incremental_fsync) {
     } config_set_bool_field(
       "aof-load-truncated",server.aof_load_truncated) {
@@ -1291,6 +1301,8 @@ void configGetCommand(client *c) {
     /* Bool (yes/no) values */
     config_get_bool_field("cluster-require-full-coverage",
             server.cluster_require_full_coverage);
+    config_get_bool_field("cluster-slave-no-failover",
+            server.cluster_slave_no_failover);
     config_get_bool_field("no-appendfsync-on-rewrite",
             server.aof_no_fsync_on_rewrite);
     config_get_bool_field("slave-serve-stale-data",
@@ -2023,6 +2035,7 @@ int rewriteConfig(char *path) {
     rewriteConfigYesNoOption(state,"cluster-enabled",server.cluster_enabled,0);
     rewriteConfigStringOption(state,"cluster-config-file",server.cluster_configfile,CONFIG_DEFAULT_CLUSTER_CONFIG_FILE);
     rewriteConfigYesNoOption(state,"cluster-require-full-coverage",server.cluster_require_full_coverage,CLUSTER_DEFAULT_REQUIRE_FULL_COVERAGE);
+    rewriteConfigYesNoOption(state,"cluster-slave-no-failover",server.cluster_slave_no_failover,CLUSTER_DEFAULT_SLAVE_NO_FAILOVER);
     rewriteConfigNumericalOption(state,"cluster-node-timeout",server.cluster_node_timeout,CLUSTER_DEFAULT_NODE_TIMEOUT);
     rewriteConfigNumericalOption(state,"cluster-migration-barrier",server.cluster_migration_barrier,CLUSTER_DEFAULT_MIGRATION_BARRIER);
     rewriteConfigNumericalOption(state,"cluster-slave-validity-factor",server.cluster_slave_validity_factor,CLUSTER_DEFAULT_SLAVE_VALIDITY);
diff --git a/src/server.c b/src/server.c
index 0d0b7ce0e..d70282394 100644
--- a/src/server.c
+++ b/src/server.c
@@ -1433,6 +1433,7 @@ void initServerConfig(void) {
     server.cluster_migration_barrier = CLUSTER_DEFAULT_MIGRATION_BARRIER;
     server.cluster_slave_validity_factor = CLUSTER_DEFAULT_SLAVE_VALIDITY;
     server.cluster_require_full_coverage = CLUSTER_DEFAULT_REQUIRE_FULL_COVERAGE;
+    server.cluster_slave_no_failover = CLUSTER_DEFAULT_SLAVE_NO_FAILOVER;
     server.cluster_configfile = zstrdup(CONFIG_DEFAULT_CLUSTER_CONFIG_FILE);
     server.cluster_announce_ip = CONFIG_DEFAULT_CLUSTER_ANNOUNCE_IP;
     server.cluster_announce_port = CONFIG_DEFAULT_CLUSTER_ANNOUNCE_PORT;
diff --git a/src/server.h b/src/server.h
index a607014fd..5abb8bdca 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1161,6 +1161,8 @@ struct redisServer {
     int cluster_slave_validity_factor; /* Slave max data age for failover. */
     int cluster_require_full_coverage; /* If true, put the cluster down if
                                           there is at least an uncovered slot.*/
+    int cluster_slave_no_failover;  /* Prevent slave from starting a failover
+                                       if the master is in failure state. */
     char *cluster_announce_ip;  /* IP address to announce on cluster bus. */
     int cluster_announce_port;     /* base port to announce on cluster bus. */
     int cluster_announce_bus_port; /* bus port to announce on cluster bus. */
author	antirez <antirez@gmail.com>	2018-03-14 13:46:36 +0100
committer	antirez <antirez@gmail.com>	2018-03-14 16:31:46 +0100
commit	70597a3011c6a392f81975bdda3a49f942d8e342 (patch)
tree	3600ded787a9840a1eb85be83b736d4ec8b036e1
parent	16cad10a0c845a7dbc6156b0ef144ab2470c3ce5 (diff)
download	redis-70597a3011c6a392f81975bdda3a49f942d8e342.tar.gz