Cluster: replica migration with delay.

We wait a fixed amount of time (5 seconds currently), much greater than the usual Cluster node-to-node communication latency, before migrating. This way, when a failover occurs, before detecting the new master as a target for migration, we give its natural slaves (the slaves of the failed-over master) time to announce that they switched to the new master, preventing a useless migration operation.
author: antirez <antirez@gmail.com> 2015-12-11 09:19:06 +0100
committer: antirez <antirez@gmail.com> 2015-12-11 09:23:15 +0100
commit: 6b8a6c2ad9094e2fa6416c9a0216248a64d9dd8d (patch)
tree: 3eac1bd2c066c4d12b07063e0157bb845ceefbe5
parent: 7399195c3f0b585d3cb16bb396fe975d761e8521 (diff)
download: redis-6b8a6c2ad9094e2fa6416c9a0216248a64d9dd8d.tar.gz
2 files changed, 39 insertions, 17 deletions
diff --git a/src/cluster.c b/src/cluster.c
index 69629182e..15211d259 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -671,6 +671,7 @@ clusterNode *createClusterNode(char *nodename, int flags) {
     node->port = 0;
     node->fail_reports = listCreate();
     node->voted_time = 0;
+    node->orphaned_time = 0;
     node->repl_offset_time = 0;
     node->repl_offset = 0;
     listSetFreeMethod(node->fail_reports,zfree);
@@ -2910,30 +2911,44 @@ void clusterHandleSlaveMigration(int max_slaves) {
 
     /* Step 3: Idenitfy a candidate for migration, and check if among the
      * masters with the greatest number of ok slaves, I'm the one with the
-     * smaller node ID.
+     * smallest node ID (the "candidate slave").
      *
-     * Note that this means that eventually a replica migration will occurr
+     * Note: this means that eventually a replica migration will occurr
      * since slaves that are reachable again always have their FAIL flag
-     * cleared. At the same time this does not mean that there are no
-     * race conditions possible (two slaves migrating at the same time), but
-     * this is extremely unlikely to happen, and harmless. */
+     * cleared, so eventually there must be a candidate. At the same time
+     * this does not mean that there are no race conditions possible (two
+     * slaves migrating at the same time), but this is unlikely to
+     * happen, and harmless when happens. */
     candidate = myself;
     di = dictGetSafeIterator(server.cluster->nodes);
     while((de = dictNext(di)) != NULL) {
         clusterNode *node = dictGetVal(de);
-        int okslaves;
+        int okslaves = 0, is_orphaned = 1;
 
-        /* Only iterate over working masters. */
-        if (nodeIsSlave(node) || nodeFailed(node)) continue;
-        /* We want to migrate only if this master used to have slaves or
-         * if failed over a master that had slaves. This way we only migrate
-         * to instances that were supposed to have replicas. */
-        if (!(node->flags & CLUSTER_NODE_MIGRATE_TO)) continue;
-        okslaves = clusterCountNonFailingSlaves(node);
+        /* We want to migrate only if this master is working, orphaned, and
+         * used to have slaves or if failed over a master that had slaves
+         * (MIGRATE_TO flag). This way we only migrate to instances that were
+         * supposed to have replicas. */
+        if (nodeIsSlave(node) || nodeFailed(node)) is_orphaned = 0;
+        if (!(node->flags & CLUSTER_NODE_MIGRATE_TO)) is_orphaned = 0;
 
-        if (okslaves == 0 && target == NULL && node->numslots > 0)
-            target = node;
+        /* Check number of working slaves. */
+        if (nodeIsMaster(node)) okslaves = clusterCountNonFailingSlaves(node);
+        if (okslaves > 0) is_orphaned = 0;
 
+        if (is_orphaned) {
+            if (!target && node->numslots > 0) target = node;
+
+            /* Track the starting time of the orphaned condition for this
+             * master. */
+            if (!node->orphaned_time) node->orphaned_time = mstime();
+        } else {
+            node->orphaned_time = 0;
+        }
+
+        /* Check if I'm the slave candidate for the migration: attached
+         * to a master with the maximum number of slaves and with the smallest
+         * node ID. */
         if (okslaves == max_slaves) {
             for (j = 0; j < node->numslaves; j++) {
                 if (memcmp(node->slaves[j]->name,
@@ -2948,8 +2963,13 @@ void clusterHandleSlaveMigration(int max_slaves) {
     dictReleaseIterator(di);
 
     /* Step 4: perform the migration if there is a target, and if I'm the
-     * candidate. */
-    if (target && candidate == myself) {
+     * candidate, but only if the master is continuously orphaned for a
+     * couple of seconds, so that during failovers, we give some time to
+     * the natural slaves of this instance to advertise their switch from
+     * the old master to the new one. */
+    if (target && candidate == myself &&
+        (mstime()-target->orphaned_time) > CLUSTER_SLAVE_MIGRATION_DELAY)
+    {
         serverLog(LL_WARNING,"Migrating to orphaned master %.40s",
             target->name);
         clusterSetMaster(target);
diff --git a/src/cluster.h b/src/cluster.h
index e8f9bff71..08a11e9a3 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -23,6 +23,7 @@
 #define CLUSTER_DEFAULT_MIGRATION_BARRIER 1
 #define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */
 #define CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */
+#define CLUSTER_SLAVE_MIGRATION_DELAY 5000 /* Delay for slave migration. */
 
 /* Redirection errors returned by getNodeByQuery(). */
 #define CLUSTER_REDIR_NONE 0          /* Node can serve the request. */
@@ -93,6 +94,7 @@ typedef struct clusterNode {
     mstime_t fail_time;      /* Unix time when FAIL flag was set */
     mstime_t voted_time;     /* Last time we voted for a slave of this master */
     mstime_t repl_offset_time;  /* Unix time we received offset for this node */
+    mstime_t orphaned_time;     /* Starting time of orphaned master condition */
     long long repl_offset;      /* Last known repl offset for this node. */
     char ip[NET_IP_STR_LEN];  /* Latest known IP address of this node */
     int port;                   /* Latest known port of this node */
author	antirez <antirez@gmail.com>	2015-12-11 09:19:06 +0100
committer	antirez <antirez@gmail.com>	2015-12-11 09:23:15 +0100
commit	6b8a6c2ad9094e2fa6416c9a0216248a64d9dd8d (patch)
tree	3eac1bd2c066c4d12b07063e0157bb845ceefbe5
parent	7399195c3f0b585d3cb16bb396fe975d761e8521 (diff)
download	redis-6b8a6c2ad9094e2fa6416c9a0216248a64d9dd8d.tar.gz