Cluster: added progressive election delay according to slave rank.

Note that when we compute the initial delay, there is probably still more up-to-date information to be received from slaves with newer offsets, so the delay is recomputed whenever new data becomes available.
author: antirez <antirez@gmail.com> 2014-01-29 16:51:11 +0100
committer: antirez <antirez@gmail.com> 2014-01-29 16:53:45 +0100
commit: 940531e9b71dc1538cdaf96744a2b87e339ce836 (patch)
tree: 8dc8f9277b08b5be88d05a44b73d258a569ad175 /src
parent: 6f54032080d03c8e0145f9f816ce21879b2c98df (diff)
download: redis-940531e9b71dc1538cdaf96744a2b87e339ce836.tar.gz
2 files changed, 28 insertions, 3 deletions
diff --git a/src/cluster.c b/src/cluster.c
index 21da937a6..f154d8f4e 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -301,6 +301,7 @@ void clusterInit(void) {
         dictCreate(&clusterNodesBlackListDictType,NULL);
     server.cluster->failover_auth_time = 0;
     server.cluster->failover_auth_count = 0;
+    server.cluster->failover_auth_rank = 0;
     server.cluster->failover_auth_epoch = 0;
     server.cluster->last_vote_epoch = 0;
     server.cluster->stats_bus_messages_sent = 0;
@@ -2000,13 +2001,36 @@ void clusterHandleSlaveFailover(void) {
     {
         server.cluster->failover_auth_time = mstime() +
             500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
-            data_age / 10 + /* Add 100 milliseconds for every second of age. */
             random() % 500; /* Random delay between 0 and 500 milliseconds. */
         server.cluster->failover_auth_count = 0;
         server.cluster->failover_auth_sent = 0;
+        server.cluster->failover_auth_rank = clusterGetSlaveRank();
+        /* We add another delay that is proportional to the slave rank.
+         * Specifically 1 second * rank. This way slaves that have a probably
+         * less updated replication offset, are penalized. */
+        server.cluster->failover_auth_time +=
+            server.cluster->failover_auth_rank * 1000;
         redisLog(REDIS_WARNING,
-            "Start of election delayed for %lld milliseconds.",
-            server.cluster->failover_auth_time - mstime());
+            "Start of election delayed for %lld milliseconds (rank is #%d).",
+            server.cluster->failover_auth_time - mstime(),
+            server.cluster->failover_auth_rank);
+        return;
+    }
+
+    /* It is possible that we received more updated offsets from other
+     * slaves for the same master since we computed our election delay.
+     * Update the delay if our rank changed. */
+    if (server.cluster->failover_auth_sent == 0) {
+        int newrank = clusterGetSlaveRank();
+        if (newrank > server.cluster->failover_auth_rank) {
+            long long added_delay =
+                (newrank - server.cluster->failover_auth_rank) * 1000;
+            server.cluster->failover_auth_time += added_delay;
+            server.cluster->failover_auth_rank = newrank;
+            redisLog(REDIS_WARNING,
+                "Slave rank updated to #%d, added %lld milliseconds of delay.",
+                newrank, added_delay);
+        }
         return;
     }
 
diff --git a/src/cluster.h b/src/cluster.h
index 7ca20c05e..c07b37381 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -97,6 +97,7 @@ typedef struct clusterState {
     mstime_t failover_auth_time; /* Time of previous or next election. */
     int failover_auth_count;    /* Number of votes received so far. */
     int failover_auth_sent;     /* True if we already asked for votes. */
+    int failover_auth_rank;     /* This slave rank for current auth request. */
     uint64_t failover_auth_epoch; /* Epoch of the current election. */
     /* The followign fields are uesd by masters to take state on elections. */
     uint64_t last_vote_epoch;   /* Epoch of the last vote granted. */
author	antirez <antirez@gmail.com>	2014-01-29 16:51:11 +0100
committer	antirez <antirez@gmail.com>	2014-01-29 16:53:45 +0100
commit	940531e9b71dc1538cdaf96744a2b87e339ce836 (patch)
tree	8dc8f9277b08b5be88d05a44b73d258a569ad175 /src
parent	6f54032080d03c8e0145f9f816ce21879b2c98df (diff)
download	redis-940531e9b71dc1538cdaf96744a2b87e339ce836.tar.gz