2 files changed, 28 insertions, 3 deletions
diff --git a/src/cluster.c b/src/cluster.c
index 21da937a6..f154d8f4e 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -301,6 +301,7 @@ void clusterInit(void) {
         dictCreate(&clusterNodesBlackListDictType,NULL);
     server.cluster->failover_auth_time = 0;
     server.cluster->failover_auth_count = 0;
+    server.cluster->failover_auth_rank = 0;
     server.cluster->failover_auth_epoch = 0;
     server.cluster->last_vote_epoch = 0;
     server.cluster->stats_bus_messages_sent = 0;
@@ -2000,13 +2001,36 @@ void clusterHandleSlaveFailover(void) {
     {
         server.cluster->failover_auth_time = mstime() +
             500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
-            data_age / 10 + /* Add 100 milliseconds for every second of age. */
             random() % 500; /* Random delay between 0 and 500 milliseconds. */
         server.cluster->failover_auth_count = 0;
         server.cluster->failover_auth_sent = 0;
+        server.cluster->failover_auth_rank = clusterGetSlaveRank();
+        /* We add another delay that is proportional to the slave rank.
+         * Specifically 1 second * rank. This way slaves that have a probably
+         * less updated replication offset, are penalized. */
+        server.cluster->failover_auth_time +=
+            server.cluster->failover_auth_rank * 1000;
         redisLog(REDIS_WARNING,
-            "Start of election delayed for %lld milliseconds.",
-            server.cluster->failover_auth_time - mstime());
+            "Start of election delayed for %lld milliseconds (rank is #%d).",
+            server.cluster->failover_auth_time - mstime(),
+            server.cluster->failover_auth_rank);
+        return;
+    }
+
+    /* It is possible that we received more updated offsets from other
+     * slaves for the same master since we computed our election delay.
+     * Update the delay if our rank changed. */
+    if (server.cluster->failover_auth_sent == 0) {
+        int newrank = clusterGetSlaveRank();
+        if (newrank > server.cluster->failover_auth_rank) {
+            long long added_delay =
+                (newrank - server.cluster->failover_auth_rank) * 1000;
+            server.cluster->failover_auth_time += added_delay;
+            server.cluster->failover_auth_rank = newrank;
+            redisLog(REDIS_WARNING,
+                "Slave rank updated to #%d, added %lld milliseconds of delay.",
+                newrank, added_delay);
+        }
         return;
     }
 
diff --git a/src/cluster.h b/src/cluster.h
index 7ca20c05e..c07b37381 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -97,6 +97,7 @@ typedef struct clusterState {
     mstime_t failover_auth_time; /* Time of previous or next election. */
     int failover_auth_count;    /* Number of votes received so far. */
     int failover_auth_sent;     /* True if we already asked for votes. */
+    int failover_auth_rank;     /* This slave rank for current auth request. */
     uint64_t failover_auth_epoch; /* Epoch of the current election. */
     /* The followign fields are uesd by masters to take state on elections. */
     uint64_t last_vote_epoch;   /* Epoch of the last vote granted. */