diff options
-rw-r--r-- | src/cluster.c | 30 | ||||
-rw-r--r-- | src/cluster.h | 1 |
2 files changed, 28 insertions, 3 deletions
diff --git a/src/cluster.c b/src/cluster.c index 21da937a6..f154d8f4e 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -301,6 +301,7 @@ void clusterInit(void) { dictCreate(&clusterNodesBlackListDictType,NULL); server.cluster->failover_auth_time = 0; server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_rank = 0; server.cluster->failover_auth_epoch = 0; server.cluster->last_vote_epoch = 0; server.cluster->stats_bus_messages_sent = 0; @@ -2000,13 +2001,36 @@ void clusterHandleSlaveFailover(void) { { server.cluster->failover_auth_time = mstime() + 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */ - data_age / 10 + /* Add 100 milliseconds for every second of age. */ random() % 500; /* Random delay between 0 and 500 milliseconds. */ server.cluster->failover_auth_count = 0; server.cluster->failover_auth_sent = 0; + server.cluster->failover_auth_rank = clusterGetSlaveRank(); + /* We add another delay that is proportional to the slave rank. + * Specifically 1 second * rank. This way slaves that have a probably + * less updated replication offset, are penalized. */ + server.cluster->failover_auth_time += + server.cluster->failover_auth_rank * 1000; redisLog(REDIS_WARNING, - "Start of election delayed for %lld milliseconds.", - server.cluster->failover_auth_time - mstime()); + "Start of election delayed for %lld milliseconds (rank is #%d).", + server.cluster->failover_auth_time - mstime(), + server.cluster->failover_auth_rank); + return; + } + + /* It is possible that we received more updated offsets from other + * slaves for the same master since we computed our election delay. + * Update the delay if our rank changed. */ + if (server.cluster->failover_auth_sent == 0) { + int newrank = clusterGetSlaveRank(); + if (newrank > server.cluster->failover_auth_rank) { + long long added_delay = + (newrank - server.cluster->failover_auth_rank) * 1000; + server.cluster->failover_auth_time += added_delay; + server.cluster->failover_auth_rank = newrank; + redisLog(REDIS_WARNING, + "Slave rank updated to #%d, added %lld milliseconds of delay.", + newrank, added_delay); + } return; } diff --git a/src/cluster.h b/src/cluster.h index 7ca20c05e..c07b37381 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -97,6 +97,7 @@ typedef struct clusterState { mstime_t failover_auth_time; /* Time of previous or next election. */ int failover_auth_count; /* Number of votes received so far. */ int failover_auth_sent; /* True if we already asked for votes. */ + int failover_auth_rank; /* This slave rank for current auth request. */ uint64_t failover_auth_epoch; /* Epoch of the current election. */ /* The followign fields are uesd by masters to take state on elections. */ uint64_t last_vote_epoch; /* Epoch of the last vote granted. */ |