summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorantirez <antirez@gmail.com>2013-04-26 11:11:21 +0200
committerantirez <antirez@gmail.com>2013-04-26 17:02:13 +0200
commit48ede0d84d43cc40addaa7de5e88daafe717abba (patch)
tree2635974e2b30959940be10d4621cbde483b1b842
parent1965e22aa1daf2321579dc0e2d14ac7e0cf31dbe (diff)
downloadredis-48ede0d84d43cc40addaa7de5e88daafe717abba.tar.gz
Sentinel: only demote old master into slave under certain conditions.
We used to always turn a master into a slave if the DEMOTE flag was set, as this was a resurrecting master instance. However the following race condition is possible for a Sentinel that got partitioned or had internal issues (tilt mode), and was not able to refresh its state in the meantime: 1) Sentinel X is running, master is instance "A". 2) "A" fails, sentinels will promote slave "B" as master. 3) Sentinel X goes down because of a network partition. 4) "A" returns available, Sentinels will demote it as a slave. 5) "B" fails, other Sentinels will promote slave "A" as master. 6) At this point Sentinel X comes back. When "X" comes back it thinks that: "B" is the master. "A" is the slave to demote. We want to avoid that Sentinel "X" will demote "A" into a slave. We also want that Sentinel "X" will detect that the conditions changed and will reconfigure itself to monitor the right master. There are two main ways for the Sentinel to reconfigure itself after this event: 1) If "B" is reachable and already configured as a slave by other sentinels, "X" will perform a redirection to "A". 2) If there are not the conditions to demote "A", the fact that "A" reports to be a master will trigger a failover detection in "X", that will end into a reconfiguration to monitor "A". However if the Sentinel was not reachable, its state may not be updated, so in case it tilted, or was partitioned from the master instance of the slave to demote, the new implementation waits some time (enough to guarantee we can detect the new INFO, and new DOWN conditions). If after some time there are still not the right conditions to demote the instance, the DEMOTE flag is cleared.
-rw-r--r--src/sentinel.c57
1 files changed, 46 insertions, 11 deletions
diff --git a/src/sentinel.c b/src/sentinel.c
index 6e592ae1e..959f26e35 100644
--- a/src/sentinel.c
+++ b/src/sentinel.c
@@ -405,7 +405,7 @@ void initSentinel(void) {
/* Initialize various data structures. */
sentinel.masters = dictCreate(&instancesDictType,NULL);
sentinel.tilt = 0;
- sentinel.tilt_start_time = mstime();
+ sentinel.tilt_start_time = 0;
sentinel.previous_time = mstime();
sentinel.running_scripts = 0;
sentinel.scripts_queue = listCreate();
@@ -1132,7 +1132,6 @@ int sentinelResetMastersByPattern(char *pattern, int flags) {
* TODO: make this reset so that original sentinels are re-added with
* same ip / port / runid.
*/
-
int sentinelResetMasterAndChangeAddress(sentinelRedisInstance *master, char *ip, int port) {
sentinelAddr *oldaddr, *newaddr;
@@ -1141,12 +1140,26 @@ int sentinelResetMasterAndChangeAddress(sentinelRedisInstance *master, char *ip,
sentinelResetMaster(master,SENTINEL_NO_FLAGS);
oldaddr = master->addr;
master->addr = newaddr;
+ master->o_down_since_time = 0;
+ master->s_down_since_time = 0;
+
/* Release the old address at the end so we are safe even if the function
* gets the master->addr->ip and master->addr->port as arguments. */
releaseSentinelAddr(oldaddr);
return REDIS_OK;
}
+/* Return non-zero if there was no SDOWN or ODOWN error associated to this
+ * instance in the latest 'ms' milliseconds. */
+int sentinelRedisInstanceNoDownFor(sentinelRedisInstance *ri, mstime_t ms) {
+ mstime_t most_recent;
+
+ most_recent = ri->s_down_since_time;
+ if (ri->o_down_since_time > most_recent)
+ most_recent = ri->o_down_since_time;
+ return most_recent == 0 || (mstime() - most_recent) > ms;
+}
+
/* ============================ Config handling ============================= */
char *sentinelHandleConfiguration(char **argv, int argc) {
sentinelRedisInstance *ri;
@@ -1466,17 +1479,37 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
/* Act if a slave turned into a master. */
if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) {
if (ri->flags & SRI_DEMOTE) {
- int retval;
+ /* If this sentinel was partitioned from the slave's master,
+ * or tilted recently, wait some time before to act,
+ * so that DOWN and roles info will be refreshed. */
+ if (!sentinelRedisInstanceNoDownFor(ri->master,
+ SENTINEL_INFO_PERIOD*2))
+ return;
+ if (mstime()-sentinel.tilt_start_time <
+ SENTINEL_TILT_PERIOD+ri->master->down_after_period*2)
+ return;
- /* Old master returned back? Turn it into a slave ASAP.
+ /* Old master returned back? Turn it into a slave ASAP if:
+ *
* We'll clear this flag only when we have the acknowledge
* that it's a slave again. */
- retval = redisAsyncCommand(ri->cc,
- sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %d",
- ri->master->addr->ip,
- ri->master->addr->port);
- if (retval == REDIS_OK)
- sentinelEvent(REDIS_NOTICE,"+demote-old-slave",ri,"%@");
+ if (ri->master->flags & SRI_MASTER &&
+ (ri->master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 &&
+ (mstime() - ri->master->info_refresh) < SENTINEL_INFO_PERIOD*2)
+ {
+ int retval;
+ retval = redisAsyncCommand(ri->cc,
+ sentinelDiscardReplyCallback, NULL, "SLAVEOF %s %d",
+ ri->master->addr->ip,
+ ri->master->addr->port);
+ if (retval == REDIS_OK)
+ sentinelEvent(REDIS_NOTICE,"+demote-old-slave",ri,"%@");
+ } else {
+ /* Otherwise if there are not the conditions to demote, we
+ * no longer trust the DEMOTE flag and remove it. */
+ ri->flags &= ~SRI_DEMOTE;
+ sentinelEvent(REDIS_NOTICE,"-demote-flag-cleared",ri,"%@");
+ }
} else if (!(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
(runid_changed || first_runid))
{
@@ -1533,6 +1566,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
ri->master->failover_state_change_time = mstime();
ri->master->promoted_slave = ri;
ri->flags |= SRI_PROMOTED;
+ ri->flags &= ~SRI_DEMOTE;
sentinelCallClientReconfScript(ri->master,SENTINEL_OBSERVER,
"start", ri->master->addr,ri->addr);
/* We are an observer, so we can only assume that the leader
@@ -1575,7 +1609,8 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
}
}
- /* Detect if the old master was demoted as slave. */
+ /* Detect if the old master was demoted as slave and generate the
+ * +slave event. */
if (role == SRI_SLAVE && ri->flags & SRI_DEMOTE) {
sentinelEvent(REDIS_NOTICE,"+slave",ri,"%@");
ri->flags &= ~SRI_DEMOTE;