diff options
Diffstat (limited to 'src/sentinel.c')
-rw-r--r-- | src/sentinel.c | 305 |
1 files changed, 263 insertions, 42 deletions
diff --git a/src/sentinel.c b/src/sentinel.c index f8e1e607d..1b9658f34 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -78,21 +78,27 @@ typedef struct sentinelAddr { #define SRI_SCRIPT_KILL_SENT (1<<12) /* SCRIPT KILL already sent on -BUSY */ /* Note: times are in milliseconds. */ -#define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 -#define SENTINEL_ASK_PERIOD 1000 -#define SENTINEL_PUBLISH_PERIOD 2000 -#define SENTINEL_DEFAULT_DOWN_AFTER 30000 + +static mstime_t sentinel_info_period = 10000; +static mstime_t sentinel_ping_period = SENTINEL_PING_PERIOD; +static mstime_t sentinel_ask_period = 1000; +static mstime_t sentinel_publish_period = 2000; +static mstime_t sentinel_default_down_after = 30000; +static mstime_t sentinel_tilt_trigger = 2000; +static mstime_t sentinel_tilt_period = SENTINEL_PING_PERIOD * 30; +static mstime_t sentinel_slave_reconf_timeout = 10000; +static mstime_t sentinel_min_link_reconnect_period = 15000; +static mstime_t sentinel_election_timeout = 10000; +static mstime_t sentinel_script_max_runtime = 60000; /* 60 seconds max exec time. */ +static mstime_t sentinel_script_retry_delay = 30000; /* 30 seconds between retries. 
*/ +static mstime_t sentinel_default_failover_timeout = 60*3*1000; + #define SENTINEL_HELLO_CHANNEL "__sentinel__:hello" -#define SENTINEL_TILT_TRIGGER 2000 -#define SENTINEL_TILT_PERIOD (SENTINEL_PING_PERIOD*30) #define SENTINEL_DEFAULT_SLAVE_PRIORITY 100 -#define SENTINEL_SLAVE_RECONF_TIMEOUT 10000 #define SENTINEL_DEFAULT_PARALLEL_SYNCS 1 -#define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000 -#define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*3*1000) #define SENTINEL_MAX_PENDING_COMMANDS 100 -#define SENTINEL_ELECTION_TIMEOUT 10000 + #define SENTINEL_MAX_DESYNC 1000 #define SENTINEL_DEFAULT_DENY_SCRIPTS_RECONFIG 1 #define SENTINEL_DEFAULT_RESOLVE_HOSTNAMES 0 @@ -123,9 +129,7 @@ typedef struct sentinelAddr { #define SENTINEL_SCRIPT_RUNNING 1 #define SENTINEL_SCRIPT_MAX_QUEUE 256 #define SENTINEL_SCRIPT_MAX_RUNNING 16 -#define SENTINEL_SCRIPT_MAX_RUNTIME 60000 /* 60 seconds max exec time. */ #define SENTINEL_SCRIPT_MAX_RETRY 10 -#define SENTINEL_SCRIPT_RETRY_DELAY 30000 /* 30 seconds between retries. */ /* SENTINEL SIMULATE-FAILURE command flags. */ #define SENTINEL_SIMFAILURE_NONE 0 @@ -891,7 +895,7 @@ void sentinelRunPendingScripts(void) { * starting from the second attempt to execute the script the delays are: * 30 sec, 60 sec, 2 min, 4 min, 8 min, 16 min, 32 min, 64 min, 128 min. 
*/ mstime_t sentinelScriptRetryDelay(int retry_num) { - mstime_t delay = SENTINEL_SCRIPT_RETRY_DELAY; + mstime_t delay = sentinel_script_retry_delay; while (retry_num-- > 1) delay *= 2; return delay; @@ -958,7 +962,7 @@ void sentinelKillTimedoutScripts(void) { sentinelScriptJob *sj = ln->value; if (sj->flags & SENTINEL_SCRIPT_RUNNING && - (now - sj->start_time) > SENTINEL_SCRIPT_MAX_RUNTIME) + (now - sj->start_time) > sentinel_script_max_runtime) { sentinelEvent(LL_WARNING,"-script-timeout",NULL,"%s %ld", sj->argv[0], (long)sj->pid); @@ -1335,7 +1339,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->s_down_since_time = 0; ri->o_down_since_time = 0; ri->down_after_period = master ? master->down_after_period : - SENTINEL_DEFAULT_DOWN_AFTER; + sentinel_default_down_after; ri->master_link_down_time = 0; ri->auth_pass = NULL; ri->auth_user = NULL; @@ -1361,7 +1365,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = 0; ri->failover_start_time = 0; - ri->failover_timeout = SENTINEL_DEFAULT_FAILOVER_TIMEOUT; + ri->failover_timeout = sentinel_default_failover_timeout; ri->failover_delay_logged = 0; ri->promoted_slave = NULL; ri->notification_script = NULL; @@ -2095,7 +2099,7 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { /* rewriteConfigMarkAsProcessed is handled after the loop */ /* sentinel down-after-milliseconds */ - if (master->down_after_period != SENTINEL_DEFAULT_DOWN_AFTER) { + if (master->down_after_period != sentinel_default_down_after) { line = sdscatprintf(sdsempty(), "sentinel down-after-milliseconds %s %ld", master->name, (long) master->down_after_period); @@ -2104,7 +2108,7 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { } /* sentinel failover-timeout */ - if (master->failover_timeout != SENTINEL_DEFAULT_FAILOVER_TIMEOUT) { + if (master->failover_timeout 
!= sentinel_default_failover_timeout) { line = sdscatprintf(sdsempty(), "sentinel failover-timeout %s %ld", master->name, (long) master->failover_timeout); @@ -2402,7 +2406,7 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) { instanceLink *link = ri->link; mstime_t now = mstime(); - if (now - ri->link->last_reconn_time < SENTINEL_PING_PERIOD) return; + if (now - ri->link->last_reconn_time < sentinel_ping_period) return; ri->link->last_reconn_time = now; /* Commands connection. */ @@ -2490,7 +2494,7 @@ int sentinelMasterLooksSane(sentinelRedisInstance *master) { master->flags & SRI_MASTER && master->role_reported == SRI_MASTER && (master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 && - (mstime() - master->info_refresh) < SENTINEL_INFO_PERIOD*2; + (mstime() - master->info_refresh) < sentinel_info_period*2; } /* Process the INFO output from masters. */ @@ -2683,7 +2687,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* A slave turned into a master. We want to force our view and * reconfigure as slave. Wait some time after the change before * going forward, to receive new configs if any. */ - mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; + mstime_t wait_time = sentinel_publish_period*4; if (!(ri->flags & SRI_PROMOTED) && sentinelMasterLooksSane(ri->master) && @@ -3024,8 +3028,8 @@ void sentinelForceHelloUpdateDictOfRedisInstances(dict *instances) { di = dictGetSafeIterator(instances); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *ri = dictGetVal(de); - if (ri->last_pub_time >= (SENTINEL_PUBLISH_PERIOD+1)) - ri->last_pub_time -= (SENTINEL_PUBLISH_PERIOD+1); + if (ri->last_pub_time >= (sentinel_publish_period+1)) + ri->last_pub_time -= (sentinel_publish_period+1); } dictReleaseIterator(di); } @@ -3040,8 +3044,8 @@ void sentinelForceHelloUpdateDictOfRedisInstances(dict *instances) { * to the other Sentinels ASAP. 
*/
 int sentinelForceHelloUpdateForMaster(sentinelRedisInstance *master) {
     if (!(master->flags & SRI_MASTER)) return C_ERR;
-    if (master->last_pub_time >= (SENTINEL_PUBLISH_PERIOD+1))
-        master->last_pub_time -= (SENTINEL_PUBLISH_PERIOD+1);
+    if (master->last_pub_time >= (sentinel_publish_period+1))
+        master->last_pub_time -= (sentinel_publish_period+1);
     sentinelForceHelloUpdateDictOfRedisInstances(master->sentinels);
     sentinelForceHelloUpdateDictOfRedisInstances(master->slaves);
     return C_OK;
@@ -3104,14 +3108,14 @@ void sentinelSendPeriodicCommands(sentinelRedisInstance *ri) {
     {
         info_period = 1000;
     } else {
-        info_period = SENTINEL_INFO_PERIOD;
+        info_period = sentinel_info_period;
     }
 
     /* We ping instances every time the last received pong is older than
      * the configured 'down-after-milliseconds' time, but every second
      * anyway if 'down-after-milliseconds' is greater than 1 second. */
     ping_period = ri->down_after_period;
-    if (ping_period > SENTINEL_PING_PERIOD) ping_period = SENTINEL_PING_PERIOD;
+    if (ping_period > sentinel_ping_period) ping_period = sentinel_ping_period;
 
     /* Send INFO to masters and slaves, not sentinels. */
     if ((ri->flags & SRI_SENTINEL) == 0 &&
@@ -3131,7 +3135,7 @@ void sentinelSendPeriodicCommands(sentinelRedisInstance *ri) {
     }
 
     /* PUBLISH hello messages to all the three kinds of instances. */
-    if ((now - ri->last_pub_time) > SENTINEL_PUBLISH_PERIOD) {
+    if ((now - ri->last_pub_time) > sentinel_publish_period) {
         sentinelSendHello(ri);
     }
 }
@@ -3445,6 +3449,215 @@ void addReplySentinelRedisInstance(client *c, sentinelRedisInstance *ri) {
     setDeferredMapLen(c,mbl,fields);
 }
 
+/* SENTINEL DEBUG <option> <value> [<option> <value> ...]
+ *
+ * Overrides the global Sentinel timing parameters named by the options.
+ * Each value must be an integer greater than zero (milliseconds). Pairs are
+ * applied left to right, so values assigned before a rejected pair remain
+ * in effect. On success +OK is sent; on the first unknown option, missing
+ * value or invalid value an error is sent and processing stops. */
+void sentinelSetDebugConfigParameters(client *c){
+
+    int j;
+    int badarg = 0; /* Bad argument position for error reporting. */
+    char *option;
+
+    /* Process option - value pairs. */
+    for (j = 2; j < c->argc; j++) {
+        int moreargs = (c->argc-1) - j;
+        option = c->argv[j]->ptr;
+        long long ll;
+
+        if (!strcasecmp(option,"info-period") && moreargs > 0) {
+            /* info-period <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll <= 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            sentinel_info_period = ll;
+        } else if (!strcasecmp(option,"ping-period") && moreargs > 0) {
+            /* ping-period <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll <= 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            sentinel_ping_period = ll;
+        } else if (!strcasecmp(option,"ask-period") && moreargs > 0) {
+            /* ask-period <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll <= 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            sentinel_ask_period = ll;
+        } else if (!strcasecmp(option,"publish-period") && moreargs > 0) {
+            /* publish-period <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll <= 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            sentinel_publish_period = ll;
+        } else if (!strcasecmp(option,"default-down-after") && moreargs > 0) {
+            /* default-down-after <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll <= 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            sentinel_default_down_after = ll;
+        } else if (!strcasecmp(option,"tilt-trigger") && moreargs > 0) {
+            /* tilt-trigger <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll <= 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            sentinel_tilt_trigger = ll;
+        } else if (!strcasecmp(option,"tilt-period") && moreargs > 0) {
+            /* tilt-period <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll <= 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            sentinel_tilt_period = ll;
+        } else if (!strcasecmp(option,"slave-reconf-timeout") && moreargs > 0) {
+            /* slave-reconf-timeout <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll <= 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            sentinel_slave_reconf_timeout = ll;
+        } else if (!strcasecmp(option,"min-link-reconnect-period") && moreargs > 0) {
+            /* min-link-reconnect-period <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll <= 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            sentinel_min_link_reconnect_period = ll;
+        } else if (!strcasecmp(option,"default-failover-timeout") && moreargs > 0) {
+            /* default-failover-timeout <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll <= 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            sentinel_default_failover_timeout = ll;
+        } else if (!strcasecmp(option,"election-timeout") && moreargs > 0) {
+            /* election-timeout <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll <= 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            sentinel_election_timeout = ll;
+        } else if (!strcasecmp(option,"script-max-runtime") && moreargs > 0) {
+            /* script-max-runtime <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll <= 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            sentinel_script_max_runtime = ll;
+        } else if (!strcasecmp(option,"script-retry-delay") && moreargs > 0) {
+            /* script-retry-delay <milliseconds> */
+            robj *o = c->argv[++j];
+            if (getLongLongFromObject(o,&ll) == C_ERR || ll <= 0) {
+                badarg = j;
+                goto badfmt;
+            }
+            sentinel_script_retry_delay = ll;
+        } else {
+            addReplyErrorFormat(c,"Unknown option or number of arguments for "
+                                  "SENTINEL DEBUG '%s'", option);
+            /* Stop at the first bad option: falling through would queue
+             * further replies (more errors, then +OK) for one command. */
+            return;
+        }
+
+    }
+
+    addReply(c,shared.ok);
+    return;
+
+badfmt: /* Bad format errors */
+    addReplyErrorFormat(c,"Invalid argument '%s' for SENTINEL DEBUG '%s'",
+        (char*)c->argv[badarg]->ptr,option);
+
+    return;
+}
+
+
+/* SENTINEL DEBUG without arguments: reply with a map of the tunable timing
+ * parameters and their current values. 'fields' counts the name/value pairs
+ * so the deferred map length can be set once all of them are emitted. */
+void addReplySentinelDebugInfo(client *c) {
+
+    void *mbl;
+    int fields = 0;
+
+    mbl = addReplyDeferredLen(c);
+
+    addReplyBulkCString(c,"INFO-PERIOD");
+    addReplyBulkLongLong(c,sentinel_info_period);
+    fields++;
+
+    addReplyBulkCString(c,"PING-PERIOD");
+    addReplyBulkLongLong(c,sentinel_ping_period);
+    fields++;
+
+    addReplyBulkCString(c,"ASK-PERIOD");
+    addReplyBulkLongLong(c,sentinel_ask_period);
+    fields++;
+
+    addReplyBulkCString(c,"PUBLISH-PERIOD");
+    addReplyBulkLongLong(c,sentinel_publish_period);
+    fields++;
+
+    addReplyBulkCString(c,"DEFAULT-DOWN-AFTER");
+    addReplyBulkLongLong(c,sentinel_default_down_after);
+    fields++;
+
+    addReplyBulkCString(c,"DEFAULT-FAILOVER-TIMEOUT");
+    addReplyBulkLongLong(c,sentinel_default_failover_timeout);
+    fields++;
+
+    addReplyBulkCString(c,"TILT-TRIGGER");
+    addReplyBulkLongLong(c,sentinel_tilt_trigger);
+    fields++;
+
+    addReplyBulkCString(c,"TILT-PERIOD");
+    addReplyBulkLongLong(c,sentinel_tilt_period);
+    fields++;
+
+    addReplyBulkCString(c,"SLAVE-RECONF-TIMEOUT");
+    addReplyBulkLongLong(c,sentinel_slave_reconf_timeout);
+    fields++;
+
+    addReplyBulkCString(c,"MIN-LINK-RECONNECT-PERIOD");
+    addReplyBulkLongLong(c,sentinel_min_link_reconnect_period);
+    fields++;
+
+    addReplyBulkCString(c,"ELECTION-TIMEOUT");
+    addReplyBulkLongLong(c,sentinel_election_timeout);
+    fields++;
+
+    addReplyBulkCString(c,"SCRIPT-MAX-RUNTIME");
+    addReplyBulkLongLong(c,sentinel_script_max_runtime);
+    fields++;
+
+    addReplyBulkCString(c,"SCRIPT-RETRY-DELAY");
+    addReplyBulkLongLong(c,sentinel_script_retry_delay);
+    fields++;
+
+    setDeferredMapLen(c,mbl,fields);
+}
+
 /* Output a number of instances contained inside a dictionary as
  * Redis protocol. 
*/ void addReplyDictOfRedisInstances(client *c, dict *instances) { @@ -3518,6 +3731,8 @@ void sentinelCommand(client *c) { " Set a global Sentinel configuration parameter.", "CONFIG GET <param>", " Get global Sentinel configuration parameter.", +"DEBUG", +" Show a list of configurable time parameters and their values (milliseconds).", "GET-MASTER-ADDR-BY-NAME <master-name>", " Return the ip and port number of the master with that name.", "FAILOVER <master-name>", @@ -3869,7 +4084,13 @@ NULL } } addReply(c,shared.ok); - } else { + } else if (!strcasecmp(c->argv[1]->ptr,"debug")) { + if(c->argc == 2) + addReplySentinelDebugInfo(c); + else + sentinelSetDebugConfigParameters(c); + } + else { addReplySubcommandSyntaxError(c); } return; @@ -4170,7 +4391,7 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) { * pending ping for more than half the timeout. */ if (ri->link->cc && (mstime() - ri->link->cc_conn_time) > - SENTINEL_MIN_LINK_RECONNECT_PERIOD && + sentinel_min_link_reconnect_period && ri->link->act_ping_time != 0 && /* There is a pending ping... */ /* The pending ping is delayed, and we did not receive * error replies as well. 
*/ @@ -4187,8 +4408,8 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) { */ if (ri->link->pc && (mstime() - ri->link->pc_conn_time) > - SENTINEL_MIN_LINK_RECONNECT_PERIOD && - (mstime() - ri->link->pc_last_activity) > (SENTINEL_PUBLISH_PERIOD*3)) + sentinel_min_link_reconnect_period && + (mstime() - ri->link->pc_last_activity) > (sentinel_publish_period*3)) { instanceLinkCloseConnection(ri->link,ri->link->pc); } @@ -4203,7 +4424,7 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) { (ri->flags & SRI_MASTER && ri->role_reported == SRI_SLAVE && mstime() - ri->role_reported_time > - (ri->down_after_period+SENTINEL_INFO_PERIOD*2))) + (ri->down_after_period+sentinel_info_period*2))) { /* Is subjectively down */ if ((ri->flags & SRI_S_DOWN) == 0) { @@ -4318,7 +4539,7 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int f int retval; /* If the master state from other sentinel is too old, we clear it. */ - if (elapsed > SENTINEL_ASK_PERIOD*5) { + if (elapsed > sentinel_ask_period*5) { ri->flags &= ~SRI_MASTER_DOWN; sdsfree(ri->leader); ri->leader = NULL; @@ -4332,7 +4553,7 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int f if ((master->flags & SRI_S_DOWN) == 0) continue; if (ri->link->disconnected) continue; if (!(flags & SENTINEL_ASK_FORCED) && - mstime() - ri->last_master_down_reply_time < SENTINEL_ASK_PERIOD) + mstime() - ri->last_master_down_reply_time < sentinel_ask_period) continue; /* Ask */ @@ -4697,16 +4918,16 @@ sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) { if (slave->flags & (SRI_S_DOWN|SRI_O_DOWN)) continue; if (slave->link->disconnected) continue; - if (mstime() - slave->link->last_avail_time > SENTINEL_PING_PERIOD*5) continue; + if (mstime() - slave->link->last_avail_time > sentinel_ping_period*5) continue; if (slave->slave_priority == 0) continue; /* If the master is in SDOWN state we get INFO for slaves every second. 
* Otherwise we get it with the usual period so we need to account for * a larger delay. */ if (master->flags & SRI_S_DOWN) - info_validity_time = SENTINEL_PING_PERIOD*5; + info_validity_time = sentinel_ping_period*5; else - info_validity_time = SENTINEL_INFO_PERIOD*3; + info_validity_time = sentinel_info_period*3; if (mstime() - slave->info_refresh > info_validity_time) continue; if (slave->master_link_down_time > max_master_down_time) continue; instance[instances++] = slave; @@ -4734,7 +4955,7 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { /* If I'm not the leader, and it is not a forced failover via * SENTINEL FAILOVER, then I can't continue with the failover. */ if (!isleader && !(ri->flags & SRI_FORCE_FAILOVER)) { - int election_timeout = SENTINEL_ELECTION_TIMEOUT; + int election_timeout = sentinel_election_timeout; /* The election timeout is the MIN between SENTINEL_ELECTION_TIMEOUT * and the configured failover timeout. */ @@ -4904,7 +5125,7 @@ void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) { * configuration later. */ if ((slave->flags & SRI_RECONF_SENT) && (mstime() - slave->slave_reconf_sent_time) > - SENTINEL_SLAVE_RECONF_TIMEOUT) + sentinel_slave_reconf_timeout) { sentinelEvent(LL_NOTICE,"-slave-reconf-sent-timeout",slave,"%@"); slave->flags &= ~SRI_RECONF_SENT; @@ -5004,7 +5225,7 @@ void sentinelHandleRedisInstance(sentinelRedisInstance *ri) { * TILT happens when we find something odd with the time, like a * sudden change in the clock. 
*/ if (sentinel.tilt) { - if (mstime()-sentinel.tilt_start_time < SENTINEL_TILT_PERIOD) return; + if (mstime()-sentinel.tilt_start_time < sentinel_tilt_period) return; sentinel.tilt = 0; sentinelEvent(LL_WARNING,"-tilt",NULL,"#tilt mode exited"); } @@ -5076,7 +5297,7 @@ void sentinelCheckTiltCondition(void) { mstime_t now = mstime(); mstime_t delta = now - sentinel.previous_time; - if (delta < 0 || delta > SENTINEL_TILT_TRIGGER) { + if (delta < 0 || delta > sentinel_tilt_trigger) { sentinel.tilt = 1; sentinel.tilt_start_time = mstime(); sentinelEvent(LL_WARNING,"+tilt",NULL,"#tilt mode entered"); |