/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2005, 2015 Oracle and/or its affiliates.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"

static db_timeout_t __repmgr_compute_response_time __P((ENV *));
static int __repmgr_elect __P((ENV *, u_int32_t, db_timespec *));
static int __repmgr_elect_main __P((ENV *,
    DB_THREAD_INFO *, REPMGR_RUNNABLE *));
static void *__repmgr_elect_thread __P((void *));

/*
 * Starts an election thread.
 *
 * If repmgr is already stopped the request is logged and ignored (returns 0).
 * Otherwise the first empty or finished slot in db_rep->elect_threads is
 * used; if there is none, the array is grown by one entry.  A finished
 * thread is joined first and its REPMGR_RUNNABLE is reused.
 *
 * Returns 0 on success, or the error from joining the old thread, from
 * memory allocation, or from starting the new thread.  When the thread
 * cannot be started its REPMGR_RUNNABLE is freed and the slot is left NULL.
 *
 * PUBLIC: int __repmgr_init_election __P((ENV *, u_int32_t));
 *
 * !!!
 * Caller must hold mutex.
 */
int
__repmgr_init_election(env, flags)
	ENV *env;
	u_int32_t flags;
{
	DB_REP *db_rep;
	REPMGR_RUNNABLE *th;
	int ret;
	u_int i, new_size;

	COMPQUIET(th, NULL);

	db_rep = env->rep_handle;
	if (db_rep->repmgr_status == stopped) {
		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
		    "ignoring elect thread request %#lx; repmgr is stopped",
		    (u_long)flags));
		return (0);
	}

	/* Find an available slot, indexed by 'i'; allocate more if needed. */
	for (i = 0; i < db_rep->aelect_threads; i++) {
		th = db_rep->elect_threads[i];
		if (th == NULL)
			break;
		if (th->finished) {
			if ((ret = __repmgr_thread_join(th)) != 0)
				return (ret);
			/* Reuse the space in a moment. */
			break;
		}
	}
	if (i == db_rep->aelect_threads) {
		/* No free slot: extend the slot array by a single entry. */
		new_size = db_rep->aelect_threads + 1;
		if ((ret = __os_realloc(env,
		    sizeof(REPMGR_RUNNABLE*) * new_size,
		    &db_rep->elect_threads)) != 0)
			return (ret);
		db_rep->aelect_threads = new_size;
		STAT(db_rep->region->mstat.st_max_elect_threads = new_size);
		th = db_rep->elect_threads[i] = NULL;
	}

	/* Only an empty slot needs a fresh runnable; a joined one is reused. */
	if (th == NULL &&
	    (ret = __os_malloc(env, sizeof(REPMGR_RUNNABLE), &th)) != 0)
		return (ret);
	th->run = __repmgr_elect_thread;
	th->args.flags = flags;

	if ((ret = __repmgr_thread_start(env, th)) == 0)
		STAT(db_rep->region->mstat.st_elect_threads++);
	else {
		__os_free(env, th);
		th = NULL;
	}
	/* Record the runnable, or NULL if the start failed, in the slot. */
	db_rep->elect_threads[i] = th;

	return (ret);
}

/*
 * Entry point of an election thread.
 *
 * 'argsp' is the thread's REPMGR_RUNNABLE.  The thread enters the
 * environment, runs __repmgr_elect_main(), and on any failure reports the
 * error and calls __repmgr_thread_failure().  In all cases 'finished' is
 * set on the runnable as the last step, which is what
 * __repmgr_init_election() tests before joining the thread and reusing
 * its slot.  Always returns NULL.
 */
static void *
__repmgr_elect_thread(argsp)
	void *argsp;
{
	REPMGR_RUNNABLE *th;
	ENV *env;
	DB_THREAD_INFO *ip;
	int ret;

	th = argsp;
	env = th->env;
	ip = NULL;
	ret = 0;

	ENV_ENTER_RET(env, ip, ret);
	if (ret == 0)
		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
		    "starting election thread"));

	if (ret != 0 || (ret = __repmgr_elect_main(env, ip, th)) != 0) {
		__db_err(env, ret, "election thread failed");
		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
		    "election thread is exiting"));
		ENV_LEAVE(env, ip);
		(void)__repmgr_thread_failure(env, ret);
	}
	if (ret == 0) {
		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
		    "election thread is exiting"));
		ENV_LEAVE(env, ip);
	}
	th->finished = TRUE;
	return (NULL);
}

/*
 * Body of an election thread.
 *
 * Reads the ELECT_F_* flags from th->args.flags and then:
 *   1. optionally fires DB_EVENT_REP_MASTER_FAILURE;
 *   2. with leases configured, pauses before doing anything else;
 *   3. claims the "preferred election thread" role;
 *   4. handles the preferred master mode special cases (temporary master
 *      restart, master site start-up, client site start-up);
 *   5. otherwise runs an immediate election if requested, and then loops,
 *      alternating rep_start(CLIENT) and elections with waits in between,
 *      until a master is known, repmgr stops, another thread becomes the
 *      preferred one, or an unexpected error occurs.
 *
 * Returns 0 or an error code.  With HAVE_STATISTICS the election thread
 * count is decremented on every exit path.
 */
static int
__repmgr_elect_main(env, ip, th)
	ENV *env;
	DB_THREAD_INFO *ip;
	REPMGR_RUNNABLE *th;
{
	DB_REP *db_rep;
	REP *rep;
#ifdef DB_WIN32
	DWORD duration;
	db_timeout_t t;
#else
	struct timespec deadline;
#endif
	db_timespec failtime, now, repstart_time, target, wait_til;
	db_timeout_t delay_time, response_time, tmp_time;
	u_long sec, usec;
	u_int32_t flags, max_tries, tries;
	int client_detected, done_repstart, lsnhist_match, master_detected;
	int ret, suppress_election;
	enum { ELECTION, REPSTART } action;

	COMPQUIET(usec, 0);
	COMPQUIET(max_tries, 0);
	COMPQUIET(action, ELECTION);

	db_rep = env->rep_handle;
	rep = db_rep->region;
	flags = th->args.flags;

	if (LF_ISSET(ELECT_F_EVENT_NOTIFY))
		DB_EVENT(env, DB_EVENT_REP_MASTER_FAILURE, NULL);

	/*
	 * If leases are enabled, delay the election to allow any straggler
	 * messages to get processed that might grant our lease again and
	 * fool the base code into thinking the master is still there.
	 * Any delay here offsets the time election code will wait for a
	 * lease grant to expire.  So with leases we're not adding more delay.
	 */
	if (FLD_ISSET(db_rep->region->config, REP_C_LEASE)) {
		/*
		 * Use the smallest of the lease timeout, ack timeout,
		 * or connection retry timeout.  We want to give straggler
		 * messages a chance to get processed, but get an election
		 * underway as soon as possible to find a master.
		 */
		if ((ret = __rep_get_timeout(env->dbenv,
		    DB_REP_LEASE_TIMEOUT, &delay_time)) != 0)
			goto out;
		if ((ret = __rep_get_timeout(env->dbenv,
		    DB_REP_ACK_TIMEOUT, &tmp_time)) != 0)
			goto out;
		if (tmp_time < delay_time)
			delay_time = tmp_time;
		if ((ret = __rep_get_timeout(env->dbenv,
		    DB_REP_CONNECTION_RETRY, &tmp_time)) != 0)
			goto out;
		if (tmp_time < delay_time)
			delay_time = tmp_time;
		sec = delay_time / US_PER_SEC;
		usec = delay_time % US_PER_SEC;
		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
		    "Election with leases pause sec %lu, usec %lu", sec, usec));
		__os_yield(env, sec, usec);
	}

	/*
	 * As a freshly started thread, lay claim to the title of being
	 * "preferred".  If an older thread is sleeping for retry, when it wakes
	 * up it will relinquish its role (since there's no need for multiple
	 * threads to sleep and retry).
	 */
	LOCK_MUTEX(db_rep->mutex);
	db_rep->preferred_elect_thr = th;
	UNLOCK_MUTEX(db_rep->mutex);

	/*
	 * In preferred master mode, the select thread signals when a
	 * client has lost its connection to the master via prefmas_pending,
	 * but the actual restart as temporary master is done here in an
	 * election thread.
	 */
	if (IS_PREFMAS_MODE(env) && F_ISSET(rep, REP_F_CLIENT) &&
	    db_rep->prefmas_pending == start_temp_master) {
		db_rep->prefmas_pending = no_action;
		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
		    "elect_main preferred master restart temp master"));
		ret = __repmgr_become_master(env, 0);
		goto out;
	}

	/* Get preferred master wait limits for detecting the other site. */
	if (IS_PREFMAS_MODE(env) &&
	    (ret = __repmgr_prefmas_get_wait(env, &max_tries, &usec)) != 0)
		goto out;

	/* Preferred master mode master site start-up. */
	if (IS_PREFMAS_MODE(env) &&
	    FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER) &&
	    LF_ISSET(ELECT_F_STARTUP)) {
		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
		    "elect_main preferred master site startup"));
		client_detected = FALSE;
		lsnhist_match = FALSE;
		tries = 0;
		/* Poll for a connection to the client, up to max_tries times. */
		while (!client_detected && tries < max_tries) {
			__os_yield(env, 0, usec);
			tries++;
			client_detected = __repmgr_prefmas_connected(env);
		}
		if (client_detected) {
			RPRINT(env, (env, DB_VERB_REPMGR_MISC,
			    "elect_main preferred master client detected"));
			/*
			 * Restart remote site as a client.  Depending on the
			 * outcome of lsnhist_match below, this site will
			 * either restart as master or it will start an
			 * election.  In either case, the remote site should
			 * be running as a client.
			 *
			 * Then perform the lsnhist_match comparison.
			 */
			if ((ret = __repmgr_restart_site_as_client(
			    env, 1)) != 0 ||
			    (ret = __repmgr_lsnhist_match(env,
			    ip, 1, &lsnhist_match)) != 0)
				goto out;
			/*
			 * An lsnhist_match means that we have a continuous
			 * set of transactions and it is safe to call a
			 * comparison election to preserve any temporary master
			 * transactions that were committed while this site
			 * was down.
			 */
			if (lsnhist_match) {
				F_CLR(rep, REP_F_HOLD_GEN);
				LF_SET(ELECT_F_IMMED);
				LF_CLR(ELECT_F_STARTUP);
				/* Continue on to election code below. */
			}
		}
		/*
		 * If we didn't detect a client within a reasonable time or
		 * we failed the lsnhist_match (meaning we have conflicting
		 * sets of transactions), we start this site as a master and
		 * possibly force rollback of temporary master transactions.
		 */
		if (!client_detected || !lsnhist_match) {
			RPRINT(env, (env, DB_VERB_REPMGR_MISC,
			    "elect_main preferred master site start master"));
			ret = __repmgr_become_master(env, 0);
			F_CLR(rep, REP_F_HOLD_GEN);
			goto out;
		}
	}

	/* Preferred master mode client site start-up. */
	if (IS_PREFMAS_MODE(env) &&
	    FLD_ISSET(rep->config, REP_C_PREFMAS_CLIENT) &&
	    LF_ISSET(ELECT_F_STARTUP)) {
		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
		    "elect_main preferred master client site startup"));
		master_detected = FALSE;
		tries = 0;
		/* Poll for a connection to the master, up to max_tries times. */
		while (!master_detected && tries < max_tries) {
			__os_yield(env, 0, usec);
			tries++;
			master_detected = __repmgr_prefmas_connected(env);
		}
		/*
		 * If we find the master, restart as client here so that we
		 * send a newclient message after we are connected to the
		 * master.  The master will send a newmaster message so that
		 * we can start the client sync process.
		 *
		 * If we haven't found the master after the timeout, start as
		 * temporary master.
		 */
		if (master_detected) {
			RPRINT(env, (env, DB_VERB_REPMGR_MISC,
			    "elect_main preferred master detected"));
			ret = __repmgr_become_client(env);
		} else {
			RPRINT(env, (env, DB_VERB_REPMGR_MISC,
			    "elect_main preferred master client start master"));
			ret = __repmgr_become_master(env, 0);
		}
		goto out;
	}

	/*
	 * The 'done_repstart' flag keeps track of which was our most recent
	 * operation (repstart or election), so that we can alternate
	 * appropriately.  There are a few different ways this thread can be
	 * invoked, and all but one specify some form of immediate election be
	 * called.  The one exception is at initial start-up, where we
	 * first probe for a master by sending out rep_start(CLIENT) calls.
	 */
	if (LF_ISSET(ELECT_F_IMMED) && !IS_VIEW_SITE(env)) {
		/*
		 * When the election succeeds, we've successfully completed
		 * everything we need to do.  If it fails in an unexpected way,
		 * we abort all processing as usual.  The only time we need to
		 * stay in here and do some more work is on DB_REP_UNAVAIL,
		 * in which case we want to wait a while and retry later.
		 */
		if ((ret = __repmgr_elect(env, flags, &failtime)) ==
		    DB_REP_UNAVAIL)
			done_repstart = FALSE;
		else
			goto out;
	} else {
		/*
		 * We didn't really have an election failure, because in this
		 * case we haven't even done an election yet.  But the timing
		 * we want turns out the same: we want to wait for the election
		 * retry time and then call for an election if nothing else
		 * interesting happens before then.
		 */
		__os_gettime(env, &failtime, 1);

		/*
		 * Although we didn't do a repstart in this thread, we know that
		 * our caller did one just before creating the thread.
		 */
		done_repstart = TRUE;
	}

	/*
	 * Retry loop.  The mutex is held at the top of every iteration; it is
	 * dropped around the election/rep_start calls and re-acquired before
	 * looping.
	 */
	LOCK_MUTEX(db_rep->mutex);
	for (;;) {
		ret = 0;

		if (db_rep->repmgr_status == stopped)
			goto unlock;

		/*
		 * If we've become the master (which could happen after an
		 * election in another election thread), or we find we have a
		 * working connection to a known master, then we're quite
		 * content: that's really the essential purpose of this whole
		 * thread.
		 */
		if (__repmgr_master_is_known(env))
			goto unlock;

		/*
		 * When circumstances force us to do an immediate election, we
		 * may be forced to create multiple threads in order to do so.
		 * But we certainly don't need multiple threads sleeping,
		 * alternating and retrying.  The "preferred election thread" is
		 * the one that has the authority and responsibility to
		 * persevere until our work is done.  Note that this role can
		 * switch from one thread to another, depending on the timing of
		 * events.  In particular, when an election fails the thread
		 * that got the failure becomes the chosen one that will remain
		 * to avenge the failure.
		 */
		if (db_rep->preferred_elect_thr != th)
			goto unlock;

		timespecclear(&wait_til);
		__os_gettime(env, &now, 1);

		/*
		 * See if it's time to retry the operation.  Normally it's an
		 * election we're interested in retrying.  But we refrain from
		 * calling for elections if so configured or we are a view.
		 */
		suppress_election = IS_VIEW_SITE(env) ||
		    (LF_ISSET(ELECT_F_STARTUP) ?
		    db_rep->init_policy == DB_REP_CLIENT :
		    !FLD_ISSET(rep->config, REP_C_ELECTIONS)) ||
		    LF_ISSET(ELECT_F_CLIENT_RESTART);
		repstart_time = db_rep->repstart_time;
		target = suppress_election ? repstart_time : failtime;
		TIMESPEC_ADD_DB_TIMEOUT(&target, rep->election_retry_wait);
		if (timespeccmp(&now, &target, >=)) {
			/*
			 * We've surpassed our target retry time.
			 * However, elections should generally alternate with
			 * rep_start calls, so do that if we haven't done one
			 * since the last election.
			 */
			action = suppress_election ? REPSTART :
			    (done_repstart ? ELECTION : REPSTART);
		} else if (db_rep->new_connection) {
			/* Seen a recent new connection, let's do rep_start. */
			action = REPSTART;
		} else
			wait_til = target;

		/*
		 * An action was chosen above.  Still hold off if the most
		 * recent rep_start has not yet had its response time.
		 */
		if (!timespecisset(&wait_til)) {
			response_time = __repmgr_compute_response_time(env);
			target = repstart_time;
			TIMESPEC_ADD_DB_TIMEOUT(&target, response_time);
			if (timespeccmp(&now, &target, <)) {
				/* We haven't waited long enough. */
				wait_til = target;
			}
		}

		if (timespecisset(&wait_til)) {
#ifdef DB_WIN32
			timespecsub(&wait_til, &now);
			DB_TIMESPEC_TO_TIMEOUT(t, &wait_til, TRUE);
			duration = t / US_PER_MS;
			if ((ret = SignalObjectAndWait(*db_rep->mutex,
			    db_rep->check_election, duration, FALSE)) !=
			    WAIT_OBJECT_0 && ret != WAIT_TIMEOUT)
				goto out;

			LOCK_MUTEX(db_rep->mutex);

			/*
			 * Although there could be multiple threads, only the
			 * "preferred" thread resets the event object.  If the
			 * others tried to do so, the preferred thread might
			 * miss the wake-up.  Another way of saying this is that
			 * the precise meaning of the check_election event is
			 * that "there may be some election-thread-related work
			 * to do, and the correct thread to do it has not yet
			 * been woken up".
			 */
			if (ret == WAIT_OBJECT_0 &&
			    db_rep->preferred_elect_thr == th &&
			    !ResetEvent(db_rep->check_election)) {
				ret = GetLastError();
				goto unlock;
			}
#else
			deadline.tv_sec = wait_til.tv_sec;
			deadline.tv_nsec = wait_til.tv_nsec;
			if ((ret = pthread_cond_timedwait(
			    &db_rep->check_election, db_rep->mutex, &deadline))
			    != ETIMEDOUT && ret != 0)
				goto unlock;
#endif
			/* Woken or timed out: re-evaluate everything. */
			continue;
		}

		/*
		 * NOTE(review): new_connection is cleared below after the
		 * mutex has been dropped, while it is tested above with the
		 * mutex held -- confirm this is intentional.
		 */
		UNLOCK_MUTEX(db_rep->mutex);
		if (action == ELECTION) {
			db_rep->new_connection = FALSE;
			if ((ret = __repmgr_elect(env, 0, &failtime)) ==
			    DB_REP_UNAVAIL)
				done_repstart = FALSE;
			else
				goto out;
			LOCK_MUTEX(db_rep->mutex);
			/* The thread whose election failed stays to retry. */
			db_rep->preferred_elect_thr = th;
		} else {
			DB_ASSERT(env, action == REPSTART);

			db_rep->new_connection = FALSE;
			if ((ret = __repmgr_repstart(env,
			    DB_REP_CLIENT, 0)) != 0)
				goto out;
			done_repstart = TRUE;

			LOCK_MUTEX(db_rep->mutex);
			__os_gettime(env, &db_rep->repstart_time, 1);
		}
	}

	/*
	 * Exit labels: "out" is reached with the mutex not held, "unlock" with
	 * it held.
	 */
#ifdef HAVE_STATISTICS
	/*
	 * We normally don't bother taking a mutex to increment statistics.  But
	 * in this case, since we're incrementing and decrementing in pairs, it
	 * could be very weird if we were "off somewhat".  For example, we could
	 * get a negative value.  And this is not a high-traffic, performance-
	 * critical path.
	 *     On the other hand, it suffices to take repmgr's (handle-based)
	 * mutex, rather than the rep mutex which normally protects shared
	 * memory, since all election thread activity must be occurring in the
	 * single listener process, under control of one single rep handle.
	 */
out:
	LOCK_MUTEX(db_rep->mutex);
unlock:
	rep->mstat.st_elect_threads--;
	UNLOCK_MUTEX(db_rep->mutex);
#else
unlock:
	UNLOCK_MUTEX(db_rep->mutex);
out:
#endif
	return (ret);
}

/*
 * Computes how long to wait for a response to a rep_start before trying
 * another operation.
 *
 * Returns the ack timeout when it is non-zero, acks are in use
 * (perm_policy is not DB_REPMGR_ACKS_NONE), this site's priority is
 * greater than zero, and the ack timeout is smaller than the election
 * timeout; otherwise returns the election timeout.
 */
static db_timeout_t
__repmgr_compute_response_time(env)
	ENV *env;
{
	DB_REP *db_rep;
	REP *rep;
	db_timeout_t ato, eto;

	db_rep = env->rep_handle;
	rep = db_rep->region;

	/*
	 * Avoid crowding operations too close together.  If we've just recently
	 * done a rep_start, wait a moment in case there's a master out there,
	 * to give it a chance to respond with a NEWMASTER message.  This is
	 * particularly an issue at start-up time, when we're likely to have
	 * several "new connection establishment" events bombarding us with lots
	 * of rep_start requests in quick succession.
	 *
	 * We don't have a separate user configuration for rep_start response,
	 * but it's reasonable to expect it to be similar to either the ack
	 * timeout or the election timeout, whichever is smaller.  However, only
	 * consider the ack timeout if all signs point to it being in use.
	 */
	ato = rep->ack_timeout;
	eto = rep->elect_timeout;
	if (ato > 0 &&
	    rep->perm_policy != DB_REPMGR_ACKS_NONE &&
	    rep->priority > 0 &&
	    ato < eto)
		return (ato);

	return (eto);
}

/*
 * Runs a single election and post-processes its result.
 *
 * 'flags' is examined for ELECT_F_INVITEE and ELECT_F_FAST, which adjust
 * the nsites value passed to __rep_elect_int().  'failtimep' receives the
 * current time when the election fails with DB_REP_UNAVAIL.
 *
 * Returns:
 *   0			the election succeeded (and, if a takeover was
 *			pending, __repmgr_claim_victory() succeeded), or
 *			the election returned DB_REP_IGNORE;
 *   DB_REP_UNAVAIL	the election failed that way, unless broadcasting
 *			the membership list afterwards failed, in which
 *			case that error is returned instead;
 *   other		an unexpected election failure, already reported
 *			through __db_err().
 */
static int
__repmgr_elect(env, flags, failtimep)
	ENV *env;
	u_int32_t flags;
	db_timespec *failtimep;
{
	DB_REP *db_rep;
	REP *rep;
	u_int32_t invitation, nsites, nvotes;
	int ret, t_ret;

	db_rep = env->rep_handle;
	nsites = db_rep->region->config_nsites;
	DB_ASSERT(env, nsites > 0);

	/*
	 * With only 2 sites in the group, even a single failure could make it
	 * impossible to get a majority.  So, fudge a little, unless the user
	 * really wants strict safety.
	 */
	if (nsites == 2 &&
	    !FLD_ISSET(db_rep->region->config, REP_C_2SITE_STRICT))
		nvotes = 1;
	else
		nvotes = ELECTION_MAJORITY(nsites);

	if (LF_ISSET(ELECT_F_INVITEE)) {
		/*
		 * We're going to the election party because we were invited by
		 * another site.  Accept the other site's suggested value, if
		 * it's reasonable.  (I.e., the other site may have wanted to do
		 * a "fast" election after losing contact with the master.  If
		 * so, let's not spoil it by imposing our own full nsites count
		 * on it.)
		 */
		rep = db_rep->region;
		invitation = rep->nsites;
		if (invitation == nsites || invitation == nsites - 1) {
			nsites = invitation;
		}
	}
	if (LF_ISSET(ELECT_F_FAST) && nsites > nvotes) {
		/*
		 * If we're doing an election because we noticed that the master
		 * failed, it's reasonable to expect that the master won't
		 * participate.  By not waiting for its vote, we can probably
		 * complete the election faster.  But note that we shouldn't
		 * allow this to affect nvotes calculation.
		 *
		 * However, if we have 2 sites, and strict majority is turned
		 * on, now nvotes would be 2, and it doesn't make sense to
		 * rep_elect to see nsites of 1 in that case.  So only decrement
		 * nsites if it currently exceeds nvotes.
		 */
		nsites--;
	}
	/* The rule for leases overrides all of the above. */
	if (IS_USING_LEASES(env))
		nsites = 0;

	switch (ret = __rep_elect_int(env, nsites, nvotes, 0)) {
	case DB_REP_UNAVAIL:
		__os_gettime(env, failtimep, 1);
		DB_EVENT(env, DB_EVENT_REP_ELECTION_FAILED, NULL);
		/*
		 * If an election fails with DB_REP_UNAVAIL, it could be
		 * because a participating site has an obsolete, too-high
		 * notion of the group size.  (This could happen if the site
		 * was down/disconnected during removal of some (other) sites.)
		 * To remedy this, broadcast a current copy of the membership
		 * list.  Since all sites are doing this, and we always ratchet
		 * to the most up-to-date version, this should bring all sites
		 * up to date.  We only do this after a failure, during what
		 * will normally be an idle period anyway, so that we don't
		 * slow down a first election following the loss of an active
		 * master.
		 */
		if ((t_ret = __repmgr_bcast_member_list(env)) != 0)
			ret = t_ret;
		break;

	case 0:
		if (db_rep->takeover_pending)
			ret = __repmgr_claim_victory(env);
		break;

	case DB_REP_IGNORE:
		/* Not treated as an error by callers: report success. */
		ret = 0;
		break;

	default:
		__db_err(env, ret, DB_STR("3629",
		    "unexpected election failure"));
		break;
	}
	return (ret);
}

/*
 * Becomes master after we've won an election, if we can.
 *
 * Clears the handle's takeover_pending flag and calls
 * __repmgr_become_master().  A DB_REP_UNAVAIL result from that call is
 * logged and converted to 0; any other result is returned unchanged.
 *
 * PUBLIC: int __repmgr_claim_victory __P((ENV *));
 */
int
__repmgr_claim_victory(env)
	ENV *env;
{
	int ret;

	env->rep_handle->takeover_pending = FALSE;
	if ((ret = __repmgr_become_master(env, 0)) == DB_REP_UNAVAIL) {
		ret = 0;
		RPRINT(env, (env, DB_VERB_REPMGR_MISC,
		    "Won election but lost race with DUPMASTER client intent"));
	}
	return (ret);
}

/*
 * When turning on elections in an already-running system, check to see if we're
 * in a state where we need an election (i.e., we would have started one
 * previously if elections hadn't been turned off), and if so start one.
 *
 * Nothing is started (and 0 is returned) when there is no selector thread,
 * elections are not configured, or a master is already known.  Otherwise
 * returns the result of __repmgr_init_election(), which is called with the
 * repmgr mutex held.
 *
 * PUBLIC: int __repmgr_turn_on_elections __P((ENV *));
 */
int
__repmgr_turn_on_elections(env)
	ENV *env;
{
	DB_REP *db_rep;
	REP *rep;
	int ret;

	db_rep = env->rep_handle;
	rep = db_rep->region;
	ret = 0;

	DB_ASSERT(env, REP_ON(env));
	LOCK_MUTEX(db_rep->mutex);
	if (db_rep->selector == NULL ||
	    !FLD_ISSET(rep->config, REP_C_ELECTIONS) ||
	    __repmgr_master_is_known(env))
		goto out;

	ret = __repmgr_init_election(env, ELECT_F_IMMED);

out:
	UNLOCK_MUTEX(db_rep->mutex);
	return (ret);
}