diff options
author | Greg Farnum <greg@inktank.com> | 2013-04-30 15:39:21 -0700 |
---|---|---|
committer | Greg Farnum <greg@inktank.com> | 2013-04-30 15:39:21 -0700 |
commit | 3cf5824f60b15cdf4db4e895b4da0d6c964b9ed4 (patch) | |
tree | 4e5232b0dcd486f5b80497b77fd3b615a50e0c36 | |
parent | cd1d6fb3f9b906f13cf281294d9272e1e92a0243 (diff) | |
parent | a97eccadf75cd390ae84ed13d679d67720fda37b (diff) | |
download | ceph-3cf5824f60b15cdf4db4e895b4da0d6c964b9ed4.tar.gz |
Merge branch 'wip-4837-election-syncing' into next
Reviewed-by: Sage Weil <sage@inktank.com>
-rw-r--r-- | src/messages/MMonElection.h | 23 |
-rw-r--r-- | src/mon/Elector.cc | 36 |
-rw-r--r-- | src/mon/Elector.h | 14 |
-rw-r--r-- | src/mon/Monitor.cc | 28 |
4 files changed, 43 insertions, 58 deletions
diff --git a/src/messages/MMonElection.h b/src/messages/MMonElection.h index 9771f6123d6..3d7dd4ec90e 100644 --- a/src/messages/MMonElection.h +++ b/src/messages/MMonElection.h @@ -45,19 +45,20 @@ public: bufferlist monmap_bl; set<int> quorum; uint64_t quorum_features; - version_t paxos_first_version; - version_t paxos_last_version; + /* the following were both used in the next branch for a while + * on user cluster, so we've left them in for compatibility. */ + version_t defunct_one; + version_t defunct_two; MMonElection() : Message(MSG_MON_ELECTION, HEAD_VERSION, COMPAT_VERSION), - op(0), epoch(0), quorum_features(0), paxos_first_version(0), - paxos_last_version(0) + op(0), epoch(0), quorum_features(0), defunct_one(0), + defunct_two(0) { } - MMonElection(int o, epoch_t e, MonMap *m, - version_t paxos_first, version_t paxos_last) + MMonElection(int o, epoch_t e, MonMap *m) : Message(MSG_MON_ELECTION, HEAD_VERSION, COMPAT_VERSION), fsid(m->fsid), op(o), epoch(e), quorum_features(0), - paxos_first_version(paxos_first), paxos_last_version(paxos_last) + defunct_one(0), defunct_two(0) { // encode using full feature set; we will reencode for dest later, // if necessary @@ -87,8 +88,8 @@ public: ::encode(monmap_bl, payload); ::encode(quorum, payload); ::encode(quorum_features, payload); - ::encode(paxos_first_version, payload); - ::encode(paxos_last_version, payload); + ::encode(defunct_one, payload); + ::encode(defunct_two, payload); } void decode_payload() { bufferlist::iterator p = payload.begin(); @@ -105,8 +106,8 @@ public: else quorum_features = 0; if (header.version >= 4) { - ::decode(paxos_first_version, p); - ::decode(paxos_last_version, p); + ::decode(defunct_one, p); + ::decode(defunct_two, p); } } diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc index b6f047e20d2..b9a7031848b 100644 --- a/src/mon/Elector.cc +++ b/src/mon/Elector.cc @@ -81,21 +81,18 @@ void Elector::start() electing_me = true; acked_me[mon->rank] = CEPH_FEATURES_ALL; leader_acked = -1; - 
acked_first_paxos_version = mon->paxos->get_first_committed(); // bcast to everyone else for (unsigned i=0; i<mon->monmap->size(); ++i) { if ((int)i == mon->rank) continue; - Message *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap, - mon->paxos->get_first_committed(), - mon->paxos->get_version()); + Message *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap); mon->messenger->send_message(m, mon->monmap->get_inst(i)); } reset_timer(); } -void Elector::defer(int who, version_t paxos_first) +void Elector::defer(int who) { dout(5) << "defer to " << who << dendl; @@ -107,11 +104,8 @@ void Elector::defer(int who, version_t paxos_first) // ack them leader_acked = who; - acked_first_paxos_version = paxos_first; ack_stamp = ceph_clock_now(g_ceph_context); - mon->messenger->send_message(new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap, - mon->paxos->get_first_committed(), - mon->paxos->get_version()), + mon->messenger->send_message(new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap), mon->monmap->get_inst(who)); // set a timer @@ -175,10 +169,7 @@ void Elector::victory() p != quorum.end(); ++p) { if (*p == mon->rank) continue; - MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch, - mon->monmap, - mon->paxos->get_first_committed(), - mon->paxos->get_version()); + MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch, mon->monmap); m->quorum = quorum; mon->messenger->send_message(m, mon->monmap->get_inst(*p)); } @@ -214,13 +205,10 @@ void Elector::handle_propose(MMonElection *m) } } - if ((mon->rank < from) && - // be careful that we have new enough data to be leader! - (m->paxos_first_version <= mon->paxos->get_version())) { + if (mon->rank < from) { // i would win over them. 
if (leader_acked >= 0) { // we already acked someone - assert((leader_acked < from) || // and they still win, of course - (acked_first_paxos_version > mon->paxos->get_version())); + assert(leader_acked < from); // and they still win, of course dout(5) << "no, we already acked " << leader_acked << dendl; } else { // wait, i should win! @@ -229,20 +217,16 @@ void Elector::handle_propose(MMonElection *m) mon->start_election(); } } - } else if (m->paxos_last_version >= mon->paxos->get_first_committed()) { + } else { // they would win over me if (leader_acked < 0 || // haven't acked anyone yet, or leader_acked > from || // they would win over who you did ack, or - leader_acked == from) { // this is the guy we're already deferring to - defer(from, m->paxos_first_version); + leader_acked == from) { // this is the guy we're already deferring to + defer(from); } else { // ignore them! dout(5) << "no, we already acked " << leader_acked << dendl; } - } else { // they are too out-of-date - dout(5) << "no, they are too far behind; paxos version: " - << m->paxos_last_version << " versus my first " - << mon->paxos->get_first_committed() << dendl; } m->put(); @@ -287,7 +271,7 @@ void Elector::handle_victory(MMonElection *m) dout(5) << "handle_victory from " << m->get_source() << " quorum_features " << m->quorum_features << dendl; int from = m->get_source().num(); - assert((from < mon->rank) || (acked_first_paxos_version > mon->paxos->get_version())); + assert(from < mon->rank); assert(m->epoch % 2 == 0); leader_acked = -1; diff --git a/src/mon/Elector.h b/src/mon/Elector.h index 9cce81e9f49..d81eb239763 100644 --- a/src/mon/Elector.h +++ b/src/mon/Elector.h @@ -126,10 +126,6 @@ class Elector { */ int leader_acked; /** - * Indicates the first_paxos_commit on who we've acked - */ - version_t acked_first_paxos_version; - /** * Indicates when we have acked him */ utime_t ack_stamp; @@ -201,17 +197,16 @@ class Elector { * to become the Leader. 
We will only defer an election if the monitor we * are deferring to outranks us. * - * @pre @p who outranks us (who < our rank, or we're behind their store) + * @pre @p who outranks us (i.e., who < our rank) * @pre @p who outranks any other monitor we have deferred to in the past * @post electing_me is false * @post leader_acked equals @p who * @post we sent an ack message to @p who * @post we reset the expire_event timer * - * @param who Some other monitor's numeric identifier. - * @param paxos_first The other monitor's first committed paxos version + * @param who Some other monitor's numeric identifier. */ - void defer(int who, version_t paxos_first); + void defer(int who); /** * The election has taken too long and has expired. * @@ -331,8 +326,7 @@ class Elector { epoch(0), participating(true), electing_me(false), - leader_acked(-1), - acked_first_paxos_version(0) { } + leader_acked(-1) { } /** * Initiate the Elector class. diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index b360bd0f9d9..dd490948b45 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -780,13 +780,6 @@ void Monitor::handle_sync_start(MMonSync *m) { dout(10) << __func__ << " " << *m << dendl; - /** - * This looks a bit odd, but we've seen cases where sync start messages - * get bounced around and end up at the originator without anybody - * noticing! - */ - assert(m->reply_to != messenger->get_myinst()); - /* If we are not the leader, then some monitor picked us as the point of * entry to the quorum during its synchronization process. 
Therefore, we * have an obligation of forwarding this message to leader, so the sender @@ -829,6 +822,20 @@ void Monitor::handle_sync_start(MMonSync *m) assert(quorum.empty()); assert(sync_leader.get() != NULL); + /** + * This looks a bit odd, but we've seen cases where sync start messages + * get bounced around and end up at the originator without anybody + * noticing!* If it happens, just drop the message and the timeouts + * will clean everything up -- eventually. + * [*] If a leader gets elected who is too far behind, he'll drop into + * bootstrap and sync, but the person he sends his sync to thinks he's + * still the leader and forwards the reply back. + */ + if (m->reply_to == messenger->get_myinst()) { + m->put(); + return; + } + dout(10) << __func__ << " forward " << *m << " to our sync leader at " << sync_leader->entity << dendl; @@ -1936,10 +1943,9 @@ void Monitor::handle_probe_reply(MMonProbe *m) if (m->quorum.size()) { dout(10) << " existing quorum " << m->quorum << dendl; - if ((paxos->get_version() + g_conf->paxos_max_join_drift < - m->paxos_last_version) || - (paxos->get_version() < m->paxos_first_version)){ - dout(10) << " peer paxos version " << m->paxos_last_version + if (paxos->get_version() < m->paxos_first_version) { + dout(10) << " peer paxos versions [" << m->paxos_first_version + << "," << m->paxos_last_version << "]" << " vs my version " << paxos->get_version() << " (too far ahead)" << dendl; |