summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Greg Farnum <greg@inktank.com> 2013-04-30 15:39:21 -0700
committer: Greg Farnum <greg@inktank.com> 2013-04-30 15:39:21 -0700
commit: 3cf5824f60b15cdf4db4e895b4da0d6c964b9ed4 (patch)
tree: 4e5232b0dcd486f5b80497b77fd3b615a50e0c36
parent: cd1d6fb3f9b906f13cf281294d9272e1e92a0243 (diff)
parent: a97eccadf75cd390ae84ed13d679d67720fda37b (diff)
download: ceph-3cf5824f60b15cdf4db4e895b4da0d6c964b9ed4.tar.gz
Merge branch 'wip-4837-election-syncing' into next
Reviewed-by: Sage Weil <sage@inktank.com>
-rw-r--r-- src/messages/MMonElection.h 23
-rw-r--r-- src/mon/Elector.cc 36
-rw-r--r-- src/mon/Elector.h 14
-rw-r--r-- src/mon/Monitor.cc 28
4 files changed, 43 insertions, 58 deletions
diff --git a/src/messages/MMonElection.h b/src/messages/MMonElection.h
index 9771f6123d6..3d7dd4ec90e 100644
--- a/src/messages/MMonElection.h
+++ b/src/messages/MMonElection.h
@@ -45,19 +45,20 @@ public:
bufferlist monmap_bl;
set<int> quorum;
uint64_t quorum_features;
- version_t paxos_first_version;
- version_t paxos_last_version;
+ /* the following were both used in the next branch for a while
+ * on user cluster, so we've left them in for compatibility. */
+ version_t defunct_one;
+ version_t defunct_two;
MMonElection() : Message(MSG_MON_ELECTION, HEAD_VERSION, COMPAT_VERSION),
- op(0), epoch(0), quorum_features(0), paxos_first_version(0),
- paxos_last_version(0)
+ op(0), epoch(0), quorum_features(0), defunct_one(0),
+ defunct_two(0)
{ }
- MMonElection(int o, epoch_t e, MonMap *m,
- version_t paxos_first, version_t paxos_last)
+ MMonElection(int o, epoch_t e, MonMap *m)
: Message(MSG_MON_ELECTION, HEAD_VERSION, COMPAT_VERSION),
fsid(m->fsid), op(o), epoch(e), quorum_features(0),
- paxos_first_version(paxos_first), paxos_last_version(paxos_last)
+ defunct_one(0), defunct_two(0)
{
// encode using full feature set; we will reencode for dest later,
// if necessary
@@ -87,8 +88,8 @@ public:
::encode(monmap_bl, payload);
::encode(quorum, payload);
::encode(quorum_features, payload);
- ::encode(paxos_first_version, payload);
- ::encode(paxos_last_version, payload);
+ ::encode(defunct_one, payload);
+ ::encode(defunct_two, payload);
}
void decode_payload() {
bufferlist::iterator p = payload.begin();
@@ -105,8 +106,8 @@ public:
else
quorum_features = 0;
if (header.version >= 4) {
- ::decode(paxos_first_version, p);
- ::decode(paxos_last_version, p);
+ ::decode(defunct_one, p);
+ ::decode(defunct_two, p);
}
}
diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc
index b6f047e20d2..b9a7031848b 100644
--- a/src/mon/Elector.cc
+++ b/src/mon/Elector.cc
@@ -81,21 +81,18 @@ void Elector::start()
electing_me = true;
acked_me[mon->rank] = CEPH_FEATURES_ALL;
leader_acked = -1;
- acked_first_paxos_version = mon->paxos->get_first_committed();
// bcast to everyone else
for (unsigned i=0; i<mon->monmap->size(); ++i) {
if ((int)i == mon->rank) continue;
- Message *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap,
- mon->paxos->get_first_committed(),
- mon->paxos->get_version());
+ Message *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);
mon->messenger->send_message(m, mon->monmap->get_inst(i));
}
reset_timer();
}
-void Elector::defer(int who, version_t paxos_first)
+void Elector::defer(int who)
{
dout(5) << "defer to " << who << dendl;
@@ -107,11 +104,8 @@ void Elector::defer(int who, version_t paxos_first)
// ack them
leader_acked = who;
- acked_first_paxos_version = paxos_first;
ack_stamp = ceph_clock_now(g_ceph_context);
- mon->messenger->send_message(new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap,
- mon->paxos->get_first_committed(),
- mon->paxos->get_version()),
+ mon->messenger->send_message(new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap),
mon->monmap->get_inst(who));
// set a timer
@@ -175,10 +169,7 @@ void Elector::victory()
p != quorum.end();
++p) {
if (*p == mon->rank) continue;
- MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch,
- mon->monmap,
- mon->paxos->get_first_committed(),
- mon->paxos->get_version());
+ MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch, mon->monmap);
m->quorum = quorum;
mon->messenger->send_message(m, mon->monmap->get_inst(*p));
}
@@ -214,13 +205,10 @@ void Elector::handle_propose(MMonElection *m)
}
}
- if ((mon->rank < from) &&
- // be careful that we have new enough data to be leader!
- (m->paxos_first_version <= mon->paxos->get_version())) {
+ if (mon->rank < from) {
// i would win over them.
if (leader_acked >= 0) { // we already acked someone
- assert((leader_acked < from) || // and they still win, of course
- (acked_first_paxos_version > mon->paxos->get_version()));
+ assert(leader_acked < from); // and they still win, of course
dout(5) << "no, we already acked " << leader_acked << dendl;
} else {
// wait, i should win!
@@ -229,20 +217,16 @@ void Elector::handle_propose(MMonElection *m)
mon->start_election();
}
}
- } else if (m->paxos_last_version >= mon->paxos->get_first_committed()) {
+ } else {
// they would win over me
if (leader_acked < 0 || // haven't acked anyone yet, or
leader_acked > from || // they would win over who you did ack, or
- leader_acked == from) { // this is the guy we're already deferring to
- defer(from, m->paxos_first_version);
+ leader_acked == from) { // this is the guy we're already deferring to
+ defer(from);
} else {
// ignore them!
dout(5) << "no, we already acked " << leader_acked << dendl;
}
- } else { // they are too out-of-date
- dout(5) << "no, they are too far behind; paxos version: "
- << m->paxos_last_version << " versus my first "
- << mon->paxos->get_first_committed() << dendl;
}
m->put();
@@ -287,7 +271,7 @@ void Elector::handle_victory(MMonElection *m)
dout(5) << "handle_victory from " << m->get_source() << " quorum_features " << m->quorum_features << dendl;
int from = m->get_source().num();
- assert((from < mon->rank) || (acked_first_paxos_version > mon->paxos->get_version()));
+ assert(from < mon->rank);
assert(m->epoch % 2 == 0);
leader_acked = -1;
diff --git a/src/mon/Elector.h b/src/mon/Elector.h
index 9cce81e9f49..d81eb239763 100644
--- a/src/mon/Elector.h
+++ b/src/mon/Elector.h
@@ -126,10 +126,6 @@ class Elector {
*/
int leader_acked;
/**
- * Indicates the first_paxos_commit on who we've acked
- */
- version_t acked_first_paxos_version;
- /**
* Indicates when we have acked him
*/
utime_t ack_stamp;
@@ -201,17 +197,16 @@ class Elector {
* to become the Leader. We will only defer an election if the monitor we
* are deferring to outranks us.
*
- * @pre @p who outranks us (who < our rank, or we're behind their store)
+ * @pre @p who outranks us (i.e., who < our rank)
* @pre @p who outranks any other monitor we have deferred to in the past
* @post electing_me is false
* @post leader_acked equals @p who
* @post we sent an ack message to @p who
* @post we reset the expire_event timer
*
- * @param who Some other monitor's numeric identifier.
- * @param paxos_first The other monitor's first committed paxos version
+ * @param who Some other monitor's numeric identifier.
*/
- void defer(int who, version_t paxos_first);
+ void defer(int who);
/**
* The election has taken too long and has expired.
*
@@ -331,8 +326,7 @@ class Elector {
epoch(0),
participating(true),
electing_me(false),
- leader_acked(-1),
- acked_first_paxos_version(0) { }
+ leader_acked(-1) { }
/**
* Initiate the Elector class.
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index b360bd0f9d9..dd490948b45 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -780,13 +780,6 @@ void Monitor::handle_sync_start(MMonSync *m)
{
dout(10) << __func__ << " " << *m << dendl;
- /**
- * This looks a bit odd, but we've seen cases where sync start messages
- * get bounced around and end up at the originator without anybody
- * noticing!
- */
- assert(m->reply_to != messenger->get_myinst());
-
/* If we are not the leader, then some monitor picked us as the point of
* entry to the quorum during its synchronization process. Therefore, we
* have an obligation of forwarding this message to leader, so the sender
@@ -829,6 +822,20 @@ void Monitor::handle_sync_start(MMonSync *m)
assert(quorum.empty());
assert(sync_leader.get() != NULL);
+ /**
+ * This looks a bit odd, but we've seen cases where sync start messages
+ * get bounced around and end up at the originator without anybody
+ * noticing!* If it happens, just drop the message and the timeouts
+ * will clean everything up -- eventually.
+ * [*] If a leader gets elected who is too far behind, he'll drop into
+ * bootstrap and sync, but the person he sends his sync to thinks he's
+ * still the leader and forwards the reply back.
+ */
+ if (m->reply_to == messenger->get_myinst()) {
+ m->put();
+ return;
+ }
+
dout(10) << __func__ << " forward " << *m
<< " to our sync leader at "
<< sync_leader->entity << dendl;
@@ -1936,10 +1943,9 @@ void Monitor::handle_probe_reply(MMonProbe *m)
if (m->quorum.size()) {
dout(10) << " existing quorum " << m->quorum << dendl;
- if ((paxos->get_version() + g_conf->paxos_max_join_drift <
- m->paxos_last_version) ||
- (paxos->get_version() < m->paxos_first_version)){
- dout(10) << " peer paxos version " << m->paxos_last_version
+ if (paxos->get_version() < m->paxos_first_version) {
+ dout(10) << " peer paxos versions [" << m->paxos_first_version
+ << "," << m->paxos_last_version << "]"
<< " vs my version " << paxos->get_version()
<< " (too far ahead)"
<< dendl;