summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2013-07-21 08:48:18 -0700
committerSage Weil <sage@inktank.com>2013-07-24 09:01:20 -0700
commit938a639e2cb6abd22c2c588e619c1aae32c6521f (patch)
tree81669451bb6f3b0ec05ffd3ec90ad7d72c73f7ef
parent18596340f020be1f21bdc9bcc752ae1da4a93a46 (diff)
downloadceph-938a639e2cb6abd22c2c588e619c1aae32c6521f.tar.gz
mon/Paxos: fix pn for uncommitted value during collect/last phase
During the collect/last exchange, peers share any uncommitted values with the leader. They are supposed to also share the pn under which that value was accepted, but were instead using the just-accepted pn value. This effectively meant that we *always* took the uncommitted value; if there were multiples, which one we accepted depended on what order the LAST messages arrived, not which pn the values were generated under. The specific failure sequence I observed: - collect - learned uncommitted value for 262 from myself - send collect with pn 901 - got last with pn 901 (incorrect) for 200 (old) from peer - discard our own value, remember the other - finish collect phase - ignore old uncommitted value Fix this by storing a pending_v and pending_pn value whenever we accept a value. Use this to send an appropriate pn value in the LAST reply so that the leader can make it's decision about which uncommitted value to accept based on accurate information. Also use it when we learn the uncommitted value from ourselves. We could probably be more clever about storing less information here, for example by omitting pending_v and clearing pending_pn at the appropriate point, but that would be more fragile. Similarly, we could store a pn for *every* commit if we wanted to lay some groundwork for having multiple uncommitted proposals in flight, but I don't want to speculate about what is necessary or sufficient for a correct solution there. Fixes: #5698 Backport: cuttlefish, bobtail Signed-off-by: Sage Weil <sage@inktank.com> (cherry picked from commit 20baf662112dd5f560bc3a2d2114b469444c3de8)
-rw-r--r--src/mon/Paxos.cc35
1 files changed, 33 insertions, 2 deletions
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index 391a89c60da..4cf7b700242 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -110,11 +110,21 @@ void Paxos::collect(version_t oldpn)
// look for uncommitted value
if (get_store()->exists(get_name(), last_committed+1)) {
+ version_t v = get_store()->get(get_name(), "pending_v");
+ version_t pn = get_store()->get(get_name(), "pending_pn");
+ if (v && pn && v == last_committed + 1) {
+ uncommitted_pn = pn;
+ } else {
+ dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << accepted_pn
+ << " and crossing our fingers" << dendl;
+ uncommitted_pn = accepted_pn;
+ }
uncommitted_v = last_committed+1;
- uncommitted_pn = accepted_pn;
+
get_store()->get(get_name(), last_committed+1, uncommitted_value);
assert(uncommitted_value.length());
dout(10) << "learned uncommitted " << (last_committed+1)
+ << " pn " << uncommitted_pn
<< " (" << uncommitted_value.length() << " bytes) from myself"
<< dendl;
}
@@ -171,6 +181,8 @@ void Paxos::handle_collect(MMonPaxos *collect)
last->last_committed = last_committed;
last->first_committed = first_committed;
+ version_t previous_pn = accepted_pn;
+
// can we accept this pn?
if (collect->pn > accepted_pn) {
// ok, accept it
@@ -212,7 +224,18 @@ void Paxos::handle_collect(MMonPaxos *collect)
dout(10) << " sharing our accepted but uncommitted value for "
<< last_committed+1 << " (" << bl.length() << " bytes)" << dendl;
last->values[last_committed+1] = bl;
- last->uncommitted_pn = accepted_pn;
+
+ version_t v = get_store()->get(get_name(), "pending_v");
+ version_t pn = get_store()->get(get_name(), "pending_pn");
+ if (v && pn && v == last_committed + 1) {
+ last->uncommitted_pn = pn;
+ } else {
+ // previously we didn't record which pn a value was accepted
+ // under! use the pn value we just had... :(
+ dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << previous_pn
+ << " and crossing our fingers" << dendl;
+ last->uncommitted_pn = previous_pn;
+ }
}
// send reply
@@ -497,6 +520,10 @@ void Paxos::begin(bufferlist& v)
MonitorDBStore::Transaction t;
t.put(get_name(), last_committed+1, new_value);
+ // note which pn this pending value is for.
+ t.put(get_name(), "pending_v", last_committed + 1);
+ t.put(get_name(), "pending_pn", accepted_pn);
+
dout(30) << __func__ << " transaction dump:\n";
JSONFormatter f(true);
t.dump(&f);
@@ -569,6 +596,10 @@ void Paxos::handle_begin(MMonPaxos *begin)
MonitorDBStore::Transaction t;
t.put(get_name(), v, begin->values[v]);
+ // note which pn this pending value is for.
+ t.put(get_name(), "pending_v", v);
+ t.put(get_name(), "pending_pn", accepted_pn);
+
dout(30) << __func__ << " transaction dump:\n";
JSONFormatter f(true);
t.dump(&f);