summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage@newdream.net>2011-11-20 14:26:09 -0800
committerSage Weil <sage@newdream.net>2011-11-20 14:26:09 -0800
commitf60702822ab7681e6b53b31d04913ecf9eb9a7bd (patch)
treec9659e69c2f2f8388a47711c2545ebcba418f1d4
parent9aabd3982cceb7e8489412b4bfbb4c2387880de2 (diff)
downloadceph-f60702822ab7681e6b53b31d04913ecf9eb9a7bd.tar.gz
paxos: fix sharing of learned commits during collect/last
We can learn either an uncommitted or committed value during the collect/last recovery phase. For the committed values, we need to remember each peer's first/last_committed and share only at the end to avoid a situation like: - mon.1 has same last_committed as us - mon.2 has newer last_commited, we save it - mon.3 has same last_commited as mon.1, we share new value - done... but mon.1 never got mon.2's newer commit. Instead, save the commit sharing until the collect process completes, so we know that any committed value learned from anyone is shared with everyone who needs it. This fixes a crash like mon/Paxos.cc: In function 'void Paxos::handle_begin(MMonPaxos*)', in thread '7fd91192c700' mon/Paxos.cc: 400: FAILED assert(begin->last_committed == last_committed) ceph version 0.38-208-g9aabd39 (commit:9aabd3982cceb7e8489412b4bfbb4c2387880de2) 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x76) [0x72454e] 2: (Paxos::handle_begin(MMonPaxos*)+0x363) [0x6499ef] 3: (Paxos::dispatch(PaxosServiceMessage*)+0x2b4) [0x64db2c] 4: (Monitor::_ms_dispatch(Message*)+0xdc6) [0x6205c2] 5: (Monitor::ms_dispatch(Message*)+0x3a) [0x62831a] 6: (Messenger::ms_deliver_dispatch(Message*)+0x63) [0x7d1f31] 7: (SimpleMessenger::dispatch_entry()+0x7c2) [0x7bb786] 8: (SimpleMessenger::DispatchThread::entry()+0x2c) [0x6070fa] 9: (Thread::_entry_func(void*)+0x23) [0x6f3f69] 10: (()+0x7971) [0x7fd9153a1971] 11: (clone()+0x6d) [0x7fd913c3092d] Signed-off-by: Sage Weil <sage@newdream.net>
-rw-r--r--src/mon/Paxos.cc28
-rw-r--r--src/mon/Paxos.h1
2 files changed, 21 insertions, 8 deletions
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index 437cc393d8b..40600d1b8ba 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -63,6 +63,8 @@ void Paxos::collect(version_t oldpn)
uncommitted_v = 0;
uncommitted_pn = 0;
uncommitted_value.clear();
+ peer_first_committed.clear();
+ peer_last_committed.clear();
// look for uncommitted value
if (mon->store->exists_bl_sn(machine_name, last_committed+1)) {
@@ -247,14 +249,9 @@ void Paxos::handle_last(MMonPaxos *last)
return;
}
- // share committed values?
- if (last->last_committed < last_committed) {
- // share committed values
- dout(10) << "sending commit to " << last->get_source() << dendl;
- MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id);
- share_state(commit, last->first_committed, last->last_committed);
- mon->messenger->send_message(commit, last->get_source_inst());
- }
+ // note peer's last_committed, in case we learn a new commit and need to
+ // push it to them.
+ peer_last_committed[last->get_source().num()] = last->last_committed;
// did we receive a committed value?
store_state(last);
@@ -293,6 +290,21 @@ void Paxos::handle_last(MMonPaxos *last)
mon->timer.cancel_event(collect_timeout_event);
collect_timeout_event = 0;
+ // share committed values?
+ for (map<int,version_t>::iterator p = peer_last_committed.begin();
+ p != peer_last_committed.end();
+ ++p) {
+ if (p->second < last_committed) {
+ // share committed values
+ dout(10) << " sending commit to mon." << p->first << dendl;
+ MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id);
+ share_state(commit, peer_first_committed[p->first], p->second);
+ mon->messenger->send_message(commit, mon->monmap->get_inst(p->first));
+ }
+ }
+ peer_first_committed.clear();
+ peer_last_committed.clear();
+
// almost...
state = STATE_ACTIVE;
diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h
index a340cdab3b9..e67a975f4fa 100644
--- a/src/mon/Paxos.h
+++ b/src/mon/Paxos.h
@@ -112,6 +112,7 @@ private:
utime_t last_commit_time;
version_t accepted_pn;
version_t accepted_pn_from;
+ map<int,version_t> peer_first_committed, peer_last_committed;
// active (phase 2)
utime_t lease_expire;