diff options
author | Sage Weil <sage@inktank.com> | 2012-05-31 16:42:38 -0700 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2012-05-31 16:42:38 -0700 |
commit | a4c90b725024c155b79d511822ce541aa83f5d65 (patch) | |
tree | 61292bfa1f567c5429055a69263e56183862d114 /src/mon | |
parent | fb7ce59b9a18ac26882836de366629a4593855a6 (diff) | |
parent | 504c6ce95799fe3f559cdfee4e758a6ea5608ab9 (diff) | |
download | ceph-a4c90b725024c155b79d511822ce541aa83f5d65.tar.gz |
Merge remote-tracking branch 'gh/wip-mon-doc'
Diffstat (limited to 'src/mon')
-rw-r--r-- | src/mon/Elector.cc | 10 | ||||
-rw-r--r-- | src/mon/Elector.h | 328 | ||||
-rw-r--r-- | src/mon/Monitor.h | 19 | ||||
-rw-r--r-- | src/mon/OSDMonitor.cc | 10 | ||||
-rw-r--r-- | src/mon/Paxos.cc | 95 | ||||
-rw-r--r-- | src/mon/Paxos.h | 877 | ||||
-rw-r--r-- | src/mon/PaxosService.cc | 10 | ||||
-rw-r--r-- | src/mon/PaxosService.h | 259 |
8 files changed, 1460 insertions, 148 deletions
diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc index 6b91f5a8d76..cc25cba82a2 100644 --- a/src/mon/Elector.cc +++ b/src/mon/Elector.cc @@ -78,8 +78,8 @@ void Elector::start() // bcast to everyone else for (unsigned i=0; i<mon->monmap->size(); ++i) { if ((int)i == mon->rank) continue; - mon->messenger->send_message(new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap), - mon->monmap->get_inst(i)); + Message *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap); + mon->messenger->send_message(m, mon->monmap->get_inst(i)); } reset_timer(); @@ -294,7 +294,8 @@ void Elector::dispatch(Message *m) return; } if (m->get_source().num() >= mon->monmap->size()) { - dout(5) << " ignoring bogus election message with bad mon rank " << m->get_source() << dendl; + dout(5) << " ignoring bogus election message with bad mon rank " + << m->get_source() << dendl; m->put(); return; } @@ -303,7 +304,8 @@ void Elector::dispatch(Message *m) // assume an old message encoding would have matched if (em->fsid != mon->monmap->fsid) { - dout(0) << " ignoring election msg fsid " << em->fsid << " != " << mon->monmap->fsid << dendl; + dout(0) << " ignoring election msg fsid " + << em->fsid << " != " << mon->monmap->fsid << dendl; m->put(); return; } diff --git a/src/mon/Elector.h b/src/mon/Elector.h index c4e0f5cad6b..3f3b508f45b 100644 --- a/src/mon/Elector.h +++ b/src/mon/Elector.h @@ -28,31 +28,141 @@ using namespace std; class Monitor; - +/** + * This class is responsible for maintaining the local state when electing + * a new Leader. We may win or we may lose. If we win, it means we became the + * Leader; if we lose, it means we are a Peon. + */ class Elector { + /** + * @defgroup Elector_h_class Elector + * @{ + */ private: + /** + * The Monitor instance associated with this class. + */ Monitor *mon; + /** + * Event callback responsible for dealing with an expired election once a + * timer runs out and fires up. + */ Context *expire_event; + /** + * Resets the expire_event timer, by cancelling any existing one and + * scheduling a new one. + * + * @remarks This function assumes as a default firing value the duration of + * the monitor's lease interval, and adds to it the value specified + * in @plus + * + * @post expire_event is set + * + * @param plus The amount of time to be added to the default firing value. + */ void reset_timer(double plus=0.0); + /** + * Cancel the expire_event timer, if it is defined. + * + * @post expire_event is not set + */ void cancel_timer(); - epoch_t epoch; // latest epoch we've seen. odd == election, even == stable, + /** + * Latest epoch we've seen. + * + * @remarks if its value is odd, we're electing; if it's even, then we're + * stable. + */ + epoch_t epoch; + /** + * Indicates if we are participating in the quorum. + * + * @remarks By default, we are created as participating. We may stop + * participating if the Monitor explicitely calls + * Elector::stop_participating though. If that happens, it will + * have to call Elector::start_participating for us to resume + * participating in the quorum. + */ bool participating; // electing me + /** + * @defgroup Elector_h_electing_me_vars We are being elected + * @{ + */ + /** + * Indicates if we are the ones being elected. + * + * We always attempt to be the one being elected if we are the ones starting + * the election. If we are not the ones that started it, we will only attempt + * to be elected if we think we might have a chance (i.e., the other guy's + * rank is lower than ours). + */ bool electing_me; + /** + * Holds the time at which we started the election. + */ utime_t start_stamp; + /** + * Set containing all those that acked our proposal to become the Leader. + * + * If we are acked by everyone in the MonMap, we will declare victory. + */ set<int> acked_me; + /** + * @} + */ + /** + * @defgroup Elector_h_electing_them_vars We are electing another guy + * @{ + */ + /** + * Indicates who we have acked + */ + int leader_acked; + /** + * Indicates when we have acked him + */ + utime_t ack_stamp; + /** + * @} + */ + + /** + * Update our epoch. + * + * If we come across a higher epoch, we simply update ours, also making + * sure we are no longer being elected (even though we could have been, + * we no longer are since we no longer are on that old epoch). + * + * @pre Our epoch is lower than @p e + * @post Our epoch equals @p e + * + * @param e Epoch to which we will update our epoch + */ + void bump_epoch(epoch_t e=0); - // electing them - int leader_acked; // who i've acked - utime_t ack_stamp; // and when - - void bump_epoch(epoch_t e=0); // i just saw a larger epoch - + /** + * @defgroup Elector_h_callbacks Callbacks + * @{ + */ + /** + * This class is used as the callback when the expire_event timer fires up. + * + * If the expire_event is fired, then it means that we had an election going, + * either started by us or by some other participant, but it took too long, + * thus expiring. + * + * When the election expires, we will check if we were the ones who won, and + * if so we will declare victory. If that is not the case, then we assume + * that the one we defered to didn't declare victory quickly enough (in fact, + * as far as we know, we may even be dead); so, just propose ourselves as the + * Leader. + */ class C_ElectionExpire : public Context { Elector *elector; public: @@ -61,17 +171,155 @@ class Elector { elector->expire(); } }; + /** + * @} + */ - void start(); // start an electing me + /** + * Start new elections by proposing ourselves as the new Leader. + * + * Basically, send propose messages to all the monitors in the MonMap and + * then reset the expire_event timer so we can limit the amount of time we + * will be going at it. + * + * @pre participating is true + * @post epoch is an odd value + * @post electing_me is true + * @post we sent propose messages to all the monitors in the MonMap + * @post we reset the expire_event timer + */ + void start(); + /** + * Defer the current election to some other monitor. + * + * This means that we will ack some other monitor and drop out from the run + * to become the Leader. We will only defer an election if the monitor we + * are deferring to outranks us. + * + * @pre @p who outranks us (i.e., who < our rank) + * @pre @p who outranks any other monitor we have deferred to in the past + * @post electing_me is false + * @post leader_acked equals @p who + * @post we sent an ack message to @p who + * @post we reset the expire_event timer + * + * @param who Some other monitor's numeric identifier. + */ void defer(int who); - void expire(); // timer goes off + /** + * The election has taken too long and has expired. + * + * This will happen when no one declared victory or started a new election + * during the time span allowed by the expire_event timer. + * + * When the election expires, we will check if we were the ones who won, and + * if so we will declare victory. If that is not the case, then we assume + * that the one we defered to didn't declare victory quickly enough (in fact, + * as far as we know, we may even be dead); so, just propose ourselves as the + * Leader. + */ + void expire(); + /** + * Declare Victory. + * + * We won. Or at least we believe we won, but for all intentions and purposes + * that does not matter. What matters is that we Won. + * + * That said, we must now bump our epoch to reflect that the election is over + * and then we must let everybody in the quorum know we are their brand new + * Leader. And we will also cancel our expire_event timer. + * + * Actually, the quorum will be now defined as the group of monitors that + * acked us during the election process. + * + * @pre Election is on-going + * @pre electing_me is true + * @post electing_me is false + * @post epoch is bumped up into an even value + * @post Election is not on-going + * @post We have a quorum, composed of the monitors that acked us + * @post We sent a message of type OP_VICTORY to each quorum member. + */ void victory(); - + + /** + * Handle a message from some other node proposing himself to become him + * the Leader. + * + * If the message appears to be old (i.e., its epoch is lower than our epoch), + * then we may take one of two actions: + * + * @li Ignore it because it's nothing more than an old proposal + * @li Start new elections if we verify that it was sent by a monitor from + * outside the quorum; given its old state, it's fair to assume he just + * started, so we should start new elections so he may rejoin + * + * If we did not ignore the received message, then we know that this message + * was sent by some other node proposing himself to become the Leader. So, we + * will take one of the following actions: + * + * @li Ignore him because we already acked another node with higher rank + * @li Ignore him and start a new election because we outrank him + * @li Defer to him because he outranks us and the node we previously + * acked, if any + * + * + * @invariant The received message is an operation of type OP_PROPOSE + * + * @param m A message sent by another participant in the quorum. + */ void handle_propose(class MMonElection *m); + /** + * Handle a message from some other participant Acking us as the Leader. + * + * When we receive such a message, one of three thing may be happening: + * @li We received a message with a newer epoch, which means we must have + * somehow lost track of what was going on (maybe we rebooted), thus we + * will start a new election + * @li We consider ourselves in the run for the Leader (i.e., @p electing_me + * is true), and we are actually being Acked by someone; thus simply add + * the one acking us to the @p acked_me set. If we do now have acks from + * all the participants, then we can declare victory + * @li We already deferred the election to somebody else, so we will just + * ignore this message + * + * @pre Election is on-going + * @post Election is on-going if we deferred to somebody else + * @post Election is on-going if we are still waiting for further Acks + * @post Election is not on-going if we are victorious + * @post Election is not on-going if we must start a new one + * + * @param m A message with an operation type of OP_ACK + */ void handle_ack(class MMonElection *m); + /** + * Handle a message from some other participant declaring Victory. + * + * We just got a message from someone declaring themselves Victorious, thus + * the new Leader. + * + * However, if the message's epoch happens to be different from our epoch+1, + * then it means we lost track of something and we must start a new election. + * + * If that is not the case, then we will simply update our epoch to the one + * in the message, cancel our @p expire_event timer and inform our Monitor + * that we lost the election and provide it with the new quorum. + * + * @pre Election in on-going + * @post Election is not on-going + * @post Updated @p epoch + * @post We have a new quorum if we lost the election + * + * @param m A message with an operation type of OP_VICTORY + */ void handle_victory(class MMonElection *m); - public: + public: + /** + * Create an Elector class + * + * @param m A Monitor instance + */ Elector(Monitor *m) : mon(m), expire_event(0), epoch(0), @@ -79,20 +327,74 @@ class Elector { electing_me(false), leader_acked(-1) { } + /** + * Initiate the Elector class. + * + * Basically, we will simply read whatever epoch value we have in our stable + * storage, or consider it to be 1 if none is read. + * + * @post @p epoch is set to 1 or higher. + */ void init(); + /** + * Inform this class it is supposed to shutdown. + * + * We will simply cancel the @p expire_event if any exists. + * + * @post @p expire_event is cancelled + */ void shutdown(); + /** + * Obtain our epoch + * + * @returns Our current epoch number + */ epoch_t get_epoch() { return epoch; } + /** + * Handle received messages. + * + * We will ignore all messages that are not of type @p MSG_MON_ELECTION + * (i.e., messages whose interface is not of type @p MMonElection). All of + * those that are will then be dispatched to their operation-specific + * functions. + * + * @param m A received message + */ void dispatch(Message *m); + /** + * Call an election. + * + * This function simply calls Elector::start. + */ void call_election() { start(); } + /** + * Stop participating in subsequent Elections. + * + * @post @p participating is false + */ void stop_participating() { participating = false; } + /** + * Start participating in Elections. + * + * If we are already participating (i.e., @p participating is true), then + * calling this function is moot. + * + * However, if we are not participating (i.e., @p participating is false), + * then we will start participating by setting @p participating to true and + * we will call for an Election. + * + * @post @p participating is true + */ void start_participating(); + /** + * @} + */ }; - #endif diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index 381138a97a1..a7a3f9abe9a 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -284,7 +284,7 @@ public: void handle_route(MRoute *m); /** - * generate health report + * Generate health report * * @param status one-line status summary * @param detailbl optional bufferlist* to fill with a detailed report @@ -295,12 +295,27 @@ public: void reply_command(MMonCommand *m, int rc, const string &rs, bufferlist& rdata, version_t version); void handle_probe(MMonProbe *m); + /** + * Handle a Probe Operation, replying with our name, quorum and known versions. + * + * We use the MMonProbe message class for anything and everything related with + * Monitor probing. One of the operations relates directly with the probing + * itself, in which we receive a probe request and to which we reply with + * our name, our quorum and the known versions for each Paxos service. Thus the + * redundant function name. This reply will obviously be sent to the one + * probing/requesting these infos. + * + * @todo Add @pre and @post + * + * @param m A Probe message, with an operation of type Probe. + */ void handle_probe_probe(MMonProbe *m); void handle_probe_reply(MMonProbe *m); void handle_probe_slurp(MMonProbe *m); void handle_probe_slurp_latest(MMonProbe *m); void handle_probe_data(MMonProbe *m); - /* Given an MMonProbe and associated Paxos machine, create a reply, + /** + * Given an MMonProbe and associated Paxos machine, create a reply, * fill it with the missing Paxos states and current commit pointers * * @param m The incoming MMonProbe. We use this to determine the range diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index ec083c3fda6..86896809d0b 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -603,6 +603,10 @@ bool OSDMonitor::can_mark_up(int i) return true; } +/** + * @note the parameter @p i apparently only exists here so we can output the + * osd's id on messages. + */ bool OSDMonitor::can_mark_out(int i) { if (osdmap.test_flag(CEPH_OSDMAP_NOOUT)) { @@ -1252,6 +1256,12 @@ void OSDMonitor::tick() // mark down osds out? utime_t now = ceph_clock_now(g_ceph_context); + + /* can_mark_out() checks if we can mark osds as being out. The -1 has no + * influence at all. The decision is made based on the ratio of "in" osds, + * and the function returns false if this ratio is lower that the minimum + * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us. + */ if (can_mark_out(-1)) { map<int,utime_t>::iterator i = down_pending_out.begin(); while (i != down_pending_out.end()) { diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index 6014c456454..ebf1b815309 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -23,9 +23,10 @@ #define dout_subsys ceph_subsys_paxos #undef dout_prefix #define dout_prefix _prefix(_dout, mon, mon->name, mon->rank, machine_name, state, first_committed, last_committed) -static ostream& _prefix(std::ostream *_dout, Monitor *mon, const string& name, int rank, - const char *machine_name, int state, - version_t first_committed, version_t last_committed) { +static ostream& _prefix(std::ostream *_dout, Monitor *mon, const string& name, + int rank, const char *machine_name, int state, + version_t first_committed, version_t last_committed) +{ return *_dout << "mon." << name << "@" << rank << "(" << mon->get_state_name() << ")" << ".paxos(" << machine_name << " " << Paxos::get_statename(state) @@ -89,8 +90,8 @@ void Paxos::collect(version_t oldpn) ++p) { if (*p == mon->rank) continue; - MMonPaxos *collect = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COLLECT, machine_id, - ceph_clock_now(g_ceph_context)); + MMonPaxos *collect = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COLLECT, + machine_id, ceph_clock_now(g_ceph_context)); collect->last_committed = last_committed; collect->first_committed = first_committed; collect->pn = accepted_pn; @@ -124,13 +125,14 @@ void Paxos::handle_collect(MMonPaxos *collect) // ok, accept it accepted_pn = collect->pn; accepted_pn_from = collect->pn_from; - dout(10) << "accepting pn " << accepted_pn << " from " << accepted_pn_from << dendl; + dout(10) << "accepting pn " << accepted_pn << " from " + << accepted_pn_from << dendl; mon->store->put_int(accepted_pn, machine_name, "accepted_pn"); } else { // don't accept! - dout(10) << "NOT accepting pn " << collect->pn << " from " << collect->pn_from - << ", we already accepted " << accepted_pn << " from " << accepted_pn_from - << dendl; + dout(10) << "NOT accepting pn " << collect->pn << " from " << collect->pn_from + << ", we already accepted " << accepted_pn + << " from " << accepted_pn_from << dendl; } last->pn = accepted_pn; last->pn_from = accepted_pn_from; @@ -145,8 +147,8 @@ void Paxos::handle_collect(MMonPaxos *collect) if (mon->store->exists_bl_sn(machine_name, last_committed+1)) { mon->store->get_bl_sn(bl, machine_name, last_committed+1); assert(bl.length() > 0); - dout(10) << " sharing our accepted but uncommitted value for " << last_committed+1 - << " (" << bl.length() << " bytes)" << dendl; + dout(10) << " sharing our accepted but uncommitted value for " + << last_committed+1 << " (" << bl.length() << " bytes)" << dendl; last->values[last_committed+1] = bl; last->uncommitted_pn = accepted_pn; } @@ -156,11 +158,13 @@ void Paxos::handle_collect(MMonPaxos *collect) collect->put(); } -void Paxos::share_state(MMonPaxos *m, version_t peer_first_committed, version_t peer_last_committed) +void Paxos::share_state(MMonPaxos *m, version_t peer_first_committed, + version_t peer_last_committed) { assert(peer_last_committed < last_committed); - dout(10) << "share_state peer has fc " << peer_first_committed << " lc " << peer_last_committed << dendl; + dout(10) << "share_state peer has fc " << peer_first_committed + << " lc " << peer_last_committed << dendl; version_t v = peer_last_committed + 1; // start with a stashed full copy? @@ -168,16 +172,15 @@ void Paxos::share_state(MMonPaxos *m, version_t peer_first_committed, version_t bufferlist bl; version_t l = get_stashed(bl); assert(l <= last_committed); - dout(10) << "share_state starting with latest " << l << " (" << bl.length() << " bytes)" << dendl; + dout(10) << "share_state starting with latest " << l + << " (" << bl.length() << " bytes)" << dendl; m->latest_value.claim(bl); m->latest_version = l; v = l; } // include incrementals - for ( ; - v <= last_committed; - v++) { + for ( ; v <= last_committed; v++) { if (mon->store->exists_bl_sn(machine_name, v)) { mon->store->get_bl_sn(m->values[v], machine_name, v); dout(10) << " sharing " << v << " (" @@ -194,7 +197,8 @@ void Paxos::store_state(MMonPaxos *m) // stash? if (m->latest_version && m->latest_version > last_committed) { - dout(10) << "store_state got stash version " << m->latest_version << ", zapping old states" << dendl; + dout(10) << "store_state got stash version " + << m->latest_version << ", zapping old states" << dendl; assert(start != m->values.end() && start->first == m->latest_version); @@ -220,13 +224,12 @@ void Paxos::store_state(MMonPaxos *m) start = m->values.end(); } - while (start != m->values.end() && - start->first <= last_committed) + while (start != m->values.end() && start->first <= last_committed) { ++start; + } map<version_t,bufferlist>::iterator end = start; - while (end != m->values.end() && - end->first <= m->last_committed) { + while (end != m->values.end() && end->first <= m->last_committed) { last_committed = end->first; if (!first_committed) first_committed = last_committed; @@ -236,7 +239,9 @@ void Paxos::store_state(MMonPaxos *m) if (start == end) { dout(10) << "store_state nothing to commit" << dendl; } else { - dout(10) << "store_state [" << start->first << ".." << last_committed << "]" << dendl; + dout(10) << "store_state [" << start->first << ".." + << last_committed << "]" << dendl; + mon->store->put_bl_sn_map(machine_name, start, end); mon->store->put_int(last_committed, machine_name, "last_committed"); mon->store->put_int(first_committed, machine_name, "first_committed"); @@ -303,7 +308,8 @@ void Paxos::handle_last(MMonPaxos *last) if (p->second < last_committed) { // share committed values dout(10) << " sending commit to mon." << p->first << dendl; - MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id, + MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), + MMonPaxos::OP_COMMIT, machine_id, ceph_clock_now(g_ceph_context)); share_state(commit, peer_first_committed[p->first], p->second); mon->messenger->send_message(commit, mon->monmap->get_inst(p->first)); @@ -383,15 +389,15 @@ void Paxos::begin(bufferlist& v) return; } - // ask others to accept it to! + // ask others to accept it too! for (set<int>::const_iterator p = mon->get_quorum().begin(); p != mon->get_quorum().end(); ++p) { if (*p == mon->rank) continue; dout(10) << " sending begin to mon." << *p << dendl; - MMonPaxos *begin = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_BEGIN, machine_id, - ceph_clock_now(g_ceph_context)); + MMonPaxos *begin = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_BEGIN, + machine_id, ceph_clock_now(g_ceph_context)); begin->values[last_committed+1] = new_value; begin->last_committed = last_committed; begin->pn = accepted_pn; @@ -428,8 +434,8 @@ void Paxos::handle_begin(MMonPaxos *begin) mon->store->put_bl_sn(begin->values[v], machine_name, v); // reply - MMonPaxos *accept = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_ACCEPT, machine_id, - ceph_clock_now(g_ceph_context)); + MMonPaxos *accept = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_ACCEPT, + machine_id, ceph_clock_now(g_ceph_context)); accept->pn = accepted_pn; accept->last_committed = last_committed; mon->messenger->send_message(accept, begin->get_source_inst()); @@ -525,8 +531,8 @@ void Paxos::commit() if (*p == mon->rank) continue; dout(10) << " sending commit to mon." << *p << dendl; - MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id, - ceph_clock_now(g_ceph_context)); + MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, + machine_id, ceph_clock_now(g_ceph_context)); commit->values[last_committed] = new_value; commit->pn = accepted_pn; commit->last_committed = last_committed; @@ -539,8 +545,6 @@ void Paxos::commit() } - - void Paxos::handle_commit(MMonPaxos *commit) { dout(10) << "handle_commit on " << commit->last_committed << dendl; @@ -569,15 +573,16 @@ void Paxos::extend_lease() acked_lease.clear(); acked_lease.insert(mon->rank); - dout(7) << "extend_lease now+" << g_conf->mon_lease << " (" << lease_expire << ")" << dendl; + dout(7) << "extend_lease now+" << g_conf->mon_lease + << " (" << lease_expire << ")" << dendl; // bcast for (set<int>::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { + p != mon->get_quorum().end(); ++p) { + if (*p == mon->rank) continue; - MMonPaxos *lease = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE, machine_id, - ceph_clock_now(g_ceph_context)); + MMonPaxos *lease = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE, + machine_id, ceph_clock_now(g_ceph_context)); lease->last_committed = last_committed; lease->lease_timestamp = lease_expire; lease->first_committed = first_committed; @@ -588,7 +593,8 @@ void Paxos::extend_lease() // if old timeout is still in place, leave it. if (!lease_ack_timeout_event) { lease_ack_timeout_event = new C_LeaseAckTimeout(this); - mon->timer.add_event_after(g_conf->mon_lease_ack_timeout, lease_ack_timeout_event); + mon->timer.add_event_after(g_conf->mon_lease_ack_timeout, + lease_ack_timeout_event); } // set renew event @@ -624,7 +630,8 @@ void Paxos::handle_lease(MMonPaxos *lease) // sanity if (!mon->is_peon() || last_committed != lease->last_committed) { - dout(10) << "handle_lease i'm not a peon, or they're not the leader, or the last_committed doesn't match, dropping" << dendl; + dout(10) << "handle_lease i'm not a peon, or they're not the leader," + << " or the last_committed doesn't match, dropping" << dendl; lease->put(); return; } @@ -642,8 +649,8 @@ void Paxos::handle_lease(MMonPaxos *lease) << " now " << lease_expire << dendl; // ack - MMonPaxos *ack = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE_ACK, machine_id, - ceph_clock_now(g_ceph_context)); + MMonPaxos *ack = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE_ACK, + machine_id, ceph_clock_now(g_ceph_context)); ack->last_committed = last_committed; ack->first_committed = first_committed; ack->lease_timestamp = ceph_clock_now(g_ceph_context); @@ -671,7 +678,8 @@ void Paxos::handle_lease_ack(MMonPaxos *ack) int from = ack->get_source().num(); if (!lease_ack_timeout_event) { - dout(10) << "handle_lease_ack from " << ack->get_source() << " -- stray (probably since revoked)" << dendl; + dout(10) << "handle_lease_ack from " << ack->get_source() + << " -- stray (probably since revoked)" << dendl; } else if (acked_lease.count(from) == 0) { acked_lease.insert(from); @@ -724,7 +732,6 @@ void Paxos::lease_renew_timeout() } - /* * trim old states */ diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h index 8558fc9502f..71980aebc75 100644 --- a/src/mon/Paxos.h +++ b/src/mon/Paxos.h @@ -35,18 +35,6 @@ e 12v */ - -/* - * NOTE: This libary is based on the Paxos algorithm, but varies in a few key ways: - * 1- Only a single new value is generated at a time, simplifying the recovery logic. - * 2- Nodes track "committed" values, and share them generously (and trustingly) - * 3- A 'leasing' mechism is built-in, allowing nodes to determine when it is safe to - * "read" their copy of the last committed value. - * - * This provides a simple replication substrate that services can be built on top of. - * See PaxosService.h - */ - #ifndef CEPH_MON_PAXOS_H #define CEPH_MON_PAXOS_H @@ -66,7 +54,24 @@ class Paxos; // i am one state machine. +/** + * This libary is based on the Paxos algorithm, but varies in a few key ways: + * 1- Only a single new value is generated at a time, simplifying the recovery logic. + * 2- Nodes track "committed" values, and share them generously (and trustingly) + * 3- A 'leasing' mechism is built-in, allowing nodes to determine when it is + * safe to "read" their copy of the last committed value. + * + * This provides a simple replication substrate that services can be built on top of. + * See PaxosService.h + */ class Paxos { + /** + * @defgroup Paxos_h_class Paxos + * @{ + */ + /** + * The Monitor to which this Paxos class is associated with. + */ Monitor *mon; // my state machine info @@ -82,9 +87,32 @@ class Paxos { // -- generic state -- public: - const static int STATE_RECOVERING = 1; // leader|peon: recovering paxos state - const static int STATE_ACTIVE = 2; // leader|peon: idle. peon may or may not have valid lease - const static int STATE_UPDATING = 3; // leader|peon: updating to new value + /** + * @defgroup Paxos_h_states States on which the leader/peon may be. + * @{ + */ + /** + * Leader/Peon is in Paxos' Recovery state + */ + const static int STATE_RECOVERING = 1; + /** + * Leader/Peon is idle, and the Peon may or may not have a valid lease. + */ + const static int STATE_ACTIVE = 2; + /** + * Leader/Peon is updating to a new value. + */ + const static int STATE_UPDATING = 3; + + /** + * Obtain state name from constant value. + * + * @note This function will raise a fatal error if @p s is not + * a valid state value. + * + * @param s State value. + * @return The state's name. + */ static const char *get_statename(int s) { switch (s) { case STATE_RECOVERING: return "recovering"; @@ -95,60 +123,328 @@ public: } private: + /** + * The state we are in. + */ int state; + /** + * @} + */ public: + /** + * Check if we are recovering. + * + * @return 'true' if we are on the Recovering state; 'false' otherwise. + */ bool is_recovering() const { return state == STATE_RECOVERING; } + /** + * Check if we are active. + * + * @return 'true' if we are on the Active state; 'false' otherwise. + */ bool is_active() const { return state == STATE_ACTIVE; } + /** + * Check if we are updating. + * + * @return 'true' if we are on the Updating state; 'false' otherwise. + */ bool is_updating() const { return state == STATE_UPDATING; } private: - // recovery (phase 1) + /** + * @defgroup Paxos_h_recovery_vars Common recovery-related member variables + * @note These variables are common to both the Leader and the Peons. + * @{ + */ + /** + * + */ version_t first_committed; + /** + * Last Proposal Number + * + * @todo Expand description + */ version_t last_pn; + /** + * Last committed value's version. + * + * On both the Leader and the Peons, this is the last value's version that + * was accepted by a given quorum and thus committed, that this instance + * knows about. + * + * @note It may not be the last committed value's version throughout the + * system. If we are a Peon, we may have not been part of the quorum + * that accepted the value, and for this very same reason we may still + * be a (couple of) version(s) behind, until we learn about the most + * recent version. This should only happen if we are not active (i.e., + * part of the quorum), which should not happen if we are up, running + * and able to communicate with others -- thus able to be part of the + * monmap and trigger new elections. + */ version_t last_committed; + /** + * Last committed value's time. + * + * When the commit happened. + */ utime_t last_commit_time; + /** + * The last Proposal Number we have accepted. + * + * On the Leader, it will be the Proposal Number picked by the Leader + * itself. On the Peon, however, it will be the proposal sent by the Leader + * and it will only be updated iif its value is higher than the one + * already known by the Peon. + */ version_t accepted_pn; + /** + * @todo This has something to do with the last_committed version. Not sure + * about what it entails, tbh. + */ version_t accepted_pn_from; - map<int,version_t> peer_first_committed, peer_last_committed; + /** + * @todo Check out if this map has any purpose at all. So far, we have only + * seen it being read, although it is never affected. + */ + map<int,version_t> peer_first_committed; + /** + * Map holding the last committed version by each quorum member. + * + * The versions kept in this map are updated during the collect phase. + * When the Leader starts the collect phase, each Peon will reply with its + * last committed version, which will then be kept in this map. + */ + map<int,version_t> peer_last_committed; + /** + * @todo Check out what 'slurping' is. + */ int slurping; + /** + * @} + */ // active (phase 2) + /** + * @defgroup Paxos_h_active_vars Common active-related member variables + * @{ + */ + /** + * When does our read lease expires. + * + * Instead of performing a full commit each time a read is requested, we + * keep leases. Each lease will have an expiration date, which may or may + * not be extended. This member variable will keep when is the lease + * expiring. + */ utime_t lease_expire; + /** + * List of callbacks waiting for our state to change into STATE_ACTIVE. + */ list<Context*> waiting_for_active; + /** + * List of callbacks waiting for the chance to read a version from us. + * + * Each entry on the list may result from an attempt to read a version that + * wasn't available at the time, or an attempt made during a period during + * which we could not satisfy the read request. The first case happens if + * the requested version is greater than our last committed version. The + * second scenario may happen if we are recovering, or if we don't have a + * valid lease. + * + * The list will be woken up once we change to STATE_ACTIVE with an extended + * lease -- which can be achieved if we have everyone on the quorum on board + * with the latest proposal, or if we don't really care about the remaining + * uncommitted values --, or if we're on a quorum of one. + */ list<Context*> waiting_for_readable; + /** + * Latest version written to the store after the latest commit. + */ version_t latest_stashed; + /** + * @} + */ // -- leader -- // recovery (paxos phase 1) + /** + * @defgroup Paxos_h_leader_recovery Leader-specific Recovery-related vars + * @{ + */ + /** + * Number of replies to the collect phase we've received so far. + * + * This variable is reset to 1 each time we start a collect phase; it is + * incremented each time we receive a reply to the collect message, and + * is used to determine whether or not we have received replies from the + * whole quorum. + */ unsigned num_last; + /** + * Uncommitted value's version. + * + * If we have, or end up knowing about, an uncommitted value, then its + * version will be kept in this variable. + * + * @note If this version equals @p last_committed+1 when we reach the final + * steps of recovery, then the algorithm will assume this is a value + * the Leader does not know about, and trustingly the Leader will + * propose this version's value. + */ version_t uncommitted_v; + /** + * Uncommitted value's Proposal Number. + * + * We use this variable to assess if the Leader should take into consideration + * an uncommitted value sent by a Peon. Given that the Peon will send back to + * the Leader the last Proposal Number he accepted, the Leader will be able + * to infer if this value is more recent than the one the Leader has, thus + * more relevant. + */ version_t uncommitted_pn; + /** + * Uncommitted Value. + * + * If the system fails in-between the accept replies from the Peons and the + * instruction to commit from the Leader, then we may end up with accepted + * but yet-uncommitted values. During the Leader's recovery, he will attempt + * to bring the whole system to the latest state, and that means committing + * past accepted but uncommitted values. + * + * This variable will hold an uncommitted value, which may originate either + * on the Leader, or learnt by the Leader from a Peon during the collect + * phase. + */ bufferlist uncommitted_value; - + /** + * Used to specify when an on-going collect phase times out. + */ Context *collect_timeout_event; + /** + * @} + */ // active + /** + * @defgroup Paxos_h_leader_active Leader-specific Active-related vars + * @{ + */ + /** + * Set of participants (Leader & Peons) that have acked a lease extension. + * + * Each Peon that acknowledges a lease extension will have its place in this + * set, which will be used to account for all the acks from all the quorum + * members, guaranteeing that we trigger new elections if some don't ack in + * the expected timeframe. + */ set<int> acked_lease; + /** + * Callback responsible for extending the lease periodically. + */ Context *lease_renew_event; + /** + * Callback to trigger new elections once the time for acks is out. + */ Context *lease_ack_timeout_event; + /** + * @} + */ + /** + * @defgroup Paxos_h_peon_active Peon-specific Active-related vars + * @{ + */ + /** + * Callback to trigger new elections when the Peon's lease times out. + * + * If the Peon's lease is extended, this callback will be reset (i.e., + * we cancel the event and reschedule a new one with starting from the + * beginning). + */ Context *lease_timeout_event; + /** + * @} + */ // updating (paxos phase 2) + /** + * @defgroup Paxos_h_leader_updating Leader-specific Updating-related vars + * @{ + */ + /** + * New Value being proposed to the Peons. + * + * This bufferlist holds the value the Leader is proposing to the Peons, and + * that will be committed if the Peons do accept the proposal. + */ bufferlist new_value; + /** + * Set of participants (Leader & Peons) that accepted the new proposed value. + * + * This set is used to keep track of those who have accepted the proposed + * value, so the leader may know when to issue a commit (when a majority of + * participants has accepted the proposal), and when to extend the lease + * (when all the quorum members have accepted the proposal). + */ set<int> accepted; - + /** + * Callback to trigger a new election if the proposal is not accepted by the + * full quorum within a given timeframe. + * + * If the full quorum does not accept the proposal, then it means that the + * Leader may no longer be recognized as the leader, or that the quorum has + * changed, and the value may have not reached all the participants. Thus, + * the leader must call new elections, and go through a recovery phase in + * order to propagate the new value throughout the system. + * + * This does not mean that we won't commit. We will commit as soon as we + * have a majority of acceptances. But if we do not have full acceptance + * from the quorum, then we cannot extend the lease, as some participants + * may not have the latest committed value. + */ Context *accept_timeout_event; + /** + * List of callbacks waiting for it to be possible to write again. + * + * @remarks It is not possible to write if we are not the Leader, or we are + * not on the active state, or if the lease has expired. + */ list<Context*> waiting_for_writeable; + /** + * List of callbacks waiting for a commit to finish. + * + * @remarks This may be used to a) wait for an on-going commit to finish + * before we proceed with, say, a new proposal; or b) wait for the + * next commit to be finished so we are sure that our value was + * fully committed. + */ list<Context*> waiting_for_commit; + /** + * @} + */ - //synchronization warnings + /** + * @defgroup Paxos_h_sync_warns Synchronization warnings + * @todo Describe these variables + * @{ + */ utime_t last_clock_drift_warn; int clock_drift_warned; + /** + * @} + */ + /** + * @defgroup Paxos_h_callbacks Callback classes. + * @{ + */ + /** + * Callback class responsible for handling a Collect Timeout. + */ class C_CollectTimeout : public Context { Paxos *paxos; public: @@ -158,6 +454,9 @@ private: } }; + /** + * Callback class responsible for handling an Accept Timeout. + */ class C_AcceptTimeout : public Context { Paxos *paxos; public: @@ -167,6 +466,9 @@ private: } }; + /** + * Callback class responsible for handling a Lease Ack Timeout. + */ class C_LeaseAckTimeout : public Context { Paxos *paxos; public: @@ -176,6 +478,9 @@ private: } }; + /** + * Callback class responsible for handling a Lease Timeout. + */ class C_LeaseTimeout : public Context { Paxos *paxos; public: @@ -185,6 +490,9 @@ private: } }; + /** + * Callback class responsible for handling a Lease Renew Timeout. + */ class C_LeaseRenew : public Context { Paxos *paxos; public: @@ -193,35 +501,336 @@ private: paxos->lease_renew_timeout(); } }; + /** + * @} + */ - + /** + * @defgroup Paxos_h_election_triggered Steps triggered by an election. + * + * @note All these functions play a significant role in the Recovery Phase, + * which is triggered right after an election once someone becomes + * the Leader. + * @{ + */ + /** + * Create a new Proposal Number and propose it to the Peons. + * + * This function starts the Recovery Phase, which can be directly mapped + * onto the original Paxos' Prepare phase. Basically, we'll generate a + * Proposal Number, taking @p oldpn into consideration, and we will send + * it to a quorum, along with our first and last committed versions. By + * sending these informations in a message to the quorum, we expect to + * obtain acceptances from a majority, allowing us to commit, or be + * informed of a higher Proposal Number known by one or more of the Peons + * in the quorum. + * + * @pre We are the Leader. + * @post Recovery Phase initiated by sending messages to the quorum. + * + * @param oldpn A proposal number taken as the highest known so far, that + * should be taken into consideration when generating a new + * Proposal Number for the Recovery Phase. + */ void collect(version_t oldpn); - void handle_collect(MMonPaxos*); - void handle_last(MMonPaxos*); + /** + * Handle the reception of a collect message from the Leader and reply + * accordingly. + * + * Once a Peon receives a collect message from the Leader it will reply + * with its first and last committed versions, as well as informations so + * the Leader may know if his Proposal Number was, or was not, accepted by + * the Peon. The Peon will accept the Leader's Proposal Number iif it is + * higher than the Peon's currently accepted Proposal Number. The Peon may + * also inform the Leader of accepted but uncommitted values. + * + * @invariant The message is an operation of type OP_COLLECT. + * @pre We are a Peon. + * @post Replied to the Leader, accepting or not accepting his PN. + * + * @param collect The collect message sent by the Leader to the Peon. + */ + void handle_collect(MMonPaxos *collect); + /** + * Handle a response from a Peon to the Leader's collect phase. + * + * The received message will state the Peon's last committed version, as + * well as its last proposal number. This will lead to one of the following + * scenarios: if the replied Proposal Number is equal to the one we proposed, + * then the Peon has accepted our proposal, and if all the Peons do accept + * our Proposal Number, then we are allowed to proceed with the commit; + * however, if a Peon replies with a higher Proposal Number, we assume he + * knows something we don't and the Leader will have to abort the current + * proposal in order to retry with the Proposal Number specified by the Peon. + * It may also occur that the Peon replied with a lower Proposal Number, in + * which case we assume it is a reply to an an older value and we'll simply + * drop it. + * This function will also check if the Peon replied with an accepted but + * yet uncommitted value. In this case, if its version is higher than our + * last committed value by one, we assume that the Peon knows a value from a + * previous proposal that has never been committed, and we should try to + * commit that value by proposing it next. On the other hand, if that is + * not the case, we'll assume it is an old, uncommitted value, we do not + * care about and we'll consider the system active by extending the leases. + * + * @invariant The message is an operation of type OP_LAST. + * @pre We are the Leader. + * @post We initiate a commit, or we retry with a higher Proposal Number, + * or we drop the message. + * @post We move from STATE_RECOVERING to STATE_ACTIVE. + * + * @param last The message sent by the Peon to the Leader. + */ + void handle_last(MMonPaxos *last); + /** + * The Recovery Phase timed out, meaning that a significant part of the + * quorum does not believe we are the Leader, and we thus should trigger new + * elections. + * + * @pre We believe to be the Leader. + * @post Trigger new elections. + */ void collect_timeout(); + /** + * @} + */ + /** + * @defgroup Paxos_h_updating_funcs Functions used during the Updating State + * + * These functions may easily be mapped to the original Paxos Algorithm's + * phases. + * + * Taking into account the algorithm can be divided in 4 phases (Prepare, + * Promise, Accept Request and Accepted), we can easily map Paxos::begin to + * both the Prepare and Accept Request phases; the Paxos::handle_begin to + * the Promise phase; and the Paxos::handle_accept to the Accepted phase. + * @{ + */ + /** + * Start a new proposal with the intent of committing @p value. + * + * If we are alone on the system (i.e., a quorum of one), then we will + * simply commit the value, but if we are not alone, then we need to propose + * the value to the quorum. + * + * @pre We are the Leader + * @pre We are on STATE_ACTIVE + * @post We commit, iif we are alone, or we send a message to each quorum + * member + * @post We are on STATE_ACTIVE, iif we are alone, or on + * STATE_UPDATING otherwise + * + * @param value The value being proposed to the quorum + */ void begin(bufferlist& value); - void handle_begin(MMonPaxos*); - void handle_accept(MMonPaxos*); + /** + * Accept or decline (by ignoring) a proposal from the Leader. + * + * We will decline the proposal (by ignoring it) if we have promised to + * accept a higher numbered proposal. If that is not the case, we will + * accept it and accordingly reply to the Leader. + * + * @pre We are a Peon + * @pre We are on STATE_ACTIVE + * @post We are on STATE_UPDATING iif we accept the Leader's proposal + * @post We send a reply message to the Leader iif we accept his proposal + * + * @invariant The received message is an operation of type OP_BEGIN + * + * @param begin The message sent by the Leader to the Peon during the + * Paxos::begin function + * + */ + void handle_begin(MMonPaxos *begin); + /** + * Handle an Accept message sent by a Peon. + * + * In order to commit, the Leader has to receive accepts from a majority of + * the quorum. If that does happen, then the Leader may proceed with the + * commit. However, the Leader needs the accepts from all the quorum members + * in order to extend the lease and move on to STATE_ACTIVE. + * + * This function handles these two situations, accounting for the amount of + * received accepts. + * + * @pre We are the Leader + * @pre We are on STATE_UPDATING + * @post We are on STATE_ACTIVE iif we received accepts from the full quorum + * @post We extended the lease iif we moved on to STATE_ACTIVE + * @post We are on STATE_UPDATING iif we didn't received accepts from the + * full quorum + * @post We have committed iif we received accepts from a majority + * + * @invariant The received message is an operation of type OP_ACCEPT + * + * @param accept The message sent by the Peons to the Leader during the + * Paxos::handle_begin function + */ + void handle_accept(MMonPaxos *accept); + /** + * Trigger a fresh election. + * + * During Paxos::begin we set a Callback of type Paxos::C_AcceptTimeout in + * order to limit the amount of time we spend waiting for Accept replies. + * This callback will call Paxos::accept_timeout when it is fired. + * + * This is essential to the algorithm because there may be the chance that + * we are no longer the Leader (i.e., others don't believe in us) and we + * are getting ignored, or we dropped out of the quorum and haven't realised + * it. So, our only option is to trigger fresh elections. + * + * @pre We are the Leader + * @pre We are on STATE_UPDATING + * @post Triggered fresh elections + */ void accept_timeout(); + /** + * @} + */ + /** + * Commit a value throughout the system. + * + * The Leader will cancel the current lease (as it was for the old value), + * and will store the committed value locally. It will then instruct every + * quorum member to do so as well. + * + * @pre We are the Leader + * @pre We are on STATE_UPDATING + * @pre A majority of quorum members accepted our proposal + * @post Value locally stored + * @post Quorum members instructed to commit the new value. + */ void commit(); - void handle_commit(MMonPaxos*); + /** + * Commit the new value to stable storage as being the latest available + * version. + * + * @pre We are a Peon + * @post The new value is locally stored + * @post Fire up the callbacks waiting on waiting_for_commit + * + * @invariant The received message is an operation of type OP_COMMIT + * + * @param commit The message sent by the Leader to the Peon during + * Paxos::commit + */ + void handle_commit(MMonPaxos *commit); + /** + * Extend the system's lease. + * + * This means that the Leader considers that it should now safe to read from + * any node on the system, since every quorum member is now in possession of + * the latest version. Therefore, the Leader will send a message stating just + * this to each quorum member, and will impose a limited timeframe during + * which acks will be accepted. If there aren't as many acks as expected + * (i.e, if at least one quorum member does not ack the lease) during this + * timeframe, then we will force fresh elections. + * + * @pre We are the Leader + * @pre We are on STATE_ACTIVE + * @post A message extending the lease is sent to each quorum member + * @post A timeout callback is set to limit the amount of time we will wait + * for lease acks. + * @post A timer is set in order to renew the lease after a certain amount + * of time. + */ void extend_lease(); - void handle_lease(MMonPaxos*); - void handle_lease_ack(MMonPaxos*); - - void lease_ack_timeout(); // on leader, if lease isn't acked by all peons - void lease_renew_timeout(); // on leader, to renew the lease + /** + * Update the lease on the Peon's side of things. + * + * Once a Peon receives a Lease message, it will update its lease_expire + * variable, reply to the Leader acknowledging the lease update and set a + * timeout callback to be fired upon the lease's expiration. Finally, the + * Peon will fire up all the callbacks waiting for it to become active, + * which it just did, and all those waiting for it to become readable, + * which should be true if the Peon's lease didn't expire in the mean time. + * + * @pre We are a Peon + * @post We update the lease accordingly + * @post A lease timeout callback is set + * @post Move to STATE_ACTIVE + * @post Fire up all the callbacks waiting for STATE_ACTIVE + * @post Fire up all the callbacks waiting for readable iif we are readable + * @post Ack the lease to the Leader + * + * @invariant The received message is an operation of type OP_LEASE + * + * @param The message sent by the Leader to the Peon during the + * Paxos::extend_lease function + */ + void handle_lease(MMonPaxos *lease); + /** + * Account for all the Lease Acks the Leader receives from the Peons. + * + * Once the Leader receives all the Lease Acks from the Peons, it will be + * able to cancel the Lease Ack timeout callback, thus avoiding calling + * fresh elections. + * + * @pre We are the Leader + * @post Cancel the Lease Ack timeout callback iif we receive acks from all + * the quorum members + * + * @invariant The received message is an operation of type OP_LEASE_ACK + * + * @param ack The message sent by a Peon to the Leader during the + * Paxos::handle_lease function + */ + void handle_lease_ack(MMonPaxos *ack); + /** + * Call fresh elections because at least one Peon didn't acked our lease. + * + * @pre We are the Leader + * @pre We are on STATE_ACTIVE + * @post Trigger fresh elections + */ + void lease_ack_timeout(); + /** + * Extend lease since we haven't had new committed values meanwhile. + * + * @pre We are the Leader + * @pre We are on STATE_ACTIVE + * @post Go through with Paxos::extend_lease + */ + void lease_renew_timeout(); + /** + * Call fresh elections because the Peon's lease expired without being + * renewed or receiving a fresh lease. + * + * This means that the Peon is no longer assumed as being in the quorum + * (or there is no Leader to speak of), so just trigger fresh elections + * to circumvent this issue. + * + * @pre We are a Peon + * @post Trigger fresh elections + */ void lease_timeout(); // on peon, if lease isn't extended + /** + * Cancel all of Paxos' timeout/renew events. + */ void cancel_events(); + /** + * Generate a new Proposal Number based on @p gt + * + * @todo Check what @p gt actually means and what its usage entails + * @param gt A hint for the geration of the Proposal Number + * @return A globally unique, monotonically increasing Proposal Number + */ version_t get_new_proposal_number(version_t gt=0); - + + /** + * @todo document sync function + */ void warn_on_future_time(utime_t t, entity_name_t from); public: + /** + * @param m A monitor + * @param mid A machine id + */ Paxos(Monitor *m, int mid) : mon(m), machine_id(mid), @@ -256,63 +865,253 @@ public: bool is_consistent(); void restart(); + /** + * Initiate the Leader after it wins an election. + * + * Once an election is won, the Leader will be initiated and there are two + * possible outcomes of this method: the Leader directly jumps to the active + * state (STATE_ACTIVE) if it believes to be the only one in the quorum, or + * will start recovering (STATE_RECOVERING) by initiating the collect phase. + * + * @pre Our monitor is the Leader. + * @post We are either on STATE_ACTIVE if we're the only one in the quorum, + * or on STATE_RECOVERING otherwise. + */ void leader_init(); + /** + * Initiate a Peon after it loses an election. + * + * If we are a Peon, then there must be a Leader and we are not alone in the + * quorum, thus automatically assume we are on STATE_RECOVERING, which means + * we will soon be enrolled into the Leader's collect phase. + * + * @pre There is a Leader, and he's about to start the collect phase. + * @post We are on STATE_RECOVERING and will soon receive collect phase's + * messages. + */ void peon_init(); - void share_state(MMonPaxos *m, version_t first_committed, version_t last_committed); + /** + * Include an incremental state of values, ranging from peer_first_committed + * to the last committed value, on the message m + * + * @param m A message + * @param peer_first_committed Lowest version to take into account + * @param peer_last_committed Highest version to take into account + */ + void share_state(MMonPaxos *m, version_t peer_first_committed, + version_t peer_last_committed); + /** + * Store the state held on the message m into local, stable storage. + * + * @param m A message + */ void store_state(MMonPaxos *m); + /** + * @todo This appears to be used only by the OSDMonitor, and I would say + * its objective is to allow a third-party to have a "private" + * state dir. -JL + */ void add_extra_state_dir(string s) { extra_state_dirs.push_back(s); } // -- service interface -- + /** + * Add c to the list of callbacks waiting for us to become active. + * + * @param c A callback + */ void wait_for_active(Context *c) { waiting_for_active.push_back(c); } + /** + * Erase old states from stable storage. + * + * @param first The version we are trimming to + * @param force If specified, we may even erase the latest stashed version + * iif @p first is higher than that version. + */ void trim_to(version_t first, bool force=false); - + + /** + * @defgroup Paxos_h_slurping_funcs Slurping-related functions + * @todo Discover what slurping is + * @{ + */ void start_slurping(); void end_slurping(); bool is_slurping() { return slurping == 1; } + /** + * @} + */ // read + /** + * @defgroup Paxos_h_read_funcs Read-related functions + * @{ + */ + /** + * Get latest committed version + * + * @return latest committed version + */ version_t get_version() { return last_committed; } + /** + * Get first committed version + * + * @return the first committed version + */ + version_t get_first_committed() { return first_committed; } + /** + * Check if a given version is readable. + * + * A version may not be readable for a myriad of reasons: + * @li the version @v is higher that the last committed version + * @li we are not the Leader nor a Peon (election may be on-going) + * @li we do not have a committed value yet + * @li we do not have a valid lease + * + * @param seen The version we want to check if it is readable. + * @return 'true' if the version is readable; 'false' otherwise. + */ bool is_readable(version_t seen=0); + /** + * Read version @v and store its value in @bl + * + * @param[in] v The version we want to read + * @param[out] bl The version's value + * @return 'true' if we successfully read the value; 'false' otherwise + */ bool read(version_t v, bufferlist &bl); + /** + * Read the latest committed version + * + * @param[out] bl The version's value + * @return the latest committed version if we successfully read the value; + * or 0 (zero) otherwise. + */ version_t read_current(bufferlist &bl); + /** + * Add onreadable to the list of callbacks waiting for us to become readable. + * + * @param onreadable A callback + */ void wait_for_readable(Context *onreadable) { //assert(!is_readable()); waiting_for_readable.push_back(onreadable); } + /** + * @} + */ - // write + /** + * @warning This declaration is not implemented anywhere and appears to be + * just some lingering code. + */ bool is_leader(); + // write + /** + * @defgroup Paxos_h_write_funcs Write-related functions + * @{ + */ + /** + * Check if we are writeable. + * + * We are writeable if we are alone (i.e., a quorum of one), or if we match + * all the following conditions: + * @li We are the Leader + * @li We are on STATE_ACTIVE + * @li We have a valid lease + * + * @return 'true' if we are writeable; 'false' otherwise. + */ bool is_writeable(); + /** + * Add c to the list of callbacks waiting for us to become writeable. + * + * @param c A callback + */ void wait_for_writeable(Context *c) { assert(!is_writeable()); waiting_for_writeable.push_back(c); } + /** + * Propose a new value to the Leader. + * + * This function enables the submission of a new value to the Leader, which + * will trigger a new proposal. + * + * @param bl A bufferlist holding the value to be proposed + * @param oncommit A callback to be fired up once we finish committing bl + */ bool propose_new_value(bufferlist& bl, Context *oncommit=0); + /** + * Add oncommit to the back of the list of callbacks waiting for us to + * finish committing. + * + * @param oncommit A callback + */ void wait_for_commit(Context *oncommit) { waiting_for_commit.push_back(oncommit); } + /** + * Add oncommit to the front of the list of callbacks waiting for us to + * finish committing. + * + * @param oncommit A callback + */ void wait_for_commit_front(Context *oncommit) { waiting_for_commit.push_front(oncommit); } + /** + * @} + */ - // if state values are incrementals, it is usefult to keep - // the latest copy of the complete structure. + /** + * @defgroup Paxos_h_stash_funcs State values stashing-related functions + * + * If the state values are incrementals, it is useful to keep the latest + * copy of the complete structure. + * + * @{ + */ + /** + * Get the latest version onto stable storage. + * + * Keeping the latest version on a predefined location makes it easier to + * access, since we know we always have the latest version on the same + * place. + * + * @param v the latest version + * @param bl the latest version's value + */ void stash_latest(version_t v, bufferlist& bl); + /** + * Get the latest stashed version's value + * + * @param[out] bl the latest stashed version's value + * @return the latest stashed version + */ version_t get_stashed(bufferlist& bl); + /** + * Get the latest stashed version + * + * @return the latest stashed version + */ version_t get_stashed_version() { return latest_stashed; } + /** + * @} + */ - version_t get_first_committed() { return first_committed; } + /** + * @} + */ }; - #endif diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc index 812ae996a17..38fbceab221 100644 --- a/src/mon/PaxosService.cc +++ b/src/mon/PaxosService.cc @@ -104,6 +104,7 @@ bool PaxosService::should_propose(double& delay) return true; } + void PaxosService::propose_pending() { dout(10) << "propose_pending" << dendl; @@ -115,7 +116,14 @@ void PaxosService::propose_pending() proposal_timer = 0; } - // finish and encode + /** + * @note The value we propose is encoded in a bufferlist, passed to + * Paxos::propose_new_value and it is obtained by calling a + * function that must be implemented by the class implementing us. + * I.e., the function encode_pending will be the one responsible + * to encode whatever is pending on the implementation class into a + * bufferlist, so we can then propose that as a value through Paxos. + */ bufferlist bl; encode_pending(bl); have_pending = false; diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h index e8c7ca41d85..2cb59a3d61a 100644 --- a/src/mon/PaxosService.h +++ b/src/mon/PaxosService.h @@ -21,12 +21,42 @@ class Monitor; class Paxos; +/** + * A Paxos Service is an abstraction that easily allows one to obtain an + * association between a Monitor and a Paxos class, in order to implement any + * service. + */ class PaxosService { + /** + * @defgroup PaxosService_h_class Paxos Service + * @{ + */ public: + /** + * The Monitor to which this class is associated with + */ Monitor *mon; + /** + * The Paxos instance to which this class is associated with + */ Paxos *paxos; protected: + /** + * @defgroup PaxosService_h_callbacks Callback classes + * @{ + */ + /** + * Retry dispatching a given service message + * + * This callback class is used when we had to wait for some condition to + * become true while we were dispatching it. + * + * For instance, if the message's version isn't readable, according to Paxos, + * then we must wait for it to become readable. So, we just queue an + * instance of this class onto the Paxos::wait_for_readable function, and + * we will retry the whole dispatch again once the callback is fired. + */ class C_RetryMessage : public Context { PaxosService *svc; PaxosServiceMessage *m; @@ -36,6 +66,15 @@ protected: svc->dispatch(m); } }; + + /** + * Callback used to make sure we call the PaxosService::_active function + * whenever a condition is fulfilled. + * + * This is used in multiple situations, from waiting for the Paxos to commit + * our proposed value, to waiting for the Paxos to become active once an + * election is finished. + */ class C_Active : public Context { PaxosService *svc; public: @@ -46,6 +85,10 @@ protected: } }; + /** + * Callback class used to propose the pending value once the proposal_timer + * fires up. + */ class C_Propose : public Context { PaxosService *ps; public: @@ -54,112 +97,235 @@ protected: ps->proposal_timer = 0; ps->propose_pending(); } - }; + }; + /** + * @} + */ friend class C_Propose; private: + /** + * Event callback responsible for proposing our pending value once a timer + * runs out and fires. + */ Context *proposal_timer; + /** + * If the implementation class has anything pending to be proposed to Paxos, + * then have_pending should be true; otherwise, false. + */ bool have_pending; public: + /** + * @param mn A Monitor instance + * @param p A Paxos instance + */ PaxosService(Monitor *mn, Paxos *p) : mon(mn), paxos(p), proposal_timer(0), have_pending(false) { } virtual ~PaxosService() {} + /** + * Get the machine name. + * + * @returns The machine name. + */ const char *get_machine_name(); // i implement and you ignore + /** + * Informs this instance that it should consider itself restarted. + * + * This means that we will cancel our proposal_timer event, if any exists. + */ void restart(); + /** + * Informs this instance that an election has finished. + * + * This means that we will invoke a PaxosService::discard_pending while + * setting have_pending to false (basically, ignore our pending state) and + * we will then make sure we obtain a new state. + * + * Our state shall be updated by PaxosService::_active if the Paxos is + * active; otherwise, we will wait for it to become active by adding a + * PaxosService::C_Active callback to it. + */ void election_finished(); + /** + * Informs this instance that it is supposed to shutdown. + * + * Basically, it will instruct Paxos to cancel all events/callbacks and then + * will cancel the proposal_timer event if any exists. + */ void shutdown(); private: + /** + * Update our state by updating it from Paxos, and then creating a new + * pending state if need be. + * + * @remarks We only create a pending state we our Monitor is the Leader. + * + * @pre Paxos is active + * @post have_pending is true iif our Monitor is the Leader and Paxos is + * active + */ void _active(); public: - // i implement and you use - void propose_pending(); // propose current pending as new paxos state + /** + * Propose a new value through Paxos. + * + * This function should be called by the classes implementing + * PaxosService, in order to propose a new value through Paxos. + * + * @pre The implementation class implements the encode_pending function. + * @pre have_pending is true + * @pre Our monitor is the Leader + * @pre Paxos is active + * @post Cancel the proposal timer, if any + * @post have_pending is false + * @post propose pending value through Paxos + * + * @note This function depends on the implementation of encode_pending on + * the class that is implementing PaxosService + */ + void propose_pending(); + /** + * Dispatch a message by passing it to several different functions that are + * either implemented directly by this service, or that should be implemented + * by the class implementing this service. + * + * @param m A message + * @returns 'true' on successful dispatch; 'false' otherwise. + */ bool dispatch(PaxosServiceMessage *m); - // you implement - /* + /** + * @defgroup PaxosService_h_override_funcs Functions that should be + * overridden. + * + * These functions should be overridden at will by the class implementing + * this service. + * @{ + */ + /** * Create the initial state for your system. - * In some of ours the state is actually set up - * elsewhere so this does nothing. + * + * In some of ours the state is actually set up elsewhere so this does + * nothing. */ virtual void create_initial() = 0; - /* - * Query the Paxos system for the latest state and apply it if - * it's newer than the current Monitor state. - * Return true on success. + /** + * Query the Paxos system for the latest state and apply it if it's newer + * than the current Monitor state. + * + * @returns 'true' on success; 'false' otherwise. */ virtual void update_from_paxos() = 0; - /* - * This function is only called on a leader. Create the pending state. - * (this created state is then modified by incoming messages). - * Called at startup and after every Paxos ratification round. + /** + * Create the pending state. + * + * @invariant This function is only called on a Leader. + * @remarks This created state is then modified by incoming messages. + * @remarks Called at startup and after every Paxos ratification round. */ virtual void create_pending() = 0; - /* - * This function is only called on a leader. Encode the pending state - * into a bufferlist for ratification and transmission - * as the next state. + /** + * Encode the pending state into a bufferlist for ratification and + * transmission as the next state. + * + * @invariant This function is only called on a Leader. + * + * @param[out] bl A bufferlist containing the encoded pending state */ virtual void encode_pending(bufferlist& bl) = 0; - /* - * As this function is NOT overridden in any of our code, - * but it is called in election_finished if have_pending. + /** + * Discard the pending state + * + * @invariant This function is only called on a Leader. + * + * @remarks This function is NOT overridden in any of our code, but it is + * called in PaxosService::election_finished if have_pending is + * true. */ - virtual void discard_pending() { } // [leader] discard pending + virtual void discard_pending() { } - /* - * Look at the query; if the query can be handled without changing - * state, do so. - * Return true if the query was handled (ie, was a read that got answered, - * was a state change that has no effect), false otherwise. + /** + * Look at the query; if the query can be handled without changing state, + * do so. + * + * @param m A query message + * @returns 'true' if the query was handled (e.g., was a read that got + * answered, was a state change that has no effect); 'false' + * otherwise. */ virtual bool preprocess_query(PaxosServiceMessage *m) = 0; - /* - * This function is only called on the leader. Apply the message - * to the pending state. + /** + * Apply the message to the pending state. + * + * @invariant This function is only called on a Leader. + * + * @param m An update message + * @returns 'true' if the update message was handled (e.g., a command that + * went through); 'false' otherwise. */ virtual bool prepare_update(PaxosServiceMessage *m) = 0; + /** + * @} + */ - /* - * Determine if the Paxos system should vote on pending, and - * if so how long it should wait to vote. - * Returns true if the Paxos system should propose, - * and fills in the delay paramater with the wait time - * (so you can limit update traffic spamming). + /** + * Determine if the Paxos system should vote on pending, and if so how long + * it should wait to vote. + * + * @param[out] delay The wait time, used so we can limit the update traffic + * spamming. + * @returns 'true' if the Paxos system should propose; 'false' otherwise. */ virtual bool should_propose(double &delay); - /* + /** + * @defgroup PaxosService_h_courtesy Courtesy functions + * + * Courtesy functions, in case the class implementing this service has + * anything it wants/needs to do at these times. + * @{ + */ + /** * This is called when the Paxos state goes to active. - * It's a courtesy method if you have things you want/need - * to do at that time. * - * Note that is may get called twice in certain recovery cases. + * @remarks It's a courtesy method, in case the class implementing this + * service has anything it wants/needs to do at that time. + * + * @note This function may get called twice in certain recovery cases. */ virtual void on_active() { } - /* - * Another courtesy method. Called when the Paxos - * system enters a leader election. + /** + * Called when the Paxos system enters a Leader election. + * + * @remarks It's a courtesy method, in case the class implementing this + * service has anything it wants/needs to do at that time. */ virtual void on_restart() { } + /** + * @} + */ + /** + * Tick. + */ virtual void tick() {} /** - * get health information + * Get health information * * @param summary list of summary strings and associated severity * @param detail optional list of detailed problem reports; may be NULL @@ -167,6 +333,9 @@ public: virtual void get_health(list<pair<health_status_t,string> >& summary, list<pair<health_status_t,string> > *detail) const { } + /** + * @} + */ }; #endif |