summaryrefslogtreecommitdiff
path: root/src/mon
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2012-05-31 16:42:38 -0700
committerSage Weil <sage@inktank.com>2012-05-31 16:42:38 -0700
commita4c90b725024c155b79d511822ce541aa83f5d65 (patch)
tree61292bfa1f567c5429055a69263e56183862d114 /src/mon
parentfb7ce59b9a18ac26882836de366629a4593855a6 (diff)
parent504c6ce95799fe3f559cdfee4e758a6ea5608ab9 (diff)
downloadceph-a4c90b725024c155b79d511822ce541aa83f5d65.tar.gz
Merge remote-tracking branch 'gh/wip-mon-doc'
Diffstat (limited to 'src/mon')
-rw-r--r--src/mon/Elector.cc10
-rw-r--r--src/mon/Elector.h328
-rw-r--r--src/mon/Monitor.h19
-rw-r--r--src/mon/OSDMonitor.cc10
-rw-r--r--src/mon/Paxos.cc95
-rw-r--r--src/mon/Paxos.h877
-rw-r--r--src/mon/PaxosService.cc10
-rw-r--r--src/mon/PaxosService.h259
8 files changed, 1460 insertions, 148 deletions
diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc
index 6b91f5a8d76..cc25cba82a2 100644
--- a/src/mon/Elector.cc
+++ b/src/mon/Elector.cc
@@ -78,8 +78,8 @@ void Elector::start()
// bcast to everyone else
for (unsigned i=0; i<mon->monmap->size(); ++i) {
if ((int)i == mon->rank) continue;
- mon->messenger->send_message(new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap),
- mon->monmap->get_inst(i));
+ Message *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);
+ mon->messenger->send_message(m, mon->monmap->get_inst(i));
}
reset_timer();
@@ -294,7 +294,8 @@ void Elector::dispatch(Message *m)
return;
}
if (m->get_source().num() >= mon->monmap->size()) {
- dout(5) << " ignoring bogus election message with bad mon rank " << m->get_source() << dendl;
+ dout(5) << " ignoring bogus election message with bad mon rank "
+ << m->get_source() << dendl;
m->put();
return;
}
@@ -303,7 +304,8 @@ void Elector::dispatch(Message *m)
// assume an old message encoding would have matched
if (em->fsid != mon->monmap->fsid) {
- dout(0) << " ignoring election msg fsid " << em->fsid << " != " << mon->monmap->fsid << dendl;
+ dout(0) << " ignoring election msg fsid "
+ << em->fsid << " != " << mon->monmap->fsid << dendl;
m->put();
return;
}
diff --git a/src/mon/Elector.h b/src/mon/Elector.h
index c4e0f5cad6b..3f3b508f45b 100644
--- a/src/mon/Elector.h
+++ b/src/mon/Elector.h
@@ -28,31 +28,141 @@ using namespace std;
class Monitor;
-
+/**
+ * This class is responsible for maintaining the local state when electing
+ * a new Leader. We may win or we may lose. If we win, it means we became the
+ * Leader; if we lose, it means we are a Peon.
+ */
class Elector {
+ /**
+ * @defgroup Elector_h_class Elector
+ * @{
+ */
private:
+ /**
+ * The Monitor instance associated with this class.
+ */
Monitor *mon;
+ /**
+ * Event callback responsible for dealing with an expired election once a
+ * timer runs out and fires up.
+ */
Context *expire_event;
+ /**
+ * Resets the expire_event timer, by cancelling any existing one and
+ * scheduling a new one.
+ *
+ * @remarks This function assumes as a default firing value the duration of
+ * the monitor's lease interval, and adds to it the value specified
+ * in @plus
+ *
+ * @post expire_event is set
+ *
+ * @param plus The amount of time to be added to the default firing value.
+ */
void reset_timer(double plus=0.0);
+ /**
+ * Cancel the expire_event timer, if it is defined.
+ *
+ * @post expire_event is not set
+ */
void cancel_timer();
- epoch_t epoch; // latest epoch we've seen. odd == election, even == stable,
+ /**
+ * Latest epoch we've seen.
+ *
+ * @remarks if its value is odd, we're electing; if it's even, then we're
+ * stable.
+ */
+ epoch_t epoch;
+ /**
+ * Indicates if we are participating in the quorum.
+ *
+ * @remarks By default, we are created as participating. We may stop
+ * participating if the Monitor explicitely calls
+ * Elector::stop_participating though. If that happens, it will
+ * have to call Elector::start_participating for us to resume
+ * participating in the quorum.
+ */
bool participating;
// electing me
+ /**
+ * @defgroup Elector_h_electing_me_vars We are being elected
+ * @{
+ */
+ /**
+ * Indicates if we are the ones being elected.
+ *
+ * We always attempt to be the one being elected if we are the ones starting
+ * the election. If we are not the ones that started it, we will only attempt
+ * to be elected if we think we might have a chance (i.e., the other guy's
+ * rank is lower than ours).
+ */
bool electing_me;
+ /**
+ * Holds the time at which we started the election.
+ */
utime_t start_stamp;
+ /**
+ * Set containing all those that acked our proposal to become the Leader.
+ *
+ * If we are acked by everyone in the MonMap, we will declare victory.
+ */
set<int> acked_me;
+ /**
+ * @}
+ */
+ /**
+ * @defgroup Elector_h_electing_them_vars We are electing another guy
+ * @{
+ */
+ /**
+ * Indicates who we have acked
+ */
+ int leader_acked;
+ /**
+ * Indicates when we have acked him
+ */
+ utime_t ack_stamp;
+ /**
+ * @}
+ */
+
+ /**
+ * Update our epoch.
+ *
+ * If we come across a higher epoch, we simply update ours, also making
+ * sure we are no longer being elected (even though we could have been,
+ * we no longer are since we no longer are on that old epoch).
+ *
+ * @pre Our epoch is lower than @p e
+ * @post Our epoch equals @p e
+ *
+ * @param e Epoch to which we will update our epoch
+ */
+ void bump_epoch(epoch_t e=0);
- // electing them
- int leader_acked; // who i've acked
- utime_t ack_stamp; // and when
-
- void bump_epoch(epoch_t e=0); // i just saw a larger epoch
-
+ /**
+ * @defgroup Elector_h_callbacks Callbacks
+ * @{
+ */
+ /**
+ * This class is used as the callback when the expire_event timer fires up.
+ *
+ * If the expire_event is fired, then it means that we had an election going,
+ * either started by us or by some other participant, but it took too long,
+ * thus expiring.
+ *
+ * When the election expires, we will check if we were the ones who won, and
+ * if so we will declare victory. If that is not the case, then we assume
+ * that the one we defered to didn't declare victory quickly enough (in fact,
+ * as far as we know, we may even be dead); so, just propose ourselves as the
+ * Leader.
+ */
class C_ElectionExpire : public Context {
Elector *elector;
public:
@@ -61,17 +171,155 @@ class Elector {
elector->expire();
}
};
+ /**
+ * @}
+ */
- void start(); // start an electing me
+ /**
+ * Start new elections by proposing ourselves as the new Leader.
+ *
+ * Basically, send propose messages to all the monitors in the MonMap and
+ * then reset the expire_event timer so we can limit the amount of time we
+ * will be going at it.
+ *
+ * @pre participating is true
+ * @post epoch is an odd value
+ * @post electing_me is true
+ * @post we sent propose messages to all the monitors in the MonMap
+ * @post we reset the expire_event timer
+ */
+ void start();
+ /**
+ * Defer the current election to some other monitor.
+ *
+ * This means that we will ack some other monitor and drop out from the run
+ * to become the Leader. We will only defer an election if the monitor we
+ * are deferring to outranks us.
+ *
+ * @pre @p who outranks us (i.e., who < our rank)
+ * @pre @p who outranks any other monitor we have deferred to in the past
+ * @post electing_me is false
+ * @post leader_acked equals @p who
+ * @post we sent an ack message to @p who
+ * @post we reset the expire_event timer
+ *
+ * @param who Some other monitor's numeric identifier.
+ */
void defer(int who);
- void expire(); // timer goes off
+ /**
+ * The election has taken too long and has expired.
+ *
+ * This will happen when no one declared victory or started a new election
+ * during the time span allowed by the expire_event timer.
+ *
+ * When the election expires, we will check if we were the ones who won, and
+ * if so we will declare victory. If that is not the case, then we assume
+ * that the one we defered to didn't declare victory quickly enough (in fact,
+ * as far as we know, we may even be dead); so, just propose ourselves as the
+ * Leader.
+ */
+ void expire();
+ /**
+ * Declare Victory.
+ *
+ * We won. Or at least we believe we won, but for all intentions and purposes
+ * that does not matter. What matters is that we Won.
+ *
+ * That said, we must now bump our epoch to reflect that the election is over
+ * and then we must let everybody in the quorum know we are their brand new
+ * Leader. And we will also cancel our expire_event timer.
+ *
+ * Actually, the quorum will be now defined as the group of monitors that
+ * acked us during the election process.
+ *
+ * @pre Election is on-going
+ * @pre electing_me is true
+ * @post electing_me is false
+ * @post epoch is bumped up into an even value
+ * @post Election is not on-going
+ * @post We have a quorum, composed of the monitors that acked us
+ * @post We sent a message of type OP_VICTORY to each quorum member.
+ */
void victory();
-
+
+ /**
+ * Handle a message from some other node proposing himself to become him
+ * the Leader.
+ *
+ * If the message appears to be old (i.e., its epoch is lower than our epoch),
+ * then we may take one of two actions:
+ *
+ * @li Ignore it because it's nothing more than an old proposal
+ * @li Start new elections if we verify that it was sent by a monitor from
+ * outside the quorum; given its old state, it's fair to assume he just
+ * started, so we should start new elections so he may rejoin
+ *
+ * If we did not ignore the received message, then we know that this message
+ * was sent by some other node proposing himself to become the Leader. So, we
+ * will take one of the following actions:
+ *
+ * @li Ignore him because we already acked another node with higher rank
+ * @li Ignore him and start a new election because we outrank him
+ * @li Defer to him because he outranks us and the node we previously
+ * acked, if any
+ *
+ *
+ * @invariant The received message is an operation of type OP_PROPOSE
+ *
+ * @param m A message sent by another participant in the quorum.
+ */
void handle_propose(class MMonElection *m);
+ /**
+ * Handle a message from some other participant Acking us as the Leader.
+ *
+ * When we receive such a message, one of three thing may be happening:
+ * @li We received a message with a newer epoch, which means we must have
+ * somehow lost track of what was going on (maybe we rebooted), thus we
+ * will start a new election
+ * @li We consider ourselves in the run for the Leader (i.e., @p electing_me
+ * is true), and we are actually being Acked by someone; thus simply add
+ * the one acking us to the @p acked_me set. If we do now have acks from
+ * all the participants, then we can declare victory
+ * @li We already deferred the election to somebody else, so we will just
+ * ignore this message
+ *
+ * @pre Election is on-going
+ * @post Election is on-going if we deferred to somebody else
+ * @post Election is on-going if we are still waiting for further Acks
+ * @post Election is not on-going if we are victorious
+ * @post Election is not on-going if we must start a new one
+ *
+ * @param m A message with an operation type of OP_ACK
+ */
void handle_ack(class MMonElection *m);
+ /**
+ * Handle a message from some other participant declaring Victory.
+ *
+ * We just got a message from someone declaring themselves Victorious, thus
+ * the new Leader.
+ *
+ * However, if the message's epoch happens to be different from our epoch+1,
+ * then it means we lost track of something and we must start a new election.
+ *
+ * If that is not the case, then we will simply update our epoch to the one
+ * in the message, cancel our @p expire_event timer and inform our Monitor
+ * that we lost the election and provide it with the new quorum.
+ *
+ * @pre Election in on-going
+ * @post Election is not on-going
+ * @post Updated @p epoch
+ * @post We have a new quorum if we lost the election
+ *
+ * @param m A message with an operation type of OP_VICTORY
+ */
void handle_victory(class MMonElection *m);
- public:
+ public:
+ /**
+ * Create an Elector class
+ *
+ * @param m A Monitor instance
+ */
Elector(Monitor *m) : mon(m),
expire_event(0),
epoch(0),
@@ -79,20 +327,74 @@ class Elector {
electing_me(false),
leader_acked(-1) { }
+ /**
+ * Initiate the Elector class.
+ *
+ * Basically, we will simply read whatever epoch value we have in our stable
+ * storage, or consider it to be 1 if none is read.
+ *
+ * @post @p epoch is set to 1 or higher.
+ */
void init();
+ /**
+ * Inform this class it is supposed to shutdown.
+ *
+ * We will simply cancel the @p expire_event if any exists.
+ *
+ * @post @p expire_event is cancelled
+ */
void shutdown();
+ /**
+ * Obtain our epoch
+ *
+ * @returns Our current epoch number
+ */
epoch_t get_epoch() { return epoch; }
+ /**
+ * Handle received messages.
+ *
+ * We will ignore all messages that are not of type @p MSG_MON_ELECTION
+ * (i.e., messages whose interface is not of type @p MMonElection). All of
+ * those that are will then be dispatched to their operation-specific
+ * functions.
+ *
+ * @param m A received message
+ */
void dispatch(Message *m);
+ /**
+ * Call an election.
+ *
+ * This function simply calls Elector::start.
+ */
void call_election() {
start();
}
+ /**
+ * Stop participating in subsequent Elections.
+ *
+ * @post @p participating is false
+ */
void stop_participating() { participating = false; }
+ /**
+ * Start participating in Elections.
+ *
+ * If we are already participating (i.e., @p participating is true), then
+ * calling this function is moot.
+ *
+ * However, if we are not participating (i.e., @p participating is false),
+ * then we will start participating by setting @p participating to true and
+ * we will call for an Election.
+ *
+ * @post @p participating is true
+ */
void start_participating();
+ /**
+ * @}
+ */
};
-
#endif
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index 381138a97a1..a7a3f9abe9a 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -284,7 +284,7 @@ public:
void handle_route(MRoute *m);
/**
- * generate health report
+ * Generate health report
*
* @param status one-line status summary
* @param detailbl optional bufferlist* to fill with a detailed report
@@ -295,12 +295,27 @@ public:
void reply_command(MMonCommand *m, int rc, const string &rs, bufferlist& rdata, version_t version);
void handle_probe(MMonProbe *m);
+ /**
+ * Handle a Probe Operation, replying with our name, quorum and known versions.
+ *
+ * We use the MMonProbe message class for anything and everything related with
+ * Monitor probing. One of the operations relates directly with the probing
+ * itself, in which we receive a probe request and to which we reply with
+ * our name, our quorum and the known versions for each Paxos service. Thus the
+ * redundant function name. This reply will obviously be sent to the one
+ * probing/requesting these infos.
+ *
+ * @todo Add @pre and @post
+ *
+ * @param m A Probe message, with an operation of type Probe.
+ */
void handle_probe_probe(MMonProbe *m);
void handle_probe_reply(MMonProbe *m);
void handle_probe_slurp(MMonProbe *m);
void handle_probe_slurp_latest(MMonProbe *m);
void handle_probe_data(MMonProbe *m);
- /* Given an MMonProbe and associated Paxos machine, create a reply,
+ /**
+ * Given an MMonProbe and associated Paxos machine, create a reply,
* fill it with the missing Paxos states and current commit pointers
*
* @param m The incoming MMonProbe. We use this to determine the range
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index ec083c3fda6..86896809d0b 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -603,6 +603,10 @@ bool OSDMonitor::can_mark_up(int i)
return true;
}
+/**
+ * @note the parameter @p i apparently only exists here so we can output the
+ * osd's id on messages.
+ */
bool OSDMonitor::can_mark_out(int i)
{
if (osdmap.test_flag(CEPH_OSDMAP_NOOUT)) {
@@ -1252,6 +1256,12 @@ void OSDMonitor::tick()
// mark down osds out?
utime_t now = ceph_clock_now(g_ceph_context);
+
+ /* can_mark_out() checks if we can mark osds as being out. The -1 has no
+ * influence at all. The decision is made based on the ratio of "in" osds,
+ * and the function returns false if this ratio is lower that the minimum
+ * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
+ */
if (can_mark_out(-1)) {
map<int,utime_t>::iterator i = down_pending_out.begin();
while (i != down_pending_out.end()) {
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index 6014c456454..ebf1b815309 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -23,9 +23,10 @@
#define dout_subsys ceph_subsys_paxos
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, mon->name, mon->rank, machine_name, state, first_committed, last_committed)
-static ostream& _prefix(std::ostream *_dout, Monitor *mon, const string& name, int rank,
- const char *machine_name, int state,
- version_t first_committed, version_t last_committed) {
+static ostream& _prefix(std::ostream *_dout, Monitor *mon, const string& name,
+ int rank, const char *machine_name, int state,
+ version_t first_committed, version_t last_committed)
+{
return *_dout << "mon." << name << "@" << rank
<< "(" << mon->get_state_name() << ")"
<< ".paxos(" << machine_name << " " << Paxos::get_statename(state)
@@ -89,8 +90,8 @@ void Paxos::collect(version_t oldpn)
++p) {
if (*p == mon->rank) continue;
- MMonPaxos *collect = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COLLECT, machine_id,
- ceph_clock_now(g_ceph_context));
+ MMonPaxos *collect = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COLLECT,
+ machine_id, ceph_clock_now(g_ceph_context));
collect->last_committed = last_committed;
collect->first_committed = first_committed;
collect->pn = accepted_pn;
@@ -124,13 +125,14 @@ void Paxos::handle_collect(MMonPaxos *collect)
// ok, accept it
accepted_pn = collect->pn;
accepted_pn_from = collect->pn_from;
- dout(10) << "accepting pn " << accepted_pn << " from " << accepted_pn_from << dendl;
+ dout(10) << "accepting pn " << accepted_pn << " from "
+ << accepted_pn_from << dendl;
mon->store->put_int(accepted_pn, machine_name, "accepted_pn");
} else {
// don't accept!
- dout(10) << "NOT accepting pn " << collect->pn << " from " << collect->pn_from
- << ", we already accepted " << accepted_pn << " from " << accepted_pn_from
- << dendl;
+ dout(10) << "NOT accepting pn " << collect->pn << " from " << collect->pn_from
+ << ", we already accepted " << accepted_pn
+ << " from " << accepted_pn_from << dendl;
}
last->pn = accepted_pn;
last->pn_from = accepted_pn_from;
@@ -145,8 +147,8 @@ void Paxos::handle_collect(MMonPaxos *collect)
if (mon->store->exists_bl_sn(machine_name, last_committed+1)) {
mon->store->get_bl_sn(bl, machine_name, last_committed+1);
assert(bl.length() > 0);
- dout(10) << " sharing our accepted but uncommitted value for " << last_committed+1
- << " (" << bl.length() << " bytes)" << dendl;
+ dout(10) << " sharing our accepted but uncommitted value for "
+ << last_committed+1 << " (" << bl.length() << " bytes)" << dendl;
last->values[last_committed+1] = bl;
last->uncommitted_pn = accepted_pn;
}
@@ -156,11 +158,13 @@ void Paxos::handle_collect(MMonPaxos *collect)
collect->put();
}
-void Paxos::share_state(MMonPaxos *m, version_t peer_first_committed, version_t peer_last_committed)
+void Paxos::share_state(MMonPaxos *m, version_t peer_first_committed,
+ version_t peer_last_committed)
{
assert(peer_last_committed < last_committed);
- dout(10) << "share_state peer has fc " << peer_first_committed << " lc " << peer_last_committed << dendl;
+ dout(10) << "share_state peer has fc " << peer_first_committed
+ << " lc " << peer_last_committed << dendl;
version_t v = peer_last_committed + 1;
// start with a stashed full copy?
@@ -168,16 +172,15 @@ void Paxos::share_state(MMonPaxos *m, version_t peer_first_committed, version_t
bufferlist bl;
version_t l = get_stashed(bl);
assert(l <= last_committed);
- dout(10) << "share_state starting with latest " << l << " (" << bl.length() << " bytes)" << dendl;
+ dout(10) << "share_state starting with latest " << l
+ << " (" << bl.length() << " bytes)" << dendl;
m->latest_value.claim(bl);
m->latest_version = l;
v = l;
}
// include incrementals
- for ( ;
- v <= last_committed;
- v++) {
+ for ( ; v <= last_committed; v++) {
if (mon->store->exists_bl_sn(machine_name, v)) {
mon->store->get_bl_sn(m->values[v], machine_name, v);
dout(10) << " sharing " << v << " ("
@@ -194,7 +197,8 @@ void Paxos::store_state(MMonPaxos *m)
// stash?
if (m->latest_version && m->latest_version > last_committed) {
- dout(10) << "store_state got stash version " << m->latest_version << ", zapping old states" << dendl;
+ dout(10) << "store_state got stash version "
+ << m->latest_version << ", zapping old states" << dendl;
assert(start != m->values.end() && start->first == m->latest_version);
@@ -220,13 +224,12 @@ void Paxos::store_state(MMonPaxos *m)
start = m->values.end();
}
- while (start != m->values.end() &&
- start->first <= last_committed)
+ while (start != m->values.end() && start->first <= last_committed) {
++start;
+ }
map<version_t,bufferlist>::iterator end = start;
- while (end != m->values.end() &&
- end->first <= m->last_committed) {
+ while (end != m->values.end() && end->first <= m->last_committed) {
last_committed = end->first;
if (!first_committed)
first_committed = last_committed;
@@ -236,7 +239,9 @@ void Paxos::store_state(MMonPaxos *m)
if (start == end) {
dout(10) << "store_state nothing to commit" << dendl;
} else {
- dout(10) << "store_state [" << start->first << ".." << last_committed << "]" << dendl;
+ dout(10) << "store_state [" << start->first << ".."
+ << last_committed << "]" << dendl;
+
mon->store->put_bl_sn_map(machine_name, start, end);
mon->store->put_int(last_committed, machine_name, "last_committed");
mon->store->put_int(first_committed, machine_name, "first_committed");
@@ -303,7 +308,8 @@ void Paxos::handle_last(MMonPaxos *last)
if (p->second < last_committed) {
// share committed values
dout(10) << " sending commit to mon." << p->first << dendl;
- MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id,
+ MMonPaxos *commit = new MMonPaxos(mon->get_epoch(),
+ MMonPaxos::OP_COMMIT, machine_id,
ceph_clock_now(g_ceph_context));
share_state(commit, peer_first_committed[p->first], p->second);
mon->messenger->send_message(commit, mon->monmap->get_inst(p->first));
@@ -383,15 +389,15 @@ void Paxos::begin(bufferlist& v)
return;
}
- // ask others to accept it to!
+ // ask others to accept it too!
for (set<int>::const_iterator p = mon->get_quorum().begin();
p != mon->get_quorum().end();
++p) {
if (*p == mon->rank) continue;
dout(10) << " sending begin to mon." << *p << dendl;
- MMonPaxos *begin = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_BEGIN, machine_id,
- ceph_clock_now(g_ceph_context));
+ MMonPaxos *begin = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_BEGIN,
+ machine_id, ceph_clock_now(g_ceph_context));
begin->values[last_committed+1] = new_value;
begin->last_committed = last_committed;
begin->pn = accepted_pn;
@@ -428,8 +434,8 @@ void Paxos::handle_begin(MMonPaxos *begin)
mon->store->put_bl_sn(begin->values[v], machine_name, v);
// reply
- MMonPaxos *accept = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_ACCEPT, machine_id,
- ceph_clock_now(g_ceph_context));
+ MMonPaxos *accept = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_ACCEPT,
+ machine_id, ceph_clock_now(g_ceph_context));
accept->pn = accepted_pn;
accept->last_committed = last_committed;
mon->messenger->send_message(accept, begin->get_source_inst());
@@ -525,8 +531,8 @@ void Paxos::commit()
if (*p == mon->rank) continue;
dout(10) << " sending commit to mon." << *p << dendl;
- MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id,
- ceph_clock_now(g_ceph_context));
+ MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT,
+ machine_id, ceph_clock_now(g_ceph_context));
commit->values[last_committed] = new_value;
commit->pn = accepted_pn;
commit->last_committed = last_committed;
@@ -539,8 +545,6 @@ void Paxos::commit()
}
-
-
void Paxos::handle_commit(MMonPaxos *commit)
{
dout(10) << "handle_commit on " << commit->last_committed << dendl;
@@ -569,15 +573,16 @@ void Paxos::extend_lease()
acked_lease.clear();
acked_lease.insert(mon->rank);
- dout(7) << "extend_lease now+" << g_conf->mon_lease << " (" << lease_expire << ")" << dendl;
+ dout(7) << "extend_lease now+" << g_conf->mon_lease
+ << " (" << lease_expire << ")" << dendl;
// bcast
for (set<int>::const_iterator p = mon->get_quorum().begin();
- p != mon->get_quorum().end();
- ++p) {
+ p != mon->get_quorum().end(); ++p) {
+
if (*p == mon->rank) continue;
- MMonPaxos *lease = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE, machine_id,
- ceph_clock_now(g_ceph_context));
+ MMonPaxos *lease = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE,
+ machine_id, ceph_clock_now(g_ceph_context));
lease->last_committed = last_committed;
lease->lease_timestamp = lease_expire;
lease->first_committed = first_committed;
@@ -588,7 +593,8 @@ void Paxos::extend_lease()
// if old timeout is still in place, leave it.
if (!lease_ack_timeout_event) {
lease_ack_timeout_event = new C_LeaseAckTimeout(this);
- mon->timer.add_event_after(g_conf->mon_lease_ack_timeout, lease_ack_timeout_event);
+ mon->timer.add_event_after(g_conf->mon_lease_ack_timeout,
+ lease_ack_timeout_event);
}
// set renew event
@@ -624,7 +630,8 @@ void Paxos::handle_lease(MMonPaxos *lease)
// sanity
if (!mon->is_peon() ||
last_committed != lease->last_committed) {
- dout(10) << "handle_lease i'm not a peon, or they're not the leader, or the last_committed doesn't match, dropping" << dendl;
+ dout(10) << "handle_lease i'm not a peon, or they're not the leader,"
+ << " or the last_committed doesn't match, dropping" << dendl;
lease->put();
return;
}
@@ -642,8 +649,8 @@ void Paxos::handle_lease(MMonPaxos *lease)
<< " now " << lease_expire << dendl;
// ack
- MMonPaxos *ack = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE_ACK, machine_id,
- ceph_clock_now(g_ceph_context));
+ MMonPaxos *ack = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE_ACK,
+ machine_id, ceph_clock_now(g_ceph_context));
ack->last_committed = last_committed;
ack->first_committed = first_committed;
ack->lease_timestamp = ceph_clock_now(g_ceph_context);
@@ -671,7 +678,8 @@ void Paxos::handle_lease_ack(MMonPaxos *ack)
int from = ack->get_source().num();
if (!lease_ack_timeout_event) {
- dout(10) << "handle_lease_ack from " << ack->get_source() << " -- stray (probably since revoked)" << dendl;
+ dout(10) << "handle_lease_ack from " << ack->get_source()
+ << " -- stray (probably since revoked)" << dendl;
}
else if (acked_lease.count(from) == 0) {
acked_lease.insert(from);
@@ -724,7 +732,6 @@ void Paxos::lease_renew_timeout()
}
-
/*
* trim old states
*/
diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h
index 8558fc9502f..71980aebc75 100644
--- a/src/mon/Paxos.h
+++ b/src/mon/Paxos.h
@@ -35,18 +35,6 @@ e 12v
*/
-
-/*
- * NOTE: This libary is based on the Paxos algorithm, but varies in a few key ways:
- * 1- Only a single new value is generated at a time, simplifying the recovery logic.
- * 2- Nodes track "committed" values, and share them generously (and trustingly)
- * 3- A 'leasing' mechism is built-in, allowing nodes to determine when it is safe to
- * "read" their copy of the last committed value.
- *
- * This provides a simple replication substrate that services can be built on top of.
- * See PaxosService.h
- */
-
#ifndef CEPH_MON_PAXOS_H
#define CEPH_MON_PAXOS_H
@@ -66,7 +54,24 @@ class Paxos;
// i am one state machine.
+/**
+ * This libary is based on the Paxos algorithm, but varies in a few key ways:
+ * 1- Only a single new value is generated at a time, simplifying the recovery logic.
+ * 2- Nodes track "committed" values, and share them generously (and trustingly)
+ * 3- A 'leasing' mechism is built-in, allowing nodes to determine when it is
+ * safe to "read" their copy of the last committed value.
+ *
+ * This provides a simple replication substrate that services can be built on top of.
+ * See PaxosService.h
+ */
class Paxos {
+ /**
+ * @defgroup Paxos_h_class Paxos
+ * @{
+ */
+ /**
+ * The Monitor to which this Paxos class is associated with.
+ */
Monitor *mon;
// my state machine info
@@ -82,9 +87,32 @@ class Paxos {
// -- generic state --
public:
- const static int STATE_RECOVERING = 1; // leader|peon: recovering paxos state
- const static int STATE_ACTIVE = 2; // leader|peon: idle. peon may or may not have valid lease
- const static int STATE_UPDATING = 3; // leader|peon: updating to new value
+ /**
+ * @defgroup Paxos_h_states States on which the leader/peon may be.
+ * @{
+ */
+ /**
+ * Leader/Peon is in Paxos' Recovery state
+ */
+ const static int STATE_RECOVERING = 1;
+ /**
+ * Leader/Peon is idle, and the Peon may or may not have a valid lease.
+ */
+ const static int STATE_ACTIVE = 2;
+ /**
+ * Leader/Peon is updating to a new value.
+ */
+ const static int STATE_UPDATING = 3;
+
+ /**
+ * Obtain state name from constant value.
+ *
+ * @note This function will raise a fatal error if @p s is not
+ * a valid state value.
+ *
+ * @param s State value.
+ * @return The state's name.
+ */
static const char *get_statename(int s) {
switch (s) {
case STATE_RECOVERING: return "recovering";
@@ -95,60 +123,328 @@ public:
}
private:
+ /**
+ * The state we are in.
+ */
int state;
+ /**
+ * @}
+ */
public:
+ /**
+ * Check if we are recovering.
+ *
+ * @return 'true' if we are on the Recovering state; 'false' otherwise.
+ */
bool is_recovering() const { return state == STATE_RECOVERING; }
+ /**
+ * Check if we are active.
+ *
+ * @return 'true' if we are on the Active state; 'false' otherwise.
+ */
bool is_active() const { return state == STATE_ACTIVE; }
+ /**
+ * Check if we are updating.
+ *
+ * @return 'true' if we are on the Updating state; 'false' otherwise.
+ */
bool is_updating() const { return state == STATE_UPDATING; }
private:
- // recovery (phase 1)
+ /**
+ * @defgroup Paxos_h_recovery_vars Common recovery-related member variables
+ * @note These variables are common to both the Leader and the Peons.
+ * @{
+ */
+ /**
+ *
+ */
version_t first_committed;
+ /**
+ * Last Proposal Number
+ *
+ * @todo Expand description
+ */
version_t last_pn;
+ /**
+ * Last committed value's version.
+ *
+ * On both the Leader and the Peons, this is the last value's version that
+ * was accepted by a given quorum and thus committed, that this instance
+ * knows about.
+ *
+ * @note It may not be the last committed value's version throughout the
+ * system. If we are a Peon, we may have not been part of the quorum
+ * that accepted the value, and for this very same reason we may still
+ * be a (couple of) version(s) behind, until we learn about the most
+ * recent version. This should only happen if we are not active (i.e.,
+ * part of the quorum), which should not happen if we are up, running
+ * and able to communicate with others -- thus able to be part of the
+ * monmap and trigger new elections.
+ */
version_t last_committed;
+ /**
+ * Last committed value's time.
+ *
+ * When the commit happened.
+ */
utime_t last_commit_time;
+ /**
+ * The last Proposal Number we have accepted.
+ *
+ * On the Leader, it will be the Proposal Number picked by the Leader
+ * itself. On the Peon, however, it will be the proposal sent by the Leader
+ * and it will only be updated iif its value is higher than the one
+ * already known by the Peon.
+ */
version_t accepted_pn;
+ /**
+ * @todo This has something to do with the last_committed version. Not sure
+ * about what it entails, tbh.
+ */
version_t accepted_pn_from;
- map<int,version_t> peer_first_committed, peer_last_committed;
+ /**
+ * @todo Check out if this map has any purpose at all. So far, we have only
+ * seen it being read, although it is never affected.
+ */
+ map<int,version_t> peer_first_committed;
+ /**
+ * Map holding the last committed version by each quorum member.
+ *
+ * The versions kept in this map are updated during the collect phase.
+ * When the Leader starts the collect phase, each Peon will reply with its
+ * last committed version, which will then be kept in this map.
+ */
+ map<int,version_t> peer_last_committed;
+ /**
+ * @todo Check out what 'slurping' is.
+ */
int slurping;
+ /**
+ * @}
+ */
// active (phase 2)
+ /**
+ * @defgroup Paxos_h_active_vars Common active-related member variables
+ * @{
+ */
+ /**
+ * When does our read lease expires.
+ *
+ * Instead of performing a full commit each time a read is requested, we
+ * keep leases. Each lease will have an expiration date, which may or may
+ * not be extended. This member variable will keep when is the lease
+ * expiring.
+ */
utime_t lease_expire;
+ /**
+ * List of callbacks waiting for our state to change into STATE_ACTIVE.
+ */
list<Context*> waiting_for_active;
+ /**
+ * List of callbacks waiting for the chance to read a version from us.
+ *
+ * Each entry on the list may result from an attempt to read a version that
+ * wasn't available at the time, or an attempt made during a period during
+ * which we could not satisfy the read request. The first case happens if
+ * the requested version is greater than our last committed version. The
+ * second scenario may happen if we are recovering, or if we don't have a
+ * valid lease.
+ *
+ * The list will be woken up once we change to STATE_ACTIVE with an extended
+ * lease -- which can be achieved if we have everyone on the quorum on board
+ * with the latest proposal, or if we don't really care about the remaining
+ * uncommitted values --, or if we're on a quorum of one.
+ */
list<Context*> waiting_for_readable;
+ /**
+ * Latest version written to the store after the latest commit.
+ */
version_t latest_stashed;
+ /**
+ * @}
+ */
// -- leader --
// recovery (paxos phase 1)
+ /**
+ * @defgroup Paxos_h_leader_recovery Leader-specific Recovery-related vars
+ * @{
+ */
+ /**
+ * Number of replies to the collect phase we've received so far.
+ *
+ * This variable is reset to 1 each time we start a collect phase; it is
+ * incremented each time we receive a reply to the collect message, and
+ * is used to determine whether or not we have received replies from the
+ * whole quorum.
+ */
unsigned num_last;
+ /**
+ * Uncommitted value's version.
+ *
+ * If we have, or end up knowing about, an uncommitted value, then its
+ * version will be kept in this variable.
+ *
+ * @note If this version equals @p last_committed+1 when we reach the final
+ * steps of recovery, then the algorithm will assume this is a value
+ * the Leader does not know about, and trustingly the Leader will
+ * propose this version's value.
+ */
version_t uncommitted_v;
+ /**
+ * Uncommitted value's Proposal Number.
+ *
+ * We use this variable to assess if the Leader should take into consideration
+ * an uncommitted value sent by a Peon. Given that the Peon will send back to
+ * the Leader the last Proposal Number he accepted, the Leader will be able
+ * to infer if this value is more recent than the one the Leader has, thus
+ * more relevant.
+ */
version_t uncommitted_pn;
+ /**
+ * Uncommitted Value.
+ *
+ * If the system fails in-between the accept replies from the Peons and the
+ * instruction to commit from the Leader, then we may end up with accepted
+ * but yet-uncommitted values. During the Leader's recovery, he will attempt
+ * to bring the whole system to the latest state, and that means committing
+ * past accepted but uncommitted values.
+ *
+ * This variable will hold an uncommitted value, which may originate either
+ * on the Leader, or learnt by the Leader from a Peon during the collect
+ * phase.
+ */
bufferlist uncommitted_value;
-
+ /**
+ * Used to specify when an on-going collect phase times out.
+ */
Context *collect_timeout_event;
+ /**
+ * @}
+ */
// active
+ /**
+ * @defgroup Paxos_h_leader_active Leader-specific Active-related vars
+ * @{
+ */
+ /**
+ * Set of participants (Leader & Peons) that have acked a lease extension.
+ *
+ * Each Peon that acknowledges a lease extension will have its place in this
+ * set, which will be used to account for all the acks from all the quorum
+ * members, guaranteeing that we trigger new elections if some don't ack in
+ * the expected timeframe.
+ */
set<int> acked_lease;
+ /**
+ * Callback responsible for extending the lease periodically.
+ */
Context *lease_renew_event;
+ /**
+ * Callback to trigger new elections once the time for acks is out.
+ */
Context *lease_ack_timeout_event;
+ /**
+ * @}
+ */
+ /**
+ * @defgroup Paxos_h_peon_active Peon-specific Active-related vars
+ * @{
+ */
+ /**
+ * Callback to trigger new elections when the Peon's lease times out.
+ *
+ * If the Peon's lease is extended, this callback will be reset (i.e.,
+ * we cancel the event and reschedule a new one with starting from the
+ * beginning).
+ */
Context *lease_timeout_event;
+ /**
+ * @}
+ */
// updating (paxos phase 2)
+ /**
+ * @defgroup Paxos_h_leader_updating Leader-specific Updating-related vars
+ * @{
+ */
+ /**
+ * New Value being proposed to the Peons.
+ *
+ * This bufferlist holds the value the Leader is proposing to the Peons, and
+ * that will be committed if the Peons do accept the proposal.
+ */
bufferlist new_value;
+ /**
+ * Set of participants (Leader & Peons) that accepted the new proposed value.
+ *
+ * This set is used to keep track of those who have accepted the proposed
+ * value, so the leader may know when to issue a commit (when a majority of
+ * participants has accepted the proposal), and when to extend the lease
+ * (when all the quorum members have accepted the proposal).
+ */
set<int> accepted;
-
+ /**
+ * Callback to trigger a new election if the proposal is not accepted by the
+ * full quorum within a given timeframe.
+ *
+ * If the full quorum does not accept the proposal, then it means that the
+ * Leader may no longer be recognized as the leader, or that the quorum has
+ * changed, and the value may have not reached all the participants. Thus,
+ * the leader must call new elections, and go through a recovery phase in
+ * order to propagate the new value throughout the system.
+ *
+ * This does not mean that we won't commit. We will commit as soon as we
+ * have a majority of acceptances. But if we do not have full acceptance
+ * from the quorum, then we cannot extend the lease, as some participants
+ * may not have the latest committed value.
+ */
Context *accept_timeout_event;
+ /**
+ * List of callbacks waiting for it to be possible to write again.
+ *
+ * @remarks It is not possible to write if we are not the Leader, or we are
+ * not on the active state, or if the lease has expired.
+ */
list<Context*> waiting_for_writeable;
+ /**
+ * List of callbacks waiting for a commit to finish.
+ *
+ * @remarks This may be used to a) wait for an on-going commit to finish
+ * before we proceed with, say, a new proposal; or b) wait for the
+ * next commit to be finished so we are sure that our value was
+ * fully committed.
+ */
list<Context*> waiting_for_commit;
+ /**
+ * @}
+ */
- //synchronization warnings
+ /**
+ * @defgroup Paxos_h_sync_warns Synchronization warnings
+ * @todo Describe these variables
+ * @{
+ */
utime_t last_clock_drift_warn;
int clock_drift_warned;
+ /**
+ * @}
+ */
+ /**
+ * @defgroup Paxos_h_callbacks Callback classes.
+ * @{
+ */
+ /**
+ * Callback class responsible for handling a Collect Timeout.
+ */
class C_CollectTimeout : public Context {
Paxos *paxos;
public:
@@ -158,6 +454,9 @@ private:
}
};
+ /**
+ * Callback class responsible for handling an Accept Timeout.
+ */
class C_AcceptTimeout : public Context {
Paxos *paxos;
public:
@@ -167,6 +466,9 @@ private:
}
};
+ /**
+ * Callback class responsible for handling a Lease Ack Timeout.
+ */
class C_LeaseAckTimeout : public Context {
Paxos *paxos;
public:
@@ -176,6 +478,9 @@ private:
}
};
+ /**
+ * Callback class responsible for handling a Lease Timeout.
+ */
class C_LeaseTimeout : public Context {
Paxos *paxos;
public:
@@ -185,6 +490,9 @@ private:
}
};
+ /**
+ * Callback class responsible for handling a Lease Renew Timeout.
+ */
class C_LeaseRenew : public Context {
Paxos *paxos;
public:
@@ -193,35 +501,336 @@ private:
paxos->lease_renew_timeout();
}
};
+ /**
+ * @}
+ */
-
+ /**
+ * @defgroup Paxos_h_election_triggered Steps triggered by an election.
+ *
+ * @note All these functions play a significant role in the Recovery Phase,
+ * which is triggered right after an election once someone becomes
+ * the Leader.
+ * @{
+ */
+ /**
+ * Create a new Proposal Number and propose it to the Peons.
+ *
+ * This function starts the Recovery Phase, which can be directly mapped
+ * onto the original Paxos' Prepare phase. Basically, we'll generate a
+ * Proposal Number, taking @p oldpn into consideration, and we will send
+ * it to a quorum, along with our first and last committed versions. By
+ * sending these informations in a message to the quorum, we expect to
+ * obtain acceptances from a majority, allowing us to commit, or be
+ * informed of a higher Proposal Number known by one or more of the Peons
+ * in the quorum.
+ *
+ * @pre We are the Leader.
+ * @post Recovery Phase initiated by sending messages to the quorum.
+ *
+ * @param oldpn A proposal number taken as the highest known so far, that
+ * should be taken into consideration when generating a new
+ * Proposal Number for the Recovery Phase.
+ */
void collect(version_t oldpn);
- void handle_collect(MMonPaxos*);
- void handle_last(MMonPaxos*);
+ /**
+ * Handle the reception of a collect message from the Leader and reply
+ * accordingly.
+ *
+ * Once a Peon receives a collect message from the Leader it will reply
+ * with its first and last committed versions, as well as informations so
+ * the Leader may know if his Proposal Number was, or was not, accepted by
+ * the Peon. The Peon will accept the Leader's Proposal Number iif it is
+ * higher than the Peon's currently accepted Proposal Number. The Peon may
+ * also inform the Leader of accepted but uncommitted values.
+ *
+ * @invariant The message is an operation of type OP_COLLECT.
+ * @pre We are a Peon.
+ * @post Replied to the Leader, accepting or not accepting his PN.
+ *
+ * @param collect The collect message sent by the Leader to the Peon.
+ */
+ void handle_collect(MMonPaxos *collect);
+ /**
+ * Handle a response from a Peon to the Leader's collect phase.
+ *
+ * The received message will state the Peon's last committed version, as
+ * well as its last proposal number. This will lead to one of the following
+ * scenarios: if the replied Proposal Number is equal to the one we proposed,
+ * then the Peon has accepted our proposal, and if all the Peons do accept
+ * our Proposal Number, then we are allowed to proceed with the commit;
+ * however, if a Peon replies with a higher Proposal Number, we assume he
+ * knows something we don't and the Leader will have to abort the current
+ * proposal in order to retry with the Proposal Number specified by the Peon.
+ * It may also occur that the Peon replied with a lower Proposal Number, in
+ * which case we assume it is a reply to an an older value and we'll simply
+ * drop it.
+ * This function will also check if the Peon replied with an accepted but
+ * yet uncommitted value. In this case, if its version is higher than our
+ * last committed value by one, we assume that the Peon knows a value from a
+ * previous proposal that has never been committed, and we should try to
+ * commit that value by proposing it next. On the other hand, if that is
+ * not the case, we'll assume it is an old, uncommitted value, we do not
+ * care about and we'll consider the system active by extending the leases.
+ *
+ * @invariant The message is an operation of type OP_LAST.
+ * @pre We are the Leader.
+ * @post We initiate a commit, or we retry with a higher Proposal Number,
+ * or we drop the message.
+ * @post We move from STATE_RECOVERING to STATE_ACTIVE.
+ *
+ * @param last The message sent by the Peon to the Leader.
+ */
+ void handle_last(MMonPaxos *last);
+ /**
+ * The Recovery Phase timed out, meaning that a significant part of the
+ * quorum does not believe we are the Leader, and we thus should trigger new
+ * elections.
+ *
+ * @pre We believe to be the Leader.
+ * @post Trigger new elections.
+ */
void collect_timeout();
+ /**
+ * @}
+ */
+ /**
+ * @defgroup Paxos_h_updating_funcs Functions used during the Updating State
+ *
+ * These functions may easily be mapped to the original Paxos Algorithm's
+ * phases.
+ *
+ * Taking into account the algorithm can be divided in 4 phases (Prepare,
+ * Promise, Accept Request and Accepted), we can easily map Paxos::begin to
+ * both the Prepare and Accept Request phases; the Paxos::handle_begin to
+ * the Promise phase; and the Paxos::handle_accept to the Accepted phase.
+ * @{
+ */
+ /**
+ * Start a new proposal with the intent of committing @p value.
+ *
+ * If we are alone on the system (i.e., a quorum of one), then we will
+ * simply commit the value, but if we are not alone, then we need to propose
+ * the value to the quorum.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_ACTIVE
+ * @post We commit, iif we are alone, or we send a message to each quorum
+ * member
+ * @post We are on STATE_ACTIVE, iif we are alone, or on
+ * STATE_UPDATING otherwise
+ *
+ * @param value The value being proposed to the quorum
+ */
void begin(bufferlist& value);
- void handle_begin(MMonPaxos*);
- void handle_accept(MMonPaxos*);
+ /**
+ * Accept or decline (by ignoring) a proposal from the Leader.
+ *
+ * We will decline the proposal (by ignoring it) if we have promised to
+ * accept a higher numbered proposal. If that is not the case, we will
+ * accept it and accordingly reply to the Leader.
+ *
+ * @pre We are a Peon
+ * @pre We are on STATE_ACTIVE
+ * @post We are on STATE_UPDATING iif we accept the Leader's proposal
+ * @post We send a reply message to the Leader iif we accept his proposal
+ *
+ * @invariant The received message is an operation of type OP_BEGIN
+ *
+ * @param begin The message sent by the Leader to the Peon during the
+ * Paxos::begin function
+ *
+ */
+ void handle_begin(MMonPaxos *begin);
+ /**
+ * Handle an Accept message sent by a Peon.
+ *
+ * In order to commit, the Leader has to receive accepts from a majority of
+ * the quorum. If that does happen, then the Leader may proceed with the
+ * commit. However, the Leader needs the accepts from all the quorum members
+ * in order to extend the lease and move on to STATE_ACTIVE.
+ *
+ * This function handles these two situations, accounting for the amount of
+ * received accepts.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_UPDATING
+ * @post We are on STATE_ACTIVE iif we received accepts from the full quorum
+ * @post We extended the lease iif we moved on to STATE_ACTIVE
+ * @post We are on STATE_UPDATING iif we didn't received accepts from the
+ * full quorum
+ * @post We have committed iif we received accepts from a majority
+ *
+ * @invariant The received message is an operation of type OP_ACCEPT
+ *
+ * @param accept The message sent by the Peons to the Leader during the
+ * Paxos::handle_begin function
+ */
+ void handle_accept(MMonPaxos *accept);
+ /**
+ * Trigger a fresh election.
+ *
+ * During Paxos::begin we set a Callback of type Paxos::C_AcceptTimeout in
+ * order to limit the amount of time we spend waiting for Accept replies.
+ * This callback will call Paxos::accept_timeout when it is fired.
+ *
+ * This is essential to the algorithm because there may be the chance that
+ * we are no longer the Leader (i.e., others don't believe in us) and we
+ * are getting ignored, or we dropped out of the quorum and haven't realised
+ * it. So, our only option is to trigger fresh elections.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_UPDATING
+ * @post Triggered fresh elections
+ */
void accept_timeout();
+ /**
+ * @}
+ */
+ /**
+ * Commit a value throughout the system.
+ *
+ * The Leader will cancel the current lease (as it was for the old value),
+ * and will store the committed value locally. It will then instruct every
+ * quorum member to do so as well.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_UPDATING
+ * @pre A majority of quorum members accepted our proposal
+ * @post Value locally stored
+ * @post Quorum members instructed to commit the new value.
+ */
void commit();
- void handle_commit(MMonPaxos*);
+ /**
+ * Commit the new value to stable storage as being the latest available
+ * version.
+ *
+ * @pre We are a Peon
+ * @post The new value is locally stored
+ * @post Fire up the callbacks waiting on waiting_for_commit
+ *
+ * @invariant The received message is an operation of type OP_COMMIT
+ *
+ * @param commit The message sent by the Leader to the Peon during
+ * Paxos::commit
+ */
+ void handle_commit(MMonPaxos *commit);
+ /**
+ * Extend the system's lease.
+ *
+ * This means that the Leader considers that it should now safe to read from
+ * any node on the system, since every quorum member is now in possession of
+ * the latest version. Therefore, the Leader will send a message stating just
+ * this to each quorum member, and will impose a limited timeframe during
+ * which acks will be accepted. If there aren't as many acks as expected
+ * (i.e, if at least one quorum member does not ack the lease) during this
+ * timeframe, then we will force fresh elections.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_ACTIVE
+ * @post A message extending the lease is sent to each quorum member
+ * @post A timeout callback is set to limit the amount of time we will wait
+ * for lease acks.
+ * @post A timer is set in order to renew the lease after a certain amount
+ * of time.
+ */
void extend_lease();
- void handle_lease(MMonPaxos*);
- void handle_lease_ack(MMonPaxos*);
-
- void lease_ack_timeout(); // on leader, if lease isn't acked by all peons
- void lease_renew_timeout(); // on leader, to renew the lease
+ /**
+ * Update the lease on the Peon's side of things.
+ *
+ * Once a Peon receives a Lease message, it will update its lease_expire
+ * variable, reply to the Leader acknowledging the lease update and set a
+ * timeout callback to be fired upon the lease's expiration. Finally, the
+ * Peon will fire up all the callbacks waiting for it to become active,
+ * which it just did, and all those waiting for it to become readable,
+ * which should be true if the Peon's lease didn't expire in the mean time.
+ *
+ * @pre We are a Peon
+ * @post We update the lease accordingly
+ * @post A lease timeout callback is set
+ * @post Move to STATE_ACTIVE
+ * @post Fire up all the callbacks waiting for STATE_ACTIVE
+ * @post Fire up all the callbacks waiting for readable iif we are readable
+ * @post Ack the lease to the Leader
+ *
+ * @invariant The received message is an operation of type OP_LEASE
+ *
+ * @param The message sent by the Leader to the Peon during the
+ * Paxos::extend_lease function
+ */
+ void handle_lease(MMonPaxos *lease);
+ /**
+ * Account for all the Lease Acks the Leader receives from the Peons.
+ *
+ * Once the Leader receives all the Lease Acks from the Peons, it will be
+ * able to cancel the Lease Ack timeout callback, thus avoiding calling
+ * fresh elections.
+ *
+ * @pre We are the Leader
+ * @post Cancel the Lease Ack timeout callback iif we receive acks from all
+ * the quorum members
+ *
+ * @invariant The received message is an operation of type OP_LEASE_ACK
+ *
+ * @param ack The message sent by a Peon to the Leader during the
+ * Paxos::handle_lease function
+ */
+ void handle_lease_ack(MMonPaxos *ack);
+ /**
+ * Call fresh elections because at least one Peon didn't acked our lease.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_ACTIVE
+ * @post Trigger fresh elections
+ */
+ void lease_ack_timeout();
+ /**
+ * Extend lease since we haven't had new committed values meanwhile.
+ *
+ * @pre We are the Leader
+ * @pre We are on STATE_ACTIVE
+ * @post Go through with Paxos::extend_lease
+ */
+ void lease_renew_timeout();
+ /**
+ * Call fresh elections because the Peon's lease expired without being
+ * renewed or receiving a fresh lease.
+ *
+ * This means that the Peon is no longer assumed as being in the quorum
+ * (or there is no Leader to speak of), so just trigger fresh elections
+ * to circumvent this issue.
+ *
+ * @pre We are a Peon
+ * @post Trigger fresh elections
+ */
void lease_timeout(); // on peon, if lease isn't extended
+ /**
+ * Cancel all of Paxos' timeout/renew events.
+ */
void cancel_events();
+ /**
+ * Generate a new Proposal Number based on @p gt
+ *
+ * @todo Check what @p gt actually means and what its usage entails
+ * @param gt A hint for the geration of the Proposal Number
+ * @return A globally unique, monotonically increasing Proposal Number
+ */
version_t get_new_proposal_number(version_t gt=0);
-
+
+ /**
+ * @todo document sync function
+ */
void warn_on_future_time(utime_t t, entity_name_t from);
public:
+ /**
+ * @param m A monitor
+ * @param mid A machine id
+ */
Paxos(Monitor *m,
int mid) : mon(m),
machine_id(mid),
@@ -256,63 +865,253 @@ public:
bool is_consistent();
void restart();
+ /**
+ * Initiate the Leader after it wins an election.
+ *
+ * Once an election is won, the Leader will be initiated and there are two
+ * possible outcomes of this method: the Leader directly jumps to the active
+ * state (STATE_ACTIVE) if it believes to be the only one in the quorum, or
+ * will start recovering (STATE_RECOVERING) by initiating the collect phase.
+ *
+ * @pre Our monitor is the Leader.
+ * @post We are either on STATE_ACTIVE if we're the only one in the quorum,
+ * or on STATE_RECOVERING otherwise.
+ */
void leader_init();
+ /**
+ * Initiate a Peon after it loses an election.
+ *
+ * If we are a Peon, then there must be a Leader and we are not alone in the
+ * quorum, thus automatically assume we are on STATE_RECOVERING, which means
+ * we will soon be enrolled into the Leader's collect phase.
+ *
+ * @pre There is a Leader, and he's about to start the collect phase.
+ * @post We are on STATE_RECOVERING and will soon receive collect phase's
+ * messages.
+ */
void peon_init();
- void share_state(MMonPaxos *m, version_t first_committed, version_t last_committed);
+ /**
+ * Include an incremental state of values, ranging from peer_first_committed
+ * to the last committed value, on the message m
+ *
+ * @param m A message
+ * @param peer_first_committed Lowest version to take into account
+ * @param peer_last_committed Highest version to take into account
+ */
+ void share_state(MMonPaxos *m, version_t peer_first_committed,
+ version_t peer_last_committed);
+ /**
+ * Store the state held on the message m into local, stable storage.
+ *
+ * @param m A message
+ */
void store_state(MMonPaxos *m);
+ /**
+ * @todo This appears to be used only by the OSDMonitor, and I would say
+ * its objective is to allow a third-party to have a "private"
+ * state dir. -JL
+ */
void add_extra_state_dir(string s) {
extra_state_dirs.push_back(s);
}
// -- service interface --
+ /**
+ * Add c to the list of callbacks waiting for us to become active.
+ *
+ * @param c A callback
+ */
void wait_for_active(Context *c) {
waiting_for_active.push_back(c);
}
+ /**
+ * Erase old states from stable storage.
+ *
+ * @param first The version we are trimming to
+ * @param force If specified, we may even erase the latest stashed version
+ * iif @p first is higher than that version.
+ */
void trim_to(version_t first, bool force=false);
-
+
+ /**
+ * @defgroup Paxos_h_slurping_funcs Slurping-related functions
+ * @todo Discover what slurping is
+ * @{
+ */
void start_slurping();
void end_slurping();
bool is_slurping() { return slurping == 1; }
+ /**
+ * @}
+ */
// read
+ /**
+ * @defgroup Paxos_h_read_funcs Read-related functions
+ * @{
+ */
+ /**
+ * Get latest committed version
+ *
+ * @return latest committed version
+ */
version_t get_version() { return last_committed; }
+ /**
+ * Get first committed version
+ *
+ * @return the first committed version
+ */
+ version_t get_first_committed() { return first_committed; }
+ /**
+ * Check if a given version is readable.
+ *
+ * A version may not be readable for a myriad of reasons:
+ * @li the version @v is higher that the last committed version
+ * @li we are not the Leader nor a Peon (election may be on-going)
+ * @li we do not have a committed value yet
+ * @li we do not have a valid lease
+ *
+ * @param seen The version we want to check if it is readable.
+ * @return 'true' if the version is readable; 'false' otherwise.
+ */
bool is_readable(version_t seen=0);
+ /**
+ * Read version @v and store its value in @bl
+ *
+ * @param[in] v The version we want to read
+ * @param[out] bl The version's value
+ * @return 'true' if we successfully read the value; 'false' otherwise
+ */
bool read(version_t v, bufferlist &bl);
+ /**
+ * Read the latest committed version
+ *
+ * @param[out] bl The version's value
+ * @return the latest committed version if we successfully read the value;
+ * or 0 (zero) otherwise.
+ */
version_t read_current(bufferlist &bl);
+ /**
+ * Add onreadable to the list of callbacks waiting for us to become readable.
+ *
+ * @param onreadable A callback
+ */
void wait_for_readable(Context *onreadable) {
//assert(!is_readable());
waiting_for_readable.push_back(onreadable);
}
+ /**
+ * @}
+ */
- // write
+ /**
+ * @warning This declaration is not implemented anywhere and appears to be
+ * just some lingering code.
+ */
bool is_leader();
+ // write
+ /**
+ * @defgroup Paxos_h_write_funcs Write-related functions
+ * @{
+ */
+ /**
+ * Check if we are writeable.
+ *
+ * We are writeable if we are alone (i.e., a quorum of one), or if we match
+ * all the following conditions:
+ * @li We are the Leader
+ * @li We are on STATE_ACTIVE
+ * @li We have a valid lease
+ *
+ * @return 'true' if we are writeable; 'false' otherwise.
+ */
bool is_writeable();
+ /**
+ * Add c to the list of callbacks waiting for us to become writeable.
+ *
+ * @param c A callback
+ */
void wait_for_writeable(Context *c) {
assert(!is_writeable());
waiting_for_writeable.push_back(c);
}
+ /**
+ * Propose a new value to the Leader.
+ *
+ * This function enables the submission of a new value to the Leader, which
+ * will trigger a new proposal.
+ *
+ * @param bl A bufferlist holding the value to be proposed
+ * @param oncommit A callback to be fired up once we finish committing bl
+ */
bool propose_new_value(bufferlist& bl, Context *oncommit=0);
+ /**
+ * Add oncommit to the back of the list of callbacks waiting for us to
+ * finish committing.
+ *
+ * @param oncommit A callback
+ */
void wait_for_commit(Context *oncommit) {
waiting_for_commit.push_back(oncommit);
}
+ /**
+ * Add oncommit to the front of the list of callbacks waiting for us to
+ * finish committing.
+ *
+ * @param oncommit A callback
+ */
void wait_for_commit_front(Context *oncommit) {
waiting_for_commit.push_front(oncommit);
}
+ /**
+ * @}
+ */
- // if state values are incrementals, it is usefult to keep
- // the latest copy of the complete structure.
+ /**
+ * @defgroup Paxos_h_stash_funcs State values stashing-related functions
+ *
+ * If the state values are incrementals, it is useful to keep the latest
+ * copy of the complete structure.
+ *
+ * @{
+ */
+ /**
+ * Get the latest version onto stable storage.
+ *
+ * Keeping the latest version on a predefined location makes it easier to
+ * access, since we know we always have the latest version on the same
+ * place.
+ *
+ * @param v the latest version
+ * @param bl the latest version's value
+ */
void stash_latest(version_t v, bufferlist& bl);
+ /**
+ * Get the latest stashed version's value
+ *
+ * @param[out] bl the latest stashed version's value
+ * @return the latest stashed version
+ */
version_t get_stashed(bufferlist& bl);
+ /**
+ * Get the latest stashed version
+ *
+ * @return the latest stashed version
+ */
version_t get_stashed_version() { return latest_stashed; }
+ /**
+ * @}
+ */
- version_t get_first_committed() { return first_committed; }
+ /**
+ * @}
+ */
};
-
#endif
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index 812ae996a17..38fbceab221 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -104,6 +104,7 @@ bool PaxosService::should_propose(double& delay)
return true;
}
+
void PaxosService::propose_pending()
{
dout(10) << "propose_pending" << dendl;
@@ -115,7 +116,14 @@ void PaxosService::propose_pending()
proposal_timer = 0;
}
- // finish and encode
+ /**
+ * @note The value we propose is encoded in a bufferlist, passed to
+ * Paxos::propose_new_value and it is obtained by calling a
+ * function that must be implemented by the class implementing us.
+ * I.e., the function encode_pending will be the one responsible
+ * to encode whatever is pending on the implementation class into a
+ * bufferlist, so we can then propose that as a value through Paxos.
+ */
bufferlist bl;
encode_pending(bl);
have_pending = false;
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
index e8c7ca41d85..2cb59a3d61a 100644
--- a/src/mon/PaxosService.h
+++ b/src/mon/PaxosService.h
@@ -21,12 +21,42 @@
class Monitor;
class Paxos;
+/**
+ * A Paxos Service is an abstraction that easily allows one to obtain an
+ * association between a Monitor and a Paxos class, in order to implement any
+ * service.
+ */
class PaxosService {
+ /**
+ * @defgroup PaxosService_h_class Paxos Service
+ * @{
+ */
public:
+ /**
+ * The Monitor to which this class is associated with
+ */
Monitor *mon;
+ /**
+ * The Paxos instance to which this class is associated with
+ */
Paxos *paxos;
protected:
+ /**
+ * @defgroup PaxosService_h_callbacks Callback classes
+ * @{
+ */
+ /**
+ * Retry dispatching a given service message
+ *
+ * This callback class is used when we had to wait for some condition to
+ * become true while we were dispatching it.
+ *
+ * For instance, if the message's version isn't readable, according to Paxos,
+ * then we must wait for it to become readable. So, we just queue an
+ * instance of this class onto the Paxos::wait_for_readable function, and
+ * we will retry the whole dispatch again once the callback is fired.
+ */
class C_RetryMessage : public Context {
PaxosService *svc;
PaxosServiceMessage *m;
@@ -36,6 +66,15 @@ protected:
svc->dispatch(m);
}
};
+
+ /**
+ * Callback used to make sure we call the PaxosService::_active function
+ * whenever a condition is fulfilled.
+ *
+ * This is used in multiple situations, from waiting for the Paxos to commit
+ * our proposed value, to waiting for the Paxos to become active once an
+ * election is finished.
+ */
class C_Active : public Context {
PaxosService *svc;
public:
@@ -46,6 +85,10 @@ protected:
}
};
+ /**
+ * Callback class used to propose the pending value once the proposal_timer
+ * fires up.
+ */
class C_Propose : public Context {
PaxosService *ps;
public:
@@ -54,112 +97,235 @@ protected:
ps->proposal_timer = 0;
ps->propose_pending();
}
- };
+ };
+ /**
+ * @}
+ */
friend class C_Propose;
private:
+ /**
+ * Event callback responsible for proposing our pending value once a timer
+ * runs out and fires.
+ */
Context *proposal_timer;
+ /**
+ * If the implementation class has anything pending to be proposed to Paxos,
+ * then have_pending should be true; otherwise, false.
+ */
bool have_pending;
public:
+ /**
+ * @param mn A Monitor instance
+ * @param p A Paxos instance
+ */
PaxosService(Monitor *mn, Paxos *p) : mon(mn), paxos(p),
proposal_timer(0),
have_pending(false) { }
virtual ~PaxosService() {}
+ /**
+ * Get the machine name.
+ *
+ * @returns The machine name.
+ */
const char *get_machine_name();
// i implement and you ignore
+ /**
+ * Informs this instance that it should consider itself restarted.
+ *
+ * This means that we will cancel our proposal_timer event, if any exists.
+ */
void restart();
+ /**
+ * Informs this instance that an election has finished.
+ *
+ * This means that we will invoke a PaxosService::discard_pending while
+ * setting have_pending to false (basically, ignore our pending state) and
+ * we will then make sure we obtain a new state.
+ *
+ * Our state shall be updated by PaxosService::_active if the Paxos is
+ * active; otherwise, we will wait for it to become active by adding a
+ * PaxosService::C_Active callback to it.
+ */
void election_finished();
+ /**
+ * Informs this instance that it is supposed to shutdown.
+ *
+ * Basically, it will instruct Paxos to cancel all events/callbacks and then
+ * will cancel the proposal_timer event if any exists.
+ */
void shutdown();
private:
+ /**
+ * Update our state by updating it from Paxos, and then creating a new
+ * pending state if need be.
+ *
+ * @remarks We only create a pending state we our Monitor is the Leader.
+ *
+ * @pre Paxos is active
+ * @post have_pending is true iif our Monitor is the Leader and Paxos is
+ * active
+ */
void _active();
public:
- // i implement and you use
- void propose_pending(); // propose current pending as new paxos state
+ /**
+ * Propose a new value through Paxos.
+ *
+ * This function should be called by the classes implementing
+ * PaxosService, in order to propose a new value through Paxos.
+ *
+ * @pre The implementation class implements the encode_pending function.
+ * @pre have_pending is true
+ * @pre Our monitor is the Leader
+ * @pre Paxos is active
+ * @post Cancel the proposal timer, if any
+ * @post have_pending is false
+ * @post propose pending value through Paxos
+ *
+ * @note This function depends on the implementation of encode_pending on
+ * the class that is implementing PaxosService
+ */
+ void propose_pending();
+ /**
+ * Dispatch a message by passing it to several different functions that are
+ * either implemented directly by this service, or that should be implemented
+ * by the class implementing this service.
+ *
+ * @param m A message
+ * @returns 'true' on successful dispatch; 'false' otherwise.
+ */
bool dispatch(PaxosServiceMessage *m);
- // you implement
- /*
+ /**
+ * @defgroup PaxosService_h_override_funcs Functions that should be
+ * overridden.
+ *
+ * These functions should be overridden at will by the class implementing
+ * this service.
+ * @{
+ */
+ /**
* Create the initial state for your system.
- * In some of ours the state is actually set up
- * elsewhere so this does nothing.
+ *
+ * In some of ours the state is actually set up elsewhere so this does
+ * nothing.
*/
virtual void create_initial() = 0;
- /*
- * Query the Paxos system for the latest state and apply it if
- * it's newer than the current Monitor state.
- * Return true on success.
+ /**
+ * Query the Paxos system for the latest state and apply it if it's newer
+ * than the current Monitor state.
+ *
+ * @returns 'true' on success; 'false' otherwise.
*/
virtual void update_from_paxos() = 0;
- /*
- * This function is only called on a leader. Create the pending state.
- * (this created state is then modified by incoming messages).
- * Called at startup and after every Paxos ratification round.
+ /**
+ * Create the pending state.
+ *
+ * @invariant This function is only called on a Leader.
+ * @remarks This created state is then modified by incoming messages.
+ * @remarks Called at startup and after every Paxos ratification round.
*/
virtual void create_pending() = 0;
- /*
- * This function is only called on a leader. Encode the pending state
- * into a bufferlist for ratification and transmission
- * as the next state.
+ /**
+ * Encode the pending state into a bufferlist for ratification and
+ * transmission as the next state.
+ *
+ * @invariant This function is only called on a Leader.
+ *
+ * @param[out] bl A bufferlist containing the encoded pending state
*/
virtual void encode_pending(bufferlist& bl) = 0;
- /*
- * As this function is NOT overridden in any of our code,
- * but it is called in election_finished if have_pending.
+ /**
+ * Discard the pending state
+ *
+ * @invariant This function is only called on a Leader.
+ *
+ * @remarks This function is NOT overridden in any of our code, but it is
+ * called in PaxosService::election_finished if have_pending is
+ * true.
*/
- virtual void discard_pending() { } // [leader] discard pending
+ virtual void discard_pending() { }
- /*
- * Look at the query; if the query can be handled without changing
- * state, do so.
- * Return true if the query was handled (ie, was a read that got answered,
- * was a state change that has no effect), false otherwise.
+ /**
+ * Look at the query; if the query can be handled without changing state,
+ * do so.
+ *
+ * @param m A query message
+ * @returns 'true' if the query was handled (e.g., was a read that got
+ * answered, was a state change that has no effect); 'false'
+ * otherwise.
*/
virtual bool preprocess_query(PaxosServiceMessage *m) = 0;
- /*
- * This function is only called on the leader. Apply the message
- * to the pending state.
+ /**
+ * Apply the message to the pending state.
+ *
+ * @invariant This function is only called on a Leader.
+ *
+ * @param m An update message
+ * @returns 'true' if the update message was handled (e.g., a command that
+ * went through); 'false' otherwise.
*/
virtual bool prepare_update(PaxosServiceMessage *m) = 0;
+ /**
+ * @}
+ */
- /*
- * Determine if the Paxos system should vote on pending, and
- * if so how long it should wait to vote.
- * Returns true if the Paxos system should propose,
- * and fills in the delay paramater with the wait time
- * (so you can limit update traffic spamming).
+ /**
+ * Determine if the Paxos system should vote on pending, and if so how long
+ * it should wait to vote.
+ *
+ * @param[out] delay The wait time, used so we can limit the update traffic
+ * spamming.
+ * @returns 'true' if the Paxos system should propose; 'false' otherwise.
*/
virtual bool should_propose(double &delay);
- /*
+ /**
+ * @defgroup PaxosService_h_courtesy Courtesy functions
+ *
+ * Courtesy functions, in case the class implementing this service has
+ * anything it wants/needs to do at these times.
+ * @{
+ */
+ /**
* This is called when the Paxos state goes to active.
- * It's a courtesy method if you have things you want/need
- * to do at that time.
*
- * Note that is may get called twice in certain recovery cases.
+ * @remarks It's a courtesy method, in case the class implementing this
+ * service has anything it wants/needs to do at that time.
+ *
+ * @note This function may get called twice in certain recovery cases.
*/
virtual void on_active() { }
- /*
- * Another courtesy method. Called when the Paxos
- * system enters a leader election.
+ /**
+ * Called when the Paxos system enters a Leader election.
+ *
+ * @remarks It's a courtesy method, in case the class implementing this
+ * service has anything it wants/needs to do at that time.
*/
virtual void on_restart() { }
+ /**
+ * @}
+ */
+ /**
+ * Tick.
+ */
virtual void tick() {}
/**
- * get health information
+ * Get health information
*
* @param summary list of summary strings and associated severity
* @param detail optional list of detailed problem reports; may be NULL
@@ -167,6 +333,9 @@ public:
virtual void get_health(list<pair<health_status_t,string> >& summary,
list<pair<health_status_t,string> > *detail) const { }
+ /**
+ * @}
+ */
};
#endif