summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Sage Weil <sage@inktank.com> 2013-07-08 15:04:59 -0700
committer Sage Weil <sage@inktank.com> 2013-07-08 16:11:51 -0700
commit 7fb3804fb860dcd0340dd3f7c39eec4315f8e4b6 (patch)
tree f314ff52a48404e432f178f2fe81d0cf1fa35bea
parent 24f90b832c695ef13021db66a178c18369ac356d (diff)
download ceph-7fb3804fb860dcd0340dd3f7c39eec4315f8e4b6.tar.gz
mon: fix osdmap stash, trim to retain complete history of full maps
The current interaction between sync and stashing full osdmaps only on active mons means that a sync can result in an incomplete osdmap_full history: - mon.c starts a full sync - during sync, active osdmap service should_stash_full() is true and includes a full in the txn - mon.c sync finishes - mon.c update_from_paxos gets "latest" stashed that it got from the paxos txn - mon.c does *not* walk to previous inc maps to complete its collection of full maps. To fix this, we disable the periodic/random stash of full maps by the osdmap service. This introduces a new problem: we must have at least one full map (the first one) in order for a mon that just synced to build its full collection. Extend the encode_trim() process to allow the osdmap service to include the oldest full map with the trim txn. This is more complex than just writing the full maps in the txn, but cheaper--we only write the full map at trim time. This *might* be related to previous bugs where the full osdmap was missing, or cases where leveldb keys seemed to 'disappear'. Fixes: #5512 Backport: cuttlefish Signed-off-by: Sage Weil <sage@inktank.com> Reviewed-by: Greg Farnum <greg@inktank.com> (cherry picked from commit afd6c7d8247075003e5be439ad59976c3d123218)
-rw-r--r--src/mon/OSDMonitor.cc26
-rw-r--r--src/mon/OSDMonitor.h17
-rw-r--r--src/mon/PaxosService.cc3
-rw-r--r--src/mon/PaxosService.h21
4 files changed, 47 insertions, 20 deletions
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 6ad0206c677..857fca530c3 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -117,19 +117,13 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
<< ", my e " << osdmap.epoch << dendl;
- /* We no longer have stashed versions. Maybe we can do this by reading
- * from a full map? Maybe we should keep the last full map version on a key
- * as well (say, osdmap_full_version), and consider that the last_committed
- * always contains incrementals, and maybe a full version if
- * osdmap_full_version == last_committed
- *
- * This ^^^^ sounds about right. Do it. We should then change the
- * 'get_stashed_version()' to 'get_full_version(version_t ver)', which should
- * then be read iif
- * (osdmap.epoch != osd_full_version)
- * && (osdmap.epoch <= osdmap_full_version)
+ /*
+ * We will possibly have a stashed latest that *we* wrote, and we will
+ * always be sure to have the oldest full map in the first..last range
+ * due to encode_trim_extra(), which includes the oldest full map in the trim
+ * transaction. Start with whichever is newer.
*/
- version_t latest_full = get_version_latest_full();
+ version_t latest_full = MAX(get_version_latest_full(), get_first_committed());
if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
bufferlist latest_bl;
get_version_full(latest_full, latest_bl);
@@ -557,6 +551,14 @@ void OSDMonitor::update_trim()
}
}
+void OSDMonitor::encode_trim_extra(MonitorDBStore::Transaction *tx, version_t first)
+{
+ dout(10) << __func__ << " including full map for e " << first << dendl;
+ bufferlist bl;
+ get_version_full(first, bl);
+ put_version_full(tx, first, bl);
+}
+
bool OSDMonitor::service_should_trim()
{
update_trim();
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index 494a66cafe0..98fb5954ae5 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -151,6 +151,23 @@ private:
virtual void encode_full(MonitorDBStore::Transaction *t);
void on_active();
+ /**
+ * do not let paxosservice periodically stash full osdmaps, or we will break our
+ * locally-managed full maps. (update_from_paxos loads the latest and writes them
+ * out going forward from there, but if we just synced that may mean we skip some.)
+ */
+ virtual bool should_stash_full() {
+ return false;
+ }
+
+ /**
+ * hook into trim to include the oldest full map in the trim transaction
+ *
+ * This ensures that anyone post-sync will have enough to rebuild their
+ * full osdmaps.
+ */
+ void encode_trim_extra(MonitorDBStore::Transaction *tx, version_t first);
+
void update_msgr_features();
void share_map_with_random_osd();
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index e897aa79e15..803853cdeab 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -374,6 +374,9 @@ void PaxosService::encode_trim(MonitorDBStore::Transaction *t)
trim(t, first_committed, trim_to_max);
put_first_committed(t, trim_to_max);
+ // let the service add any extra stuff
+ encode_trim_extra(t, trim_to_max);
+
if (trim_to_max == trim_to)
set_trim_to(0);
}
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
index 5ede4adf020..1b994dcbe84 100644
--- a/src/mon/PaxosService.h
+++ b/src/mon/PaxosService.h
@@ -649,21 +649,26 @@ public:
*/
void trim(MonitorDBStore::Transaction *t, version_t from, version_t to);
/**
- * Trim our log. This implies getting rid of versions on the k/v store.
- * Services implementing us don't have to implement this function if they
- * don't want to, but we won't implement it for them either.
+ * Trim our log
*
- * This function had to be inheritted from the Paxos, since the existing
- * services made use of it. This function should be tuned for each service's
- * needs. We have it in this interface to make sure its usage and purpose is
- * well understood by the underlying services.
+ * Will call encode_trim_extra(), allowing services to add
+ * additional bits to the trim transaction.
*
* @param first The version that should become the first one in the log.
* @param force Optional. Each service may use it as it sees fit, but the
* expected behavior is that, when 'true', we will remove all
* the log versions even if we don't have a full map in store.
*/
- virtual void encode_trim(MonitorDBStore::Transaction *t);
+ void encode_trim(MonitorDBStore::Transaction *t);
+
+ /**
+ * encode service-specific extra bits into trim transaction
+ *
+ * @param tx transaction
+ * @param first new first_committed value
+ */
+ virtual void encode_trim_extra(MonitorDBStore::Transaction *tx, version_t first) {}
+
/**
*
*/