summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSamuel Just <sam.just@inktank.com>2012-11-21 13:59:22 -0800
committerSamuel Just <sam.just@inktank.com>2012-11-29 13:51:52 -0800
commite2c4e2f63bfc138f1d122012ddf9731c8fa04758 (patch)
treef6d62f462c08709e839f60d988f775d3a88f4a55
parent82517f1bf07809b820bca3f80473acf7fd89d68e (diff)
downloadceph-e2c4e2f63bfc138f1d122012ddf9731c8fa04758.tar.gz
PG: maintain osd local last_epoch_started for find_best_info
In order to proceed with peering, we need an osd with a log including the last commit sent to a client. This translates to the oldest last_update from the infos of the most recent acting set to go active. history.last_epoch_started gives us a lower bound on the last time the entire acting set persisted authoritative logs/infos. However, it doesn't indicate anything about the info/log on the osd which sent it. Thus, we will maintain an osd local info.last_epoch_started to determine which osds were actually active (and thus have the required log entries). The max info.last_epoch_started in the prior set gives us an upper bound on the last interval during which writes occurred. The min last_update among the infos with that last_epoch_started must therefore be an upper bound on the oldest operation which clients consider committed. Any osd with an info.last_update past that version must be sufficient. The observed bug was that there was an empty pg info with a last_epoch_started at the most recent interval, which pushed min_last_update_acceptable to eversion_t(). There were two down osds, but peering proceeded since the backfill peer did survive. However, its info was later disregarded due to being incomplete. An empty osd was then chosen as the best_info since its last_update was equal to min_last_update_acceptable. This caused the contents of the pg to be lost. Signed-off-by: Samuel Just <sam.just@inktank.com>
-rw-r--r--src/osd/OSD.cc1
-rw-r--r--src/osd/PG.cc21
-rw-r--r--src/osd/osd_types.cc11
-rw-r--r--src/osd/osd_types.h6
4 files changed, 30 insertions, 9 deletions
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 3dfae663ba2..ea936b3f757 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -4505,6 +4505,7 @@ void OSD::handle_pg_create(OpRequestRef op)
0, creating_pgs[pgid].acting, creating_pgs[pgid].acting,
history, pi,
*rctx.transaction);
+ pg->info.last_epoch_started = pg->info.history.last_epoch_started;
creating_pgs.erase(pgid);
wake_pg_waiters(pg->info.pgid);
pg->handle_create(&rctx);
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 53e9ae4bb19..2bc5d127c9c 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1042,11 +1042,11 @@ map<int, pg_info_t>::const_iterator PG::find_best_info(const map<int, pg_info_t>
for (map<int, pg_info_t>::const_iterator i = infos.begin();
i != infos.end();
++i) {
- if (max_last_epoch_started_found < i->second.history.last_epoch_started) {
+ if (max_last_epoch_started_found < i->second.last_epoch_started) {
min_last_update_acceptable = eversion_t::max();
- max_last_epoch_started_found = i->second.history.last_epoch_started;
+ max_last_epoch_started_found = i->second.last_epoch_started;
}
- if (max_last_epoch_started_found == i->second.history.last_epoch_started) {
+ if (max_last_epoch_started_found == i->second.last_epoch_started) {
if (min_last_update_acceptable > i->second.last_update)
min_last_update_acceptable = i->second.last_update;
}
@@ -1381,6 +1381,8 @@ void PG::activate(ObjectStore::Transaction& t,
send_notify = false;
+ info.last_epoch_started = query_epoch;
+
if (is_primary()) {
// If necessary, create might_have_unfound to help us find our unfound objects.
// NOTE: It's important that we build might_have_unfound before trimming the
@@ -1774,7 +1776,8 @@ void PG::all_activated_and_committed()
assert(is_primary());
assert(peer_activated.size() == acting.size());
- info.history.last_epoch_started = get_osdmap()->get_epoch();
+ // info.last_epoch_started is set during activate()
+ info.history.last_epoch_started = info.last_epoch_started;
share_pg_info();
update_stats();
@@ -4134,6 +4137,10 @@ void PG::share_pg_info()
// share new pg_info_t with replicas
for (unsigned i=1; i<acting.size(); i++) {
int peer = acting[i];
+ if (peer_info.count(i)) {
+ peer_info[i].last_epoch_started = info.last_epoch_started;
+ peer_info[i].history.merge(info.history);
+ }
MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
m->pg_list.push_back(
make_pair(
@@ -4523,6 +4530,10 @@ void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
dirty_info = true;
osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp);
+ assert(oinfo.last_epoch_started == info.last_epoch_started);
+ assert(info.history.last_epoch_started == oinfo.last_epoch_started);
+ assert(oinfo.history.last_epoch_started == oinfo.last_epoch_started);
+
// Handle changes to purged_snaps ONLY IF we have caught up
if (last_complete_ondisk.epoch >= info.history.last_epoch_started) {
interval_set<snapid_t> p;
@@ -6451,7 +6462,7 @@ PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
// We pull the log from the peer's last_epoch_started to ensure we
// get enough log to detect divergent updates.
- eversion_t since(pi.history.last_epoch_started, 0);
+ eversion_t since(pi.last_epoch_started, 0);
assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
if (pi.log_tail <= since) {
dout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 1ca5adc70be..4a1b3fcf2ef 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -1286,7 +1286,7 @@ void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
void pg_info_t::encode(bufferlist &bl) const
{
- ENCODE_START(26, 26, bl);
+ ENCODE_START(27, 26, bl);
::encode(pgid, bl);
::encode(last_update, bl);
::encode(last_complete, bl);
@@ -1295,12 +1295,13 @@ void pg_info_t::encode(bufferlist &bl) const
::encode(stats, bl);
history.encode(bl);
::encode(purged_snaps, bl);
+ ::encode(last_epoch_started, bl);
ENCODE_FINISH(bl);
}
void pg_info_t::decode(bufferlist::iterator &bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(26, 26, 26, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(27, 26, 26, bl);
if (struct_v < 23) {
old_pg_t opgid;
::decode(opgid, bl);
@@ -1325,6 +1326,11 @@ void pg_info_t::decode(bufferlist::iterator &bl)
set<snapid_t> snap_trimq;
::decode(snap_trimq, bl);
}
+ if (struct_v < 27) {
+ last_epoch_started = history.last_epoch_started;
+ } else {
+ ::decode(last_epoch_started, bl);
+ }
DECODE_FINISH(bl);
}
@@ -1348,6 +1354,7 @@ void pg_info_t::dump(Formatter *f) const
f->dump_int("empty", is_empty());
f->dump_int("dne", dne());
f->dump_int("incomplete", is_incomplete());
+ f->dump_int("last_epoch_started", last_epoch_started);
}
void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index bb2ed253ce6..da2b2abf319 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -1039,6 +1039,7 @@ struct pg_info_t {
pg_t pgid;
eversion_t last_update; // last object version applied to store.
eversion_t last_complete; // last version pg was complete through.
+ epoch_t last_epoch_started;// last epoch at which this pg started on this osd
eversion_t log_tail; // oldest log entry.
@@ -1051,11 +1052,11 @@ struct pg_info_t {
pg_history_t history;
pg_info_t()
- : last_backfill(hobject_t::get_max())
+ : last_epoch_started(0), last_backfill(hobject_t::get_max())
{ }
pg_info_t(pg_t p)
: pgid(p),
- last_backfill(hobject_t::get_max())
+ last_epoch_started(0), last_backfill(hobject_t::get_max())
{ }
bool is_empty() const { return last_update.version == 0; }
@@ -1086,6 +1087,7 @@ inline ostream& operator<<(ostream& out, const pg_info_t& pgi)
out << " lb " << pgi.last_backfill;
}
//out << " c " << pgi.epoch_created;
+ out << " local-les=" << pgi.last_epoch_started;
out << " n=" << pgi.stats.stats.sum.num_objects;
out << " " << pgi.history
<< ")";