diff options
author | Samuel Just <sam.just@inktank.com> | 2013-10-04 12:29:26 -0700 |
---|---|---|
committer | Samuel Just <sam.just@inktank.com> | 2013-10-04 13:49:11 -0700 |
commit | 091809b8149c7595cbcca439c5b8b75a0c42efe1 (patch) | |
tree | 0b607c8e9ca8afe99bc3f3e58bcf797913922bfb | |
parent | 399f1d53f7f441992f48aa72139cd628c4ad4f29 (diff) | |
download | ceph-091809b8149c7595cbcca439c5b8b75a0c42efe1.tar.gz |
PGMap,PGMonitor: maintain mapping of osd to recent stat epoch
Also, osd_stat will now contain an empty entry for each out osd.
When an osd is marked out, rather than removing it from osd_stat,
we instead zero out its structure.
This patch also makes osd_stat_updates and osd_stat_rm private.
This should make it simpler to enforce invariants on these
mappings.
Each up osd will have a mapping since out osds are now included as
empty stats.
Signed-off-by: Samuel Just <sam.just@inktank.com>
-rw-r--r-- | src/mon/PGMap.cc | 45 | ||||
-rw-r--r-- | src/mon/PGMap.h | 37 | ||||
-rw-r--r-- | src/mon/PGMonitor.cc | 24 |
3 files changed, 87 insertions, 19 deletions
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index e9a35c6b8ab..0b40e9264ce 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -30,7 +30,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const return; } - ENCODE_START(6, 5, bl); + ENCODE_START(7, 5, bl); ::encode(version, bl); ::encode(pg_stat_updates, bl); ::encode(osd_stat_updates, bl); @@ -41,6 +41,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const ::encode(nearfull_ratio, bl); ::encode(pg_remove, bl); ::encode(stamp, bl); + ::encode(osd_epochs, bl); ENCODE_FINISH(bl); } @@ -89,6 +90,17 @@ void PGMap::Incremental::decode(bufferlist::iterator &bl) } if (struct_v >= 6) ::decode(stamp, bl); + if (struct_v >= 7) { + ::decode(osd_epochs, bl); + } else { + for (map<int32_t, osd_stat_t>::iterator i = osd_stat_updates.begin(); + i != osd_stat_updates.end(); + ++i) { + // This isn't accurate, but will cause trimming to behave like + // previously. + osd_epochs.insert(make_pair(i->first, osdmap_epoch)); + } + } DECODE_FINISH(bl); } @@ -195,8 +207,10 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) } stat_pg_add(update_pg, update_stat); } - for (map<int32_t,osd_stat_t>::const_iterator p = inc.osd_stat_updates.begin(); - p != inc.osd_stat_updates.end(); + assert(osd_stat.size() == osd_epochs.size()); + for (map<int32_t,osd_stat_t>::const_iterator p = + inc.get_osd_stat_updates().begin(); + p != inc.get_osd_stat_updates().end(); ++p) { int osd = p->first; const osd_stat_t &new_stats(p->second); @@ -209,6 +223,8 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) stat_osd_sub(t->second); t->second = new_stats; } + assert(inc.get_osd_epochs().find(osd) != inc.get_osd_epochs().end()); + osd_epochs.insert(*(inc.get_osd_epochs().find(osd))); stat_osd_add(new_stats); @@ -226,8 +242,8 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) } } - for (set<int>::iterator p = inc.osd_stat_rm.begin(); - p != 
inc.osd_stat_rm.end(); + for (set<int>::iterator p = inc.get_osd_stat_rm().begin(); + p != inc.get_osd_stat_rm().end(); ++p) { hash_map<int32_t,osd_stat_t>::iterator t = osd_stat.find(*p); if (t != osd_stat.end()) { @@ -434,7 +450,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const return; } - ENCODE_START(5, 4, bl); + ENCODE_START(6, 4, bl); ::encode(version, bl); ::encode(pg_stat, bl); ::encode(osd_stat, bl); @@ -443,6 +459,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const ::encode(full_ratio, bl); ::encode(nearfull_ratio, bl); ::encode(stamp, bl); + ::encode(osd_epochs, bl); ENCODE_FINISH(bl); } @@ -472,6 +489,17 @@ void PGMap::decode(bufferlist::iterator &bl) } if (struct_v >= 5) ::decode(stamp, bl); + if (struct_v >= 6) { + ::decode(osd_epochs, bl); + } else { + for (hash_map<int32_t, osd_stat_t>::iterator i = osd_stat.begin(); + i != osd_stat.end(); + ++i) { + // This isn't accurate, but will cause trimming to behave like + // previously. + osd_epochs.insert(make_pair(i->first, last_osdmap_epoch)); + } + } DECODE_FINISH(bl); calc_stats(); @@ -488,7 +516,10 @@ void PGMap::dirty_all(Incremental& inc) inc.pg_stat_updates[p->first] = p->second; } for (hash_map<int32_t, osd_stat_t>::const_iterator p = osd_stat.begin(); p != osd_stat.end(); ++p) { - inc.osd_stat_updates[p->first] = p->second; + assert(inc.get_osd_epochs().count(p->first)); + inc.update_stat(p->first, + inc.get_osd_epochs().find(p->first)->second, + p->second); } } diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 84d89f87517..7a202fc0006 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -43,12 +43,13 @@ public: float full_ratio; float nearfull_ratio; + // mapping of osd to most recently reported osdmap epoch + hash_map<int32_t,epoch_t> osd_epochs; + class Incremental { public: version_t version; map<pg_t,pg_stat_t> pg_stat_updates; - map<int32_t,osd_stat_t> osd_stat_updates; - set<int32_t> osd_stat_rm; epoch_t osdmap_epoch; epoch_t pg_scan; // osdmap epoch 
set<pg_t> pg_remove; @@ -56,6 +57,38 @@ public: float nearfull_ratio; utime_t stamp; + private: + map<int32_t,osd_stat_t> osd_stat_updates; + set<int32_t> osd_stat_rm; + + // mapping of osd to most recently reported osdmap epoch + map<int32_t,epoch_t> osd_epochs; + public: + + const map<int32_t, osd_stat_t> &get_osd_stat_updates() const { + return osd_stat_updates; + } + const set<int32_t> &get_osd_stat_rm() const { + return osd_stat_rm; + } + const map<int32_t, epoch_t> &get_osd_epochs() const { + return osd_epochs; + } + + void update_stat(int32_t osd, epoch_t epoch, const osd_stat_t &stat) { + osd_stat_updates[osd] = stat; + osd_epochs[osd] = epoch; + assert(osd_epochs.size() == osd_stat_updates.size()); + } + void stat_osd_out(int32_t osd) { + // 0 the stats for the osd + osd_stat_updates[osd] = osd_stat_t(); + } + void rm_stat(int32_t osd) { + osd_stat_rm.insert(osd); + osd_epochs.erase(osd); + osd_stat_updates.erase(osd); + } void encode(bufferlist &bl, uint64_t features=-1) const; void decode(bufferlist::iterator &bl); void dump(Formatter *f) const; diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 0f495052747..0644922ddb4 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -494,15 +494,19 @@ void PGMonitor::encode_pending(MonitorDBStore::Transaction *t) { bufferlist dirty; string prefix = pgmap_osd_prefix; - for (map<int32_t,osd_stat_t>::const_iterator p = pending_inc.osd_stat_updates.begin(); - p != pending_inc.osd_stat_updates.end(); + for (map<int32_t,osd_stat_t>::const_iterator p = + pending_inc.get_osd_stat_updates().begin(); + p != pending_inc.get_osd_stat_updates().end(); ++p) { ::encode(p->first, dirty); bufferlist bl; ::encode(p->second, bl, features); t->put(prefix, stringify(p->first), bl); } - for (set<int32_t>::const_iterator p = pending_inc.osd_stat_rm.begin(); p != pending_inc.osd_stat_rm.end(); ++p) { + for (set<int32_t>::const_iterator p = + pending_inc.get_osd_stat_rm().begin(); + p != 
pending_inc.get_osd_stat_rm().end(); + ++p) { ::encode(*p, dirty); t->erase(prefix, stringify(*p)); } @@ -725,7 +729,11 @@ bool PGMonitor::prepare_pg_stats(MPGStats *stats) } // osd stat - pending_inc.osd_stat_updates[from] = stats->osd_stat; + if (mon->osdmon()->osdmap.is_in(from)) { + pending_inc.update_stat(from, stats->epoch, stats->osd_stat); + } else { + pending_inc.update_stat(from, stats->epoch, osd_stat_t()); + } if (pg_map.osd_stat.count(from)) dout(10) << " got osd." << from << " " << stats->osd_stat << " (was " << pg_map.osd_stat[from] << ")" << dendl; @@ -842,11 +850,7 @@ void PGMonitor::check_osd_map(epoch_t epoch) ++p) if (p->second == CEPH_OSD_OUT) { dout(10) << "check_osd_map osd." << p->first << " went OUT" << dendl; - pending_inc.osd_stat_rm.insert(p->first); - } else { - dout(10) << "check_osd_map osd." << p->first << " is IN" << dendl; - pending_inc.osd_stat_rm.erase(p->first); - pending_inc.osd_stat_updates[p->first]; + pending_inc.stat_osd_out(p->first); } // this is conservative: we want to know if any osds (maybe) got marked down. @@ -867,7 +871,7 @@ void PGMonitor::check_osd_map(epoch_t epoch) // whether it was created *or* destroyed, we can safely drop // it's osd_stat_t record. dout(10) << "check_osd_map osd." << p->first << " created or destroyed" << dendl; - pending_inc.osd_stat_rm.insert(p->first); + pending_inc.rm_stat(p->first); // and adjust full, nearfull set pg_map.nearfull_osds.erase(p->first); |