summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2013-01-27 19:57:58 -0800
committerSage Weil <sage@inktank.com>2013-01-28 17:13:59 -0800
commitb955a599a6aa7d462c88e948f65508c8b9e72ba1 (patch)
tree9a695b485cffc90f31eecb450021a80c44320204
parent2b8ba7ca23fe7d9ab741b1f4e087f3eb78ef7d1f (diff)
downloadceph-b955a599a6aa7d462c88e948f65508c8b9e72ba1.tar.gz
mon: set limit so that we do not mark an entire down subtree out
Add new configurable 'mon osd down out subtree limit' so that you can prevent marking out an entire subtree. If for example an entire rack is down, do not mark anything in it out. If less than the whole rack is down, everything is fair game. Set the default to 'rack'. Signed-off-by: Sage Weil <sage@inktank.com>
-rw-r--r--src/common/config_opts.h1
-rw-r--r--src/mon/OSDMonitor.cc16
2 files changed, 17 insertions, 0 deletions
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 59caca5a6a2..a778268d51a 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -127,6 +127,7 @@ OPTION(mon_osd_auto_mark_in, OPT_BOOL, false) // mark any booting osds '
OPTION(mon_osd_auto_mark_auto_out_in, OPT_BOOL, true) // mark booting auto-marked-out osds 'in'
OPTION(mon_osd_auto_mark_new_in, OPT_BOOL, true) // mark booting new osds 'in'
OPTION(mon_osd_down_out_interval, OPT_INT, 300) // seconds
+OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // largest crush unit/type that we will automatically mark out
OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down
OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .3) // min osds required to be in to mark things out
OPTION(mon_lease, OPT_FLOAT, 5) // lease interval
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 96e2aa12ea7..6ab267b0efb 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1458,6 +1458,8 @@ void OSDMonitor::tick()
* ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
*/
if (can_mark_out(-1)) {
+ set<int> down_cache; // quick cache of down subtrees
+
map<int,utime_t>::iterator i = down_pending_out.begin();
while (i != down_pending_out.end()) {
int o = i->first;
@@ -1484,6 +1486,20 @@ void OSDMonitor::tick()
grace += my_grace;
}
+ // is this an entire large subtree down?
+ if (g_conf->mon_osd_down_out_subtree_limit.length()) {
+ int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit.c_str());
+ if (type > 0) {
+ if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) {
+ dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit
+ << " subtree for osd." << o << " is down; resetting timer" << dendl;
+ // reset timer, too.
+ down_pending_out[o] = now;
+ continue;
+ }
+ }
+ }
+
if (g_conf->mon_osd_down_out_interval > 0 &&
down.sec() >= grace) {
dout(10) << "tick marking osd." << o << " OUT after " << down