diff options
author | Sage Weil <sage@inktank.com> | 2013-01-27 19:57:58 -0800 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-01-28 17:13:59 -0800 |
commit | b955a599a6aa7d462c88e948f65508c8b9e72ba1 (patch) | |
tree | 9a695b485cffc90f31eecb450021a80c44320204 | |
parent | 2b8ba7ca23fe7d9ab741b1f4e087f3eb78ef7d1f (diff) | |
download | ceph-b955a599a6aa7d462c88e948f65508c8b9e72ba1.tar.gz |
mon: set limit so that we do not mark an entire down subtree out
Add a new configurable option, 'mon osd down out subtree limit', so that you can
prevent marking out an entire subtree. If for example an entire rack is
down, do not mark anything in it out. If less than the whole rack is down,
everything is fair game.
Set the default to 'rack'.
Signed-off-by: Sage Weil <sage@inktank.com>
-rw-r--r-- | src/common/config_opts.h | 1 | ||||
-rw-r--r-- | src/mon/OSDMonitor.cc | 16 |
2 files changed, 17 insertions, 0 deletions
diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 59caca5a6a2..a778268d51a 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -127,6 +127,7 @@ OPTION(mon_osd_auto_mark_in, OPT_BOOL, false) // mark any booting osds ' OPTION(mon_osd_auto_mark_auto_out_in, OPT_BOOL, true) // mark booting auto-marked-out osds 'in' OPTION(mon_osd_auto_mark_new_in, OPT_BOOL, true) // mark booting new osds 'in' OPTION(mon_osd_down_out_interval, OPT_INT, 300) // seconds +OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // largest crush unit/type that we will automatically mark out OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .3) // min osds required to be in to mark things out OPTION(mon_lease, OPT_FLOAT, 5) // lease interval diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 96e2aa12ea7..6ab267b0efb 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1458,6 +1458,8 @@ void OSDMonitor::tick() * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us. */ if (can_mark_out(-1)) { + set<int> down_cache; // quick cache of down subtrees + map<int,utime_t>::iterator i = down_pending_out.begin(); while (i != down_pending_out.end()) { int o = i->first; @@ -1484,6 +1486,20 @@ void OSDMonitor::tick() grace += my_grace; } + // is this an entire large subtree down? + if (g_conf->mon_osd_down_out_subtree_limit.length()) { + int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit.c_str()); + if (type > 0) { + if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) { + dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit + << " subtree for osd." << o << " is down; resetting timer" << dendl; + // reset timer, too. 
+ down_pending_out[o] = now; + continue; + } + } + } + if (g_conf->mon_osd_down_out_interval > 0 && down.sec() >= grace) { dout(10) << "tick marking osd." << o << " OUT after " << down |