diff options
author | Mike Wallace <mikewallace1979@googlemail.com> | 2014-08-22 15:50:02 +0100 |
---|---|---|
committer | Jay Doane <jaydoane@apache.org> | 2021-04-19 00:35:19 -0700 |
commit | 4737cd789b42a66e1960769c425297b8fe4c07f7 (patch) | |
tree | 0a412c1fdac5eff5815e4c5a34e97172c1ddac00 | |
parent | 8f14524059e359fa47c4983e699f260cc8df1374 (diff) | |
download | couchdb-4737cd789b42a66e1960769c425297b8fe4c07f7.tar.gz |
Add check for whether node can be safely rebuilt
This commit adds a diagnostic check which determines whether a
node can be safely rebuilt. If a node can be taken out of service
without leaving any shards on the cluster with fewer than two live
copies then it is considered safe and an info message is returned.
If taking the node out of service would leave one or more shards
with one live copy then an error message is returned. If zero live
copies would be left then a critical message is returned.
The bulk of work in this commit was authored in:
cloudant/snippet@004537c28ae2c764c29c20e708681da7f542e21c
BugzID: 33831
-rw-r--r-- | src/weatherreport/src/weatherreport_check_safe_to_rebuild.erl | 116 |
1 files changed, 116 insertions, 0 deletions
%% -------------------------------------------------------------------
%%
%% weatherreport - automated diagnostic tools for CouchDB
%%
%% Copyright (c) 2014 Cloudant
%%
%% This file is provided to you under the Apache License,
%% Version 2.0 (the "License"); you may not use this file
%% except in compliance with the License. You may obtain
%% a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing,
%% software distributed under the License is distributed on an
%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
%% KIND, either express or implied. See the License for the
%% specific language governing permissions and limitations
%% under the License.
%%
%% -------------------------------------------------------------------

%% @doc Diagnostic that checks whether the current node can be
%% safely rebuilt (i.e. taken out of service).
%%
%% The check emits two messages: one about shards that would be left
%% with no live copies (`n0') and one about shards that would be left
%% with exactly one live copy (`n1').
-module(weatherreport_check_safe_to_rebuild).
-behaviour(weatherreport_check).

-export([description/0,
         valid/0,
         check/1,
         format/1]).

%% @doc One-line, human-readable summary of this diagnostic.
-spec description() -> string().
description() ->
    "Check whether the node can safely be taken out of service".

%% @doc Whether this diagnostic can run; delegates to
%% weatherreport_node:can_connect/0.
-spec valid() -> boolean().
valid() ->
    weatherreport_node:can_connect().

%% @doc Check if rebuilding a node is safe. Safe in this context means
%% that no shard would end up with N<Threshold when the node is offline.
%%
%% Returns one printable string ("Db Start-End", range bounds rendered
%% through couch_util:to_hex/1) for every shard hosted on `Node' whose
%% custodian count is at or below the effective threshold. An empty
%% list means no shard on `Node' is affected.
-spec safe_to_rebuild(atom(), integer()) -> [list()].
safe_to_rebuild(Node, RawThreshold) ->
    Threshold = effective_threshold(RawThreshold),
    %% Only membership is ever tested, so keep the [Db, Range] keys in a
    %% set. Report entries that do not have the {Db, Range, {_, Count}}
    %% shape are skipped.
    %% NOTE(review): Count is assumed to be the number of live copies of
    %% the shard range -- confirm against custodian:report/0.
    InDanger = sets:from_list([
        [Db, Range]
     || {Db, Range, {_, Count}} <- custodian:report(), Count =< Threshold
    ]),
    mem3_shards:fold(
        fun(Shard, Acc) -> collect_shard(Shard, Node, InDanger, Acc) end,
        []
    ).

%% @doc Threshold actually compared against the custodian counts: one
%% lower than `RawThreshold' when the "cloudant"/"maintenance_mode"
%% config value is "true", otherwise `RawThreshold' unchanged.
%% NOTE(review): presumably a node in maintenance mode is already
%% excluded from custodian's counts -- confirm.
-spec effective_threshold(integer()) -> integer().
effective_threshold(RawThreshold) ->
    case config:get("cloudant", "maintenance_mode") of
        "true" -> RawThreshold - 1;
        _ -> RawThreshold
    end.

%% @doc Fold step: prepend the printable name of `Shard' to `Acc' when
%% the shard lives on `Node' and its [Db, Range] key is in `InDanger'.
%% Anything else leaves `Acc' untouched.
%%
%% `Node' appears twice in the first clause head on purpose: the shard's
%% node element must equal the node being checked.
%% NOTE(review): the shard is matched as a raw 6-tuple tagged `shard'
%% (presumably mem3's #shard record). If that layout ever changes this
%% clause silently stops matching -- confirm against mem3.
-spec collect_shard(term(), atom(), sets:set(), [list()]) -> [list()].
collect_shard({shard, _, Node, Db, [Start, End], _}, Node, InDanger, Acc) ->
    case sets:is_element([Db, [Start, End]], InDanger) of
        true -> [pretty_shard(Db, Start, End) | Acc];
        false -> Acc
    end;
collect_shard(_Shard, _Node, _InDanger, Acc) ->
    Acc.

%% @doc Render a shard as a flat string "Db Start-End", with both range
%% bounds encoded as 32-bit integers and passed through
%% couch_util:to_hex/1. The result is flattened so that equal shards
%% compare equal in lists:subtract/2 and can be joined by string:join/2.
-spec pretty_shard(term(), integer(), integer()) -> string().
pretty_shard(Db, Start, End) ->
    lists:flatten(
        io_lib:format("~s ~s-~s", [
            Db,
            couch_util:to_hex(<<Start:32/integer>>),
            couch_util:to_hex(<<End:32/integer>>)
        ])
    ).

%% @doc Wrap a shard list in a weatherreport message. An empty list is
%% always `info'; a non-empty `n1' list is `error' and a non-empty `n0'
%% list is `crit'.
-spec shards_to_message(atom(), list()) -> {atom(), {atom(), list()}}.
shards_to_message(n1, []) ->
    {info, {n1, []}};
shards_to_message(n1, Shards) ->
    {error, {n1, Shards}};
shards_to_message(n0, []) ->
    {info, {n0, []}};
shards_to_message(n0, Shards) ->
    {crit, {n0, Shards}}.

%% @doc Run the diagnostic for the local node. Always returns two
%% messages, `n0' first and `n1' second.
-spec check(list()) -> [{atom(), term()}].
check(_Opts) ->
    N0Shards = safe_to_rebuild(node(), 1),
    %% Shards already reported under n0 are removed from the n1 list so
    %% that each shard is reported once, at its highest severity.
    N1Shards = lists:subtract(safe_to_rebuild(node(), 2), N0Shards),
    [shards_to_message(n0, N0Shards), shards_to_message(n1, N1Shards)].

%% @doc Turn a message payload produced by check/1 into a format string
%% and its arguments. Shard names are joined with ", ".
-spec format(term()) -> {io:format(), [term()]}.
format({n1, []}) ->
    {"This node can be rebuilt without causing any shards to become N=1", []};
format({n1, Shards}) ->
    {
        "Rebuilding this node will leave the following shards with only one live copy: ~s",
        [string:join(Shards, ", ")]
    };
format({n0, []}) ->
    {"This node can be rebuilt without causing any shards to become N=0", []};
format({n0, Shards}) ->
    {
        "Rebuilding this node will leave the following shard with NO live copies: ~s",
        [string:join(Shards, ", ")]
    }.