diff options
author Mike Wallace <> 2014-08-22 15:50:02 +0100
committer Jay Doane <> 2021-04-19 00:35:19 -0700
commit 4737cd789b42a66e1960769c425297b8fe4c07f7 (patch)
parent 8f14524059e359fa47c4983e699f260cc8df1374 (diff)
Add check for whether node can be safely rebuilt
This commit adds a diagnostic check which determines whether a node can be safely rebuilt. If a node can be taken out of service without leaving any shards on the cluster with fewer than two live copies, then it is considered safe and an info message is returned. If taking the node out of service would leave one or more shards with only one live copy, then an error message is returned. If zero live copies would be left, then a critical message is returned. The bulk of work in this commit was authored in: cloudant/snippet@004537c28ae2c764c29c20e708681da7f542e21c BugzID: 33831
1 files changed, 116 insertions, 0 deletions
diff --git a/src/weatherreport/src/weatherreport_check_safe_to_rebuild.erl b/src/weatherreport/src/weatherreport_check_safe_to_rebuild.erl
new file mode 100644
index 000000000..85a5c6f2d
--- /dev/null
+++ b/src/weatherreport/src/weatherreport_check_safe_to_rebuild.erl
@@ -0,0 +1,116 @@
+%% -------------------------------------------------------------------
+%% weatherreport - automated diagnostic tools for CouchDB
+%% Copyright (c) 2014 Cloudant
+%% This file is provided to you under the Apache License,
+%% Version 2.0 (the "License"); you may not use this file
+%% except in compliance with the License. You may obtain
+%% a copy of the License at
+%%
+%%   http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing,
+%% software distributed under the License is distributed on an
+%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+%% KIND, either express or implied. See the License for the
+%% specific language governing permissions and limitations
+%% under the License.
+%% -------------------------------------------------------------------
+%% @doc Diagnostic that checks whether the current node can be
+%% safely rebuilt (i.e. taken out of service).
+-module(weatherreport_check_safe_to_rebuild).
+
+%% Entry points of this diagnostic check.
+-export([
+    description/0,
+    valid/0,
+    check/1,
+    format/1
+]).
+%% @doc Return the fixed, human-readable one-line summary of what this
+%% diagnostic checks.
+-spec description() -> string().
+description() ->
+ "Check whether the node can safely be taken out of service".
+%% @doc Return the result of weatherreport_node:can_connect/0 unchanged.
+%% NOTE(review): presumably this means the check is only applicable
+%% when the node can be reached -- confirm against the check framework.
+-spec valid() -> boolean().
+valid() ->
+ weatherreport_node:can_connect().
+%% @doc List the shards hosted on Node whose count in the custodian
+%% report is at or below the threshold, i.e. the shards that would end
+%% up with N<Threshold if Node went offline. Each element of the result
+%% is a flat string built from the database name and the two range
+%% bounds ("Db Start-End", bounds passed through couch_util:to_hex/1).
+%%
+%% When the "cloudant"/"maintenance_mode" config value is "true" the
+%% threshold is lowered by one.
+%% NOTE(review): presumably because the node is then already excluded
+%% from the live-copy count -- confirm against custodian.
+-spec safe_to_rebuild(atom(), integer()) -> [list()].
+safe_to_rebuild(Node, RawThreshold) ->
+    Threshold =
+        case config:get("cloudant", "maintenance_mode") of
+            "true" -> RawThreshold - 1;
+            _ -> RawThreshold
+        end,
+    %% Index the at-risk entries of the custodian report by [Db, Range];
+    %% report entries that do not have the expected shape are skipped.
+    AtRisk = dict:from_list([
+        {[Db, Range], Status}
+     || {Db, Range, {_, Count} = Status} <- custodian:report(),
+        Count =< Threshold
+    ]),
+    %% Only shard tuples owned by Node are considered; anything else
+    %% leaves the accumulator untouched.
+    Collect = fun
+        ({shard, _, Owner, Db, [Start, End] = Range, _}, Acc) when Owner =:= Node ->
+            case dict:is_key([Db, Range], AtRisk) of
+                true -> [describe_shard(Db, Start, End) | Acc];
+                false -> Acc
+            end;
+        (_, Acc) ->
+            Acc
+    end,
+    mem3_shards:fold(Collect, []).
+
+%% Render one shard as a flat "Db Start-End" string; each range bound is
+%% packed as a 32-bit integer before being handed to couch_util:to_hex/1.
+describe_shard(Db, Start, End) ->
+    lists:flatten(
+        io_lib:format("~s ~s-~s", [
+            Db,
+            couch_util:to_hex(<<Start:32/integer>>),
+            couch_util:to_hex(<<End:32/integer>>)
+        ])
+    ).
+%% @doc Wrap the shard list for a copy-count level (n1 or n0) in a
+%% severity-tagged message. An empty list is always reported as info;
+%% a non-empty list is an error for n1 and crit for n0.
+-spec shards_to_message(atom(), list()) -> {atom(), {atom(), list()}}.
+shards_to_message(Level, []) when Level =:= n1; Level =:= n0 ->
+    {info, {Level, []}};
+shards_to_message(n1, Shards) ->
+    {error, {n1, Shards}};
+shards_to_message(n0, Shards) ->
+    {crit, {n0, Shards}}.
+%% @doc Run the check for the local node. The options argument is
+%% ignored. Returns two messages, n0 first and then n1; shards already
+%% listed under n0 are removed from the n1 list so that no shard is
+%% reported twice.
+-spec check(list()) -> [{atom(), term()}].
+check(_Opts) ->
+    ThisNode = node(),
+    NoCopiesLeft = safe_to_rebuild(ThisNode, 1),
+    OneCopyLeft = safe_to_rebuild(ThisNode, 2) -- NoCopiesLeft,
+    [
+        shards_to_message(n0, NoCopiesLeft),
+        shards_to_message(n1, OneCopyLeft)
+    ].
+%% @doc Turn the payload of a check message into a
+%% {FormatString, Arguments} pair. An empty shard list yields a fixed
+%% "can be rebuilt" message with no arguments; otherwise the shard
+%% descriptions are joined with ", " and substituted for the single ~s
+%% in the format string.
+-spec format(term()) -> {io:format(), [term()]}.
+format({n1, []}) ->
+    {"This node can be rebuilt without causing any shards to become N=1", []};
+format({n1, Shards}) ->
+    {
+        "Rebuilding this node will leave the following shards with only one live copy: ~s",
+        [string:join(Shards, ", ")]
+    };
+format({n0, []}) ->
+    {"This node can be rebuilt without causing any shards to become N=0", []};
+format({n0, Shards}) ->
+    %% Plural "shards", matching the n1 message: the list may hold
+    %% more than one entry.
+    {
+        "Rebuilding this node will leave the following shards with NO live copies: ~s",
+        [string:join(Shards, ", ")]
+    }.