summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/rabbit_autoheal.erl47
-rw-r--r--src/rabbit_node_monitor.erl8
2 files changed, 29 insertions, 26 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index c6b26245..3aa32c09 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -16,7 +16,7 @@
-module(rabbit_autoheal).
--export([init/0, maybe_start/1, node_down/2, handle_msg/3]).
+-export([init/0, maybe_start/1, rabbit_down/2, node_down/2, handle_msg/3]).
%% The named process we are running in.
-define(SERVER, rabbit_node_monitor).
@@ -37,10 +37,13 @@
%% selected as the first node in the cluster.
%%
%% To coordinate the restarting nodes we pick a special node from the
-%% winning partition - the "winner". Restarting nodes then stop, tell
-%% the winner they have done so, and wait for it to tell them it is
-%% safe to start again. The winner and the leader are not necessarily
-%% the same node.
+%% winning partition - the "winner". Restarting nodes then stop, and
+%% wait for the winner to tell them it is safe to start again. The winner
+%% determines that a node has stopped just by seeing if its rabbit app
+%% stops - if a node stops for any other reason it just gets a message
+%% it will ignore, and otherwise we carry on.
+%%
+%% The winner and the leader are not necessarily the same node.
%%
%% Possible states:
%%
@@ -75,6 +78,21 @@ maybe_start(State) ->
enabled() ->
{ok, autoheal} =:= application:get_env(rabbit, cluster_partition_handling).
+
+%% Node's rabbit has gone down. If this is the winner and Node was the last
+%% one it was waiting for, all nodes can now start again; yields not_healing.
+rabbit_down(Node, {winner_waiting, [Node], Notify}) ->
+ rabbit_log:info("Autoheal: final node has stopped, starting...~n",[]),
+ notify_safe(Notify),
+ not_healing;
+%% Winner still waiting: remove Node from WaitFor (no-op if it is not listed).
+rabbit_down(Node, {winner_waiting, WaitFor, Notify}) ->
+ {winner_waiting, WaitFor -- [Node], Notify};
+%% Any other autoheal state is handed back unchanged.
+rabbit_down(_Node, State) ->
+ %% ignore, we already cancelled the autoheal process
+ State.
+
node_down(_Node, not_healing) ->
not_healing;
@@ -135,7 +153,6 @@ handle_msg({winner_is, Winner},
fun () ->
MRef = erlang:monitor(process, {?SERVER, Winner}),
rabbit:stop(),
- send(Winner, {node_stopped, node()}),
receive
{'DOWN', MRef, process, {?SERVER, Winner}, _Reason} -> ok;
autoheal_safe_to_start -> ok
@@ -145,25 +162,9 @@ handle_msg({winner_is, Winner},
end),
restarting;
-%% This is the winner receiving its last notification that a node has
-%% stopped - all nodes can now start again
-handle_msg({node_stopped, Node},
- {winner_waiting, [Node], Notify}, _Partitions) ->
- rabbit_log:info("Autoheal: final node has stopped, starting...~n",[]),
- notify_safe(Notify),
- not_healing;
-
-handle_msg({node_stopped, Node},
- {winner_waiting, WaitFor, Notify}, _Partitions) ->
- {winner_waiting, WaitFor -- [Node], Notify};
-
handle_msg(_, restarting, _Partitions) ->
%% ignore, we can contribute no further
- restarting;
-
-handle_msg({node_stopped, _Node}, State, _Partitions) ->
- %% ignore, we already cancelled the autoheal process
- State.
+ restarting.
%%----------------------------------------------------------------------------
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index c47e9b24..46dbd7b7 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -387,7 +387,8 @@ wait_for_cluster_recovery(Nodes) ->
wait_for_cluster_recovery(Nodes)
end.
-handle_dead_rabbit(Node, State = #state{partitions = Partitions}) ->
+handle_dead_rabbit(Node, State = #state{partitions = Partitions,
+ autoheal = Autoheal}) ->
%% TODO: This may turn out to be a performance hog when there are
%% lots of nodes. We really only need to execute some of these
%% statements on *one* node, rather than all of them.
@@ -404,8 +405,9 @@ handle_dead_rabbit(Node, State = #state{partitions = Partitions}) ->
[] -> [];
_ -> Partitions
end,
- ensure_ping_timer(State#state{partitions = Partitions1}).
-
+ ensure_ping_timer(
+ State#state{partitions = Partitions1,
+ autoheal = rabbit_autoheal:rabbit_down(Node, Autoheal)}).
ensure_ping_timer(State) ->
rabbit_misc:ensure_timer(