 src/rabbit_autoheal.erl     | 47 ++++++++++++++++++++++++-----------------------
 src/rabbit_node_monitor.erl |  8 +++++---
 2 files changed, 29 insertions(+), 26 deletions(-)
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index c6b26245..3aa32c09 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -16,7 +16,7 @@
 -module(rabbit_autoheal).
 
--export([init/0, maybe_start/1, node_down/2, handle_msg/3]).
+-export([init/0, maybe_start/1, rabbit_down/2, node_down/2, handle_msg/3]).
 
 %% The named process we are running in.
 -define(SERVER, rabbit_node_monitor).
 
@@ -37,10 +37,13 @@
 %% selected as the first node in the cluster.
 %%
 %% To coordinate the restarting nodes we pick a special node from the
-%% winning partition - the "winner". Restarting nodes then stop, tell
-%% the winner they have done so, and wait for it to tell them it is
-%% safe to start again. The winner and the leader are not necessarily
-%% the same node.
+%% winning partition - the "winner". Restarting nodes then stop, and
+%% wait for it to tell them it is safe to start again. The winner
+%% determines that a node has stopped just by seeing if its rabbit app
+%% stops - if a node stops for any other reason it just gets a message
+%% it will ignore, and otherwise we carry on.
+%%
+%% The winner and the leader are not necessarily the same node.
 %%
 %% Possible states:
 %%
@@ -75,6 +78,21 @@ maybe_start(State) ->
 enabled() ->
     {ok, autoheal} =:= application:get_env(rabbit, cluster_partition_handling).
 
+
+%% This is the winner receiving its last notification that a node has
+%% stopped - all nodes can now start again
+rabbit_down(Node, {winner_waiting, [Node], Notify}) ->
+    rabbit_log:info("Autoheal: final node has stopped, starting...~n",[]),
+    notify_safe(Notify),
+    not_healing;
+
+rabbit_down(Node, {winner_waiting, WaitFor, Notify}) ->
+    {winner_waiting, WaitFor -- [Node], Notify};
+
+rabbit_down(_Node, State) ->
+    %% ignore, we already cancelled the autoheal process
+    State.
+
 node_down(_Node, not_healing) ->
     not_healing;
 
@@ -135,7 +153,6 @@ handle_msg({winner_is, Winner},
       fun () ->
               MRef = erlang:monitor(process, {?SERVER, Winner}),
               rabbit:stop(),
-              send(Winner, {node_stopped, node()}),
              receive
                  {'DOWN', MRef, process, {?SERVER, Winner}, _Reason} -> ok;
                  autoheal_safe_to_start                              -> ok
@@ -145,25 +162,9 @@ handle_msg({winner_is, Winner},
       end),
     restarting;
 
-%% This is the winner receiving its last notification that a node has
-%% stopped - all nodes can now start again
-handle_msg({node_stopped, Node},
-           {winner_waiting, [Node], Notify}, _Partitions) ->
-    rabbit_log:info("Autoheal: final node has stopped, starting...~n",[]),
-    notify_safe(Notify),
-    not_healing;
-
-handle_msg({node_stopped, Node},
-           {winner_waiting, WaitFor, Notify}, _Partitions) ->
-    {winner_waiting, WaitFor -- [Node], Notify};
-
 handle_msg(_, restarting, _Partitions) ->
     %% ignore, we can contribute no further
-    restarting;
-
-handle_msg({node_stopped, _Node}, State, _Partitions) ->
-    %% ignore, we already cancelled the autoheal process
-    State.
+    restarting.
 
 %%----------------------------------------------------------------------------
 
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index c47e9b24..46dbd7b7 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -387,7 +387,8 @@ wait_for_cluster_recovery(Nodes) ->
             wait_for_cluster_recovery(Nodes)
     end.
 
-handle_dead_rabbit(Node, State = #state{partitions = Partitions}) ->
+handle_dead_rabbit(Node, State = #state{partitions = Partitions,
+                                        autoheal   = Autoheal}) ->
     %% TODO: This may turn out to be a performance hog when there are
     %% lots of nodes.  We really only need to execute some of these
     %% statements on *one* node, rather than all of them.
@@ -404,8 +405,9 @@ handle_dead_rabbit(Node, State = #state{partitions = Partitions}) ->
                      [] -> [];
                      _  -> Partitions
                  end,
-    ensure_ping_timer(State#state{partitions = Partitions1}).
-
+    ensure_ping_timer(
+      State#state{partitions = Partitions1,
+                  autoheal   = rabbit_autoheal:rabbit_down(Node, Autoheal)}).
 
 ensure_ping_timer(State) ->
     rabbit_misc:ensure_timer(
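For orientation, here is a minimal, runnable Erlang sketch (not part of the commit) of the winner-side state machine that the new rabbit_down/2 clauses implement, walked through a heal involving two losing nodes. The module name autoheal_sketch, the node names, and the printing notify_safe/1 are illustrative stand-ins; the real notify_safe/1 messages each stopped node with autoheal_safe_to_start.

-module(autoheal_sketch).
-export([demo/0]).

%% Mirrors the rabbit_down/2 clauses added above: the winner peels each
%% stopped node off its WaitFor list and, when the last one goes, tells
%% everyone it is safe to start again.
rabbit_down(Node, {winner_waiting, [Node], Notify}) ->
    io:format("Autoheal: final node ~p has stopped, starting...~n", [Node]),
    notify_safe(Notify),
    not_healing;
rabbit_down(Node, {winner_waiting, WaitFor, Notify}) ->
    {winner_waiting, WaitFor -- [Node], Notify};
rabbit_down(_Node, State) ->
    %% A rabbit app stopped while we are not healing: ignore it.
    State.

%% Stand-in for the real notification, which sends autoheal_safe_to_start.
notify_safe(Notify) ->
    [io:format("telling ~p it is safe to start~n", [N]) || N <- Notify],
    ok.

demo() ->
    S0 = {winner_waiting, [hare, bunny], [hare, bunny]},
    S1 = rabbit_down(hare, S0),                     %% one node still to stop
    {winner_waiting, [bunny], _} = S1,
    not_healing = rabbit_down(bunny, S1),           %% last node: notify all
    not_healing = rabbit_down(flopsy, not_healing), %% unrelated stop: ignored
    ok.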
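The design shift worth noting: losers no longer send an explicit {node_stopped, node()} message before shutting down. Instead the winner infers the stop from its node monitor, whose handle_dead_rabbit/2 callback fires whenever a node's rabbit application goes down for any reason, and threads the result through rabbit_autoheal:rabbit_down/2. Per the updated module comment, a node that stops outside of autoheal therefore just produces a notification the catch-all rabbit_down/2 clause ignores, instead of being a case the winner's protocol never hears about.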