 src/rabbit_autoheal.erl     | 47 ++++++++++++++++++++++++-----------------------
 src/rabbit_node_monitor.erl |  8 +++++---
 2 files changed, 29 insertions(+), 26 deletions(-)
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index c6b26245..3aa32c09 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -16,7 +16,7 @@
 -module(rabbit_autoheal).
 
--export([init/0, maybe_start/1, node_down/2, handle_msg/3]).
+-export([init/0, maybe_start/1, rabbit_down/2, node_down/2, handle_msg/3]).
 
 %% The named process we are running in.
 -define(SERVER, rabbit_node_monitor).
 
@@ -37,10 +37,13 @@
 %% selected as the first node in the cluster.
 %%
 %% To coordinate the restarting nodes we pick a special node from the
-%% winning partition - the "winner". Restarting nodes then stop, tell
-%% the winner they have done so, and wait for it to tell them it is
-%% safe to start again. The winner and the leader are not necessarily
-%% the same node.
+%% winning partition - the "winner". Restarting nodes then stop, and
+%% wait for it to tell them it is safe to start again. The winner
+%% determines that a node has stopped just by seeing if its rabbit app
+%% stops - if a node stops for any other reason it just gets a message
+%% it will ignore, and otherwise we carry on.
+%%
+%% The winner and the leader are not necessarily the same node.
 %%
 %% Possible states:
 %%
@@ -75,6 +78,21 @@ maybe_start(State) ->
 enabled() ->
     {ok, autoheal} =:= application:get_env(rabbit, cluster_partition_handling).
 
+
+%% This is the winner receiving its last notification that a node has
+%% stopped - all nodes can now start again
+rabbit_down(Node, {winner_waiting, [Node], Notify}) ->
+    rabbit_log:info("Autoheal: final node has stopped, starting...~n",[]),
+    notify_safe(Notify),
+    not_healing;
+
+rabbit_down(Node, {winner_waiting, WaitFor, Notify}) ->
+    {winner_waiting, WaitFor -- [Node], Notify};
+
+rabbit_down(_Node, State) ->
+    %% ignore, we already cancelled the autoheal process
+    State.
+
 node_down(_Node, not_healing) ->
     not_healing;
 
@@ -135,7 +153,6 @@ handle_msg({winner_is, Winner},
       fun () ->
               MRef = erlang:monitor(process, {?SERVER, Winner}),
               rabbit:stop(),
-              send(Winner, {node_stopped, node()}),
              receive
                  {'DOWN', MRef, process, {?SERVER, Winner}, _Reason} -> ok;
                  autoheal_safe_to_start                              -> ok
@@ -145,25 +162,9 @@ handle_msg({winner_is, Winner},
       end),
     restarting;
 
-%% This is the winner receiving its last notification that a node has
-%% stopped - all nodes can now start again
-handle_msg({node_stopped, Node},
-           {winner_waiting, [Node], Notify}, _Partitions) ->
-    rabbit_log:info("Autoheal: final node has stopped, starting...~n",[]),
-    notify_safe(Notify),
-    not_healing;
-
-handle_msg({node_stopped, Node},
-           {winner_waiting, WaitFor, Notify}, _Partitions) ->
-    {winner_waiting, WaitFor -- [Node], Notify};
-
 handle_msg(_, restarting, _Partitions) ->
     %% ignore, we can contribute no further
-    restarting;
-
-handle_msg({node_stopped, _Node}, State, _Partitions) ->
-    %% ignore, we already cancelled the autoheal process
-    State.
+    restarting.
 
 %%----------------------------------------------------------------------------
 
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index c47e9b24..46dbd7b7 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -387,7 +387,8 @@ wait_for_cluster_recovery(Nodes) ->
             wait_for_cluster_recovery(Nodes)
     end.
 
-handle_dead_rabbit(Node, State = #state{partitions = Partitions}) ->
+handle_dead_rabbit(Node, State = #state{partitions = Partitions,
+                                        autoheal   = Autoheal}) ->
     %% TODO: This may turn out to be a performance hog when there are
     %% lots of nodes.  We really only need to execute some of these
     %% statements on *one* node, rather than all of them.
@@ -404,8 +405,9 @@ handle_dead_rabbit(Node, State = #state{partitions = Partitions}) ->
                      [] -> [];
                      _  -> Partitions
                  end,
-    ensure_ping_timer(State#state{partitions = Partitions1}).
-
+    ensure_ping_timer(
+      State#state{partitions = Partitions1,
+                  autoheal   = rabbit_autoheal:rabbit_down(Node, Autoheal)}).
 
 ensure_ping_timer(State) ->
     rabbit_misc:ensure_timer(
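For orientation, here is a minimal, runnable Erlang sketch (not part of the commit) of the winner-side state machine that the new rabbit_down/2 clauses implement, walked through a heal involving two losing nodes. The module name autoheal_sketch, the node names, and the printing notify_safe/1 are illustrative stand-ins; the real notify_safe/1 messages each stopped node with autoheal_safe_to_start.

-module(autoheal_sketch).
-export([demo/0]).

%% Mirrors the rabbit_down/2 clauses added above: the winner peels each
%% stopped node off its WaitFor list and, when the last one goes, tells
%% everyone it is safe to start again.
rabbit_down(Node, {winner_waiting, [Node], Notify}) ->
    io:format("Autoheal: final node ~p has stopped, starting...~n", [Node]),
    notify_safe(Notify),
    not_healing;
rabbit_down(Node, {winner_waiting, WaitFor, Notify}) ->
    {winner_waiting, WaitFor -- [Node], Notify};
rabbit_down(_Node, State) ->
    %% A rabbit app stopped while we are not healing: ignore it.
    State.

%% Stand-in for the real notification, which sends autoheal_safe_to_start.
notify_safe(Notify) ->
    [io:format("telling ~p it is safe to start~n", [N]) || N <- Notify],
    ok.

demo() ->
    S0 = {winner_waiting, [hare, bunny], [hare, bunny]},
    S1 = rabbit_down(hare, S0),                     %% one node still to stop
    {winner_waiting, [bunny], _} = S1,
    not_healing = rabbit_down(bunny, S1),           %% last node: notify all
    not_healing = rabbit_down(flopsy, not_healing), %% unrelated stop: ignored
    ok.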
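The design shift worth noting: losers no longer send an explicit {node_stopped, node()} message before shutting down. Instead the winner infers the stop from its node monitor, whose handle_dead_rabbit/2 callback fires whenever a node's rabbit application goes down for any reason, and threads the result through rabbit_autoheal:rabbit_down/2. Per the updated module comment, a node that stops outside of autoheal therefore just produces a notification the catch-all rabbit_down/2 clause ignores, instead of being a case the winner's protocol never hears about.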