summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Simon MacMullen <simon@rabbitmq.com> 2014-10-03 17:04:33 +0100
committer Simon MacMullen <simon@rabbitmq.com> 2014-10-03 17:04:33 +0100
commit 25dbc02119a39ca4263ff1cf957100208177ff03 (patch)
tree ba0800f2fec2c9da8adec03a377fbcaecbaf52b6
parent d62a6d8e4c8cfafc7a562d2b0d62eaade3ca1314 (diff)
download rabbitmq-server-25dbc02119a39ca4263ff1cf957100208177ff03.tar.gz
Distinguish between "already stopped" (fine, carry on) and "already down" (abort since we've lost contact).
-rw-r--r-- src/rabbit_autoheal.erl 42
-rw-r--r-- src/rabbit_node_monitor.erl 3
2 files changed, 28 insertions, 17 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index 13df1662..beb06a8c 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -106,10 +106,7 @@ node_down(_Node, not_healing) ->
not_healing;
node_down(Node, {winner_waiting, _, Notify}) ->
- rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]),
- %% Make sure any nodes waiting for us start - it won't necessarily
- %% heal the partition but at least they won't get stuck.
- winner_finish(Notify);
+ abort([Node], Notify);
node_down(Node, _State) ->
rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]),
@@ -190,6 +187,12 @@ handle_msg(_, restarting, _Partitions) ->
send(Node, Msg) -> {?SERVER, Node} ! {autoheal_msg, Msg}.
+abort(Down, Notify) ->
+ rabbit_log:info("Autoheal: aborting - ~p down~n", [Down]),
+ %% Make sure any nodes waiting for us start - it won't necessarily
+ %% heal the partition but at least they won't get stuck.
+ winner_finish(Notify).
+
winner_finish(Notify) ->
[{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify],
not_healing.
@@ -231,16 +234,23 @@ all_partitions([{Node, CantSee} | Rest], Partitions) ->
%% We could have received and ignored DOWN messages from some losers
%% before becoming the winner - check for already down nodes.
-filter_already_down_losers(WaitFor, Notify) ->
- WaitFor2 = rabbit_node_monitor:alive_rabbit_nodes(WaitFor),
- case WaitFor of
- WaitFor2 -> ok;
- _ -> rabbit_log:info("Autoheal: ~p already down~n",
- [WaitFor -- WaitFor2])
- end,
- case WaitFor2 of
- [] -> rabbit_log:info(
- "Autoheal: final node has stopped, starting...~n",[]),
- winner_finish(Notify);
- _ -> {winner_waiting, WaitFor2, Notify}
+filter_already_down_losers(WantStopped, Notify) ->
+ Down = WantStopped -- rabbit_node_monitor:alive_nodes(WantStopped),
+ case Down of
+ [] ->
+ Running = rabbit_node_monitor:alive_rabbit_nodes(WantStopped),
+ AlreadyStopped = WantStopped -- Running,
+ case AlreadyStopped of
+ [] -> ok;
+ _ -> rabbit_log:info(
+ "Autoheal: ~p already down~n", [AlreadyStopped])
+ end,
+ case Running of
+ [] -> rabbit_log:info(
+ "Autoheal: final node has stopped, starting...~n",[]),
+ winner_finish(Notify);
+ _ -> {winner_waiting, Running, Notify}
+ end;
+ _ ->
+ abort(Down, Notify)
end.
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index af294bab..051b992f 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -33,7 +33,7 @@
%% Utils
-export([all_rabbit_nodes_up/0, run_outside_applications/1, ping_all/0,
- alive_rabbit_nodes/1]).
+ alive_nodes/1, alive_rabbit_nodes/1]).
-define(SERVER, ?MODULE).
-define(RABBIT_UP_RPC_TIMEOUT, 2000).
@@ -67,6 +67,7 @@
-spec(all_rabbit_nodes_up/0 :: () -> boolean()).
-spec(run_outside_applications/1 :: (fun (() -> any())) -> pid()).
-spec(ping_all/0 :: () -> 'ok').
+-spec(alive_nodes/1 :: ([node()]) -> [node()]).
-spec(alive_rabbit_nodes/1 :: ([node()]) -> [node()]).
-endif.