diff options
author | Simon MacMullen <simon@rabbitmq.com> | 2014-10-03 17:04:33 +0100 |
---|---|---|
committer | Simon MacMullen <simon@rabbitmq.com> | 2014-10-03 17:04:33 +0100 |
commit | 25dbc02119a39ca4263ff1cf957100208177ff03 (patch) | |
tree | ba0800f2fec2c9da8adec03a377fbcaecbaf52b6 | |
parent | d62a6d8e4c8cfafc7a562d2b0d62eaade3ca1314 (diff) | |
download | rabbitmq-server-25dbc02119a39ca4263ff1cf957100208177ff03.tar.gz |
Distinguish between "already stopped" (fine, carry on) and "already down" (abort since we've lost contact).
-rw-r--r-- | src/rabbit_autoheal.erl | 42 | ||||
-rw-r--r-- | src/rabbit_node_monitor.erl | 3 |
2 files changed, 28 insertions, 17 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 13df1662..beb06a8c 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -106,10 +106,7 @@ node_down(_Node, not_healing) -> not_healing; node_down(Node, {winner_waiting, _, Notify}) -> - rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]), - %% Make sure any nodes waiting for us start - it won't necessarily - %% heal the partition but at least they won't get stuck. - winner_finish(Notify); + abort([Node], Notify); node_down(Node, _State) -> rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]), @@ -190,6 +187,12 @@ handle_msg(_, restarting, _Partitions) -> send(Node, Msg) -> {?SERVER, Node} ! {autoheal_msg, Msg}. +abort(Down, Notify) -> + rabbit_log:info("Autoheal: aborting - ~p down~n", [Down]), + %% Make sure any nodes waiting for us start - it won't necessarily + %% heal the partition but at least they won't get stuck. + winner_finish(Notify). + winner_finish(Notify) -> [{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify], not_healing. @@ -231,16 +234,23 @@ all_partitions([{Node, CantSee} | Rest], Partitions) -> %% We could have received and ignored DOWN messages from some losers %% before becoming the winner - check for already down nodes. 
-filter_already_down_losers(WaitFor, Notify) -> - WaitFor2 = rabbit_node_monitor:alive_rabbit_nodes(WaitFor), - case WaitFor of - WaitFor2 -> ok; - _ -> rabbit_log:info("Autoheal: ~p already down~n", - [WaitFor -- WaitFor2]) - end, - case WaitFor2 of - [] -> rabbit_log:info( - "Autoheal: final node has stopped, starting...~n",[]), - winner_finish(Notify); - _ -> {winner_waiting, WaitFor2, Notify} +filter_already_down_losers(WantStopped, Notify) -> + Down = WantStopped -- rabbit_node_monitor:alive_nodes(WantStopped), + case Down of + [] -> + Running = rabbit_node_monitor:alive_rabbit_nodes(WantStopped), + AlreadyStopped = WantStopped -- Running, + case AlreadyStopped of + [] -> ok; + _ -> rabbit_log:info( + "Autoheal: ~p already down~n", [AlreadyStopped]) + end, + case Running of + [] -> rabbit_log:info( + "Autoheal: final node has stopped, starting...~n",[]), + winner_finish(Notify); + _ -> {winner_waiting, Running, Notify} + end; + _ -> + abort(Down, Notify) end. diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl index af294bab..051b992f 100644 --- a/src/rabbit_node_monitor.erl +++ b/src/rabbit_node_monitor.erl @@ -33,7 +33,7 @@ %% Utils -export([all_rabbit_nodes_up/0, run_outside_applications/1, ping_all/0, - alive_rabbit_nodes/1]). + alive_nodes/1, alive_rabbit_nodes/1]). -define(SERVER, ?MODULE). -define(RABBIT_UP_RPC_TIMEOUT, 2000). @@ -67,6 +67,7 @@ -spec(all_rabbit_nodes_up/0 :: () -> boolean()). -spec(run_outside_applications/1 :: (fun (() -> any())) -> pid()). -spec(ping_all/0 :: () -> 'ok'). +-spec(alive_nodes/1 :: ([node()]) -> [node()]). -spec(alive_rabbit_nodes/1 :: ([node()]) -> [node()]). -endif. |