diff options
author | Simon MacMullen <simon@rabbitmq.com> | 2014-10-15 17:01:34 +0100 |
---|---|---|
committer | Simon MacMullen <simon@rabbitmq.com> | 2014-10-15 17:01:34 +0100 |
commit | 5198e99b0e697ae022abfacdaf2fcf4808a9ff19 (patch) | |
tree | 85c28db9a83e1840c36f649ef3db50d7a1ac89f2 | |
parent | f236972005a6c7f1a63968502ca5d38a9005cd2b (diff) | |
parent | 174a27e86d90e499eabccc00a05a889639ad8764 (diff) | |
download | rabbitmq-server-5198e99b0e697ae022abfacdaf2fcf4808a9ff19.tar.gz |
Merge bug26368
-rw-r--r-- | src/rabbit_autoheal.erl | 108 | ||||
-rw-r--r-- | src/rabbit_node_monitor.erl | 22 |
2 files changed, 69 insertions, 61 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index beb06a8c..90458741 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -117,50 +117,50 @@ node_down(Node, _State) -> handle_msg({request_start, Node}, not_healing, Partitions) -> rabbit_log:info("Autoheal request received from ~p~n", [Node]), - rabbit_node_monitor:ping_all(), - case rabbit_node_monitor:all_rabbit_nodes_up() of - false -> not_healing; - true -> AllPartitions = all_partitions(Partitions), - {Winner, Losers} = make_decision(AllPartitions), - rabbit_log:info("Autoheal decision~n" - " * Partitions: ~p~n" - " * Winner: ~p~n" - " * Losers: ~p~n", - [AllPartitions, Winner, Losers]), - [send(L, {winner_is, Winner}) || L <- Losers], - Continue = fun(Msg) -> - handle_msg(Msg, not_healing, Partitions) - end, - case node() =:= Winner of - true -> Continue({become_winner, Losers}); - false -> send(Winner, {become_winner, Losers}), %% [0] - case lists:member(node(), Losers) of - true -> Continue({winner_is, Winner}); - false -> {leader_waiting, Losers} - end - end + case check_other_nodes(Partitions) of + {error, E} -> + rabbit_log:info("Autoheal request denied: ~s~n", [fmt_error(E)]), + not_healing; + {ok, AllPartitions} -> + {Winner, Losers} = make_decision(AllPartitions), + rabbit_log:info("Autoheal decision~n" + " * Partitions: ~p~n" + " * Winner: ~p~n" + " * Losers: ~p~n", + [AllPartitions, Winner, Losers]), + Continue = fun(Msg) -> + handle_msg(Msg, not_healing, Partitions) + end, + case node() =:= Winner of + true -> Continue({become_winner, Losers}); + false -> send(Winner, {become_winner, Losers}), %% [0] + case lists:member(node(), Losers) of + true -> Continue({winner_is, Winner}); + false -> {leader_waiting, Losers} + end + end end; %% [0] If we are a loser we will never receive this message - but it %% won't stick in the mailbox as we are restarting anyway handle_msg({request_start, Node}, State, _Partitions) -> - rabbit_log:info("Autoheal request received from ~p when in state ~p; " - "ignoring~n", [Node, State]), + rabbit_log:info("Autoheal request received from ~p when healing; " + "ignoring~n", [Node]), State; handle_msg({become_winner, Losers}, not_healing, _Partitions) -> rabbit_log:info("Autoheal: I am the winner, waiting for ~p to stop~n", [Losers]), - filter_already_down_losers(Losers, Losers); - -handle_msg({become_winner, Losers}, - {winner_waiting, WaitFor, Notify}, _Partitions) -> - rabbit_log:info("Autoheal: I am the winner, waiting additionally for " - "~p to stop~n", [Losers]), - filter_already_down_losers(lists:usort(Losers ++ WaitFor), - lists:usort(Losers ++ Notify)); + %% The leader said everything was ready - do we agree? If not then + %% give up. + Down = Losers -- rabbit_node_monitor:alive_rabbit_nodes(Losers), + case Down of + [] -> [send(L, {winner_is, node()}) || L <- Losers], + {winner_waiting, Losers, Losers}; + _ -> abort(Down, Losers) + end; handle_msg({winner_is, Winner}, not_healing, _Partitions) -> @@ -212,11 +212,21 @@ partition_value(Partition) -> %% We have our local understanding of what partitions exist; but we %% only know which nodes we have been partitioned from, not which %% nodes are partitioned from each other. -all_partitions(PartitionedWith) -> +check_other_nodes(LocalPartitions) -> Nodes = rabbit_mnesia:cluster_nodes(all), - Partitions = [{node(), PartitionedWith} | - rabbit_node_monitor:partitions(Nodes -- [node()])], - all_partitions(Partitions, [Nodes]). + {Results, Bad} = rabbit_node_monitor:status(Nodes -- [node()]), + RemotePartitions = [{Node, proplists:get_value(partitions, Res)} + || {Node, Res} <- Results], + RemoteDown = [{Node, Down} + || {Node, Res} <- Results, + Down <- [Nodes -- proplists:get_value(nodes, Res)], + Down =/= []], + case {Bad, RemoteDown} of + {[], []} -> Partitions = [{node(), LocalPartitions} | RemotePartitions], + {ok, all_partitions(Partitions, [Nodes])}; + {[], _} -> {error, {remote_down, RemoteDown}}; + {_, _} -> {error, {nodes_down, Bad}} + end. all_partitions([], Partitions) -> Partitions; @@ -232,25 +242,7 @@ all_partitions([{Node, CantSee} | Rest], Partitions) -> end, all_partitions(Rest, Partitions1). -%% We could have received and ignored DOWN messages from some losers -%% before becoming the winner - check for already down nodes. -filter_already_down_losers(WantStopped, Notify) -> - Down = WantStopped -- rabbit_node_monitor:alive_nodes(WantStopped), - case Down of - [] -> - Running = rabbit_node_monitor:alive_rabbit_nodes(WantStopped), - AlreadyStopped = WantStopped -- Running, - case AlreadyStopped of - [] -> ok; - _ -> rabbit_log:info( - "Autoheal: ~p already down~n", [AlreadyStopped]) - end, - case Running of - [] -> rabbit_log:info( - "Autoheal: final node has stopped, starting...~n",[]), - winner_finish(Notify); - _ -> {winner_waiting, Running, Notify} - end; - _ -> - abort(Down, Notify) - end. +fmt_error({remote_down, RemoteDown}) -> + rabbit_misc:format("Remote nodes disconnected:~n ~p", [RemoteDown]); +fmt_error({nodes_down, NodesDown}) -> + rabbit_misc:format("Local nodes down: ~p", [NodesDown]). diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl index a948115d..e6069387 100644 --- a/src/rabbit_node_monitor.erl +++ b/src/rabbit_node_monitor.erl @@ -24,7 +24,7 @@ write_cluster_status/1, read_cluster_status/0, update_cluster_status/0, reset_cluster_status/0]). -export([notify_node_up/0, notify_joined_cluster/0, notify_left_cluster/1]). --export([partitions/0, partitions/1, subscribe/1]). +-export([partitions/0, partitions/1, status/1, subscribe/1]). -export([pause_minority_guard/0]). %% gen_server callbacks @@ -62,6 +62,7 @@ -spec(partitions/0 :: () -> [node()]). -spec(partitions/1 :: ([node()]) -> [{node(), [node()]}]). +-spec(status/1 :: ([node()]) -> {[{node(), [node()]}], [node()]}). -spec(subscribe/1 :: (pid()) -> 'ok'). -spec(pause_minority_guard/0 :: () -> 'ok' | 'pausing'). @@ -186,6 +187,9 @@ partitions(Nodes) -> {Replies, _} = gen_server:multi_call(Nodes, ?SERVER, partitions, infinity), Replies. +status(Nodes) -> + gen_server:multi_call(Nodes, ?SERVER, status, infinity). + subscribe(Pid) -> gen_server:cast(?SERVER, {subscribe, Pid}). @@ -252,6 +256,10 @@ init([]) -> handle_call(partitions, _From, State = #state{partitions = Partitions}) -> {reply, Partitions, State}; +handle_call(status, _From, State = #state{partitions = Partitions}) -> + {reply, [{partitions, Partitions}, + {nodes, [node() | nodes()]}], State}; + handle_call(_Request, _From, State) -> {noreply, State}. @@ -528,8 +536,16 @@ run_outside_applications(Fun) -> %% Ensure only one such process at a time, the %% exit(badarg) is harmless if one is already running try register(rabbit_outside_app_process, self()) of - true -> Fun() - catch error:badarg -> ok + true -> + try + Fun() + catch _:E -> + rabbit_log:error( + "rabbit_outside_app_process:~n~p~n~p~n", + [E, erlang:get_stacktrace()]) + end + catch error:badarg -> + ok end end). |