summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Simon MacMullen <simon@rabbitmq.com> 2014-10-15 17:01:34 +0100
committer Simon MacMullen <simon@rabbitmq.com> 2014-10-15 17:01:34 +0100
commit 5198e99b0e697ae022abfacdaf2fcf4808a9ff19 (patch)
tree 85c28db9a83e1840c36f649ef3db50d7a1ac89f2
parent f236972005a6c7f1a63968502ca5d38a9005cd2b (diff)
parent 174a27e86d90e499eabccc00a05a889639ad8764 (diff)
download rabbitmq-server-5198e99b0e697ae022abfacdaf2fcf4808a9ff19.tar.gz
Merge bug26368
-rw-r--r-- src/rabbit_autoheal.erl 108
-rw-r--r-- src/rabbit_node_monitor.erl 22
2 files changed, 69 insertions, 61 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index beb06a8c..90458741 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -117,50 +117,50 @@ node_down(Node, _State) ->
handle_msg({request_start, Node},
not_healing, Partitions) ->
rabbit_log:info("Autoheal request received from ~p~n", [Node]),
- rabbit_node_monitor:ping_all(),
- case rabbit_node_monitor:all_rabbit_nodes_up() of
- false -> not_healing;
- true -> AllPartitions = all_partitions(Partitions),
- {Winner, Losers} = make_decision(AllPartitions),
- rabbit_log:info("Autoheal decision~n"
- " * Partitions: ~p~n"
- " * Winner: ~p~n"
- " * Losers: ~p~n",
- [AllPartitions, Winner, Losers]),
- [send(L, {winner_is, Winner}) || L <- Losers],
- Continue = fun(Msg) ->
- handle_msg(Msg, not_healing, Partitions)
- end,
- case node() =:= Winner of
- true -> Continue({become_winner, Losers});
- false -> send(Winner, {become_winner, Losers}), %% [0]
- case lists:member(node(), Losers) of
- true -> Continue({winner_is, Winner});
- false -> {leader_waiting, Losers}
- end
- end
+ case check_other_nodes(Partitions) of
+ {error, E} ->
+ rabbit_log:info("Autoheal request denied: ~s~n", [fmt_error(E)]),
+ not_healing;
+ {ok, AllPartitions} ->
+ {Winner, Losers} = make_decision(AllPartitions),
+ rabbit_log:info("Autoheal decision~n"
+ " * Partitions: ~p~n"
+ " * Winner: ~p~n"
+ " * Losers: ~p~n",
+ [AllPartitions, Winner, Losers]),
+ Continue = fun(Msg) ->
+ handle_msg(Msg, not_healing, Partitions)
+ end,
+ case node() =:= Winner of
+ true -> Continue({become_winner, Losers});
+ false -> send(Winner, {become_winner, Losers}), %% [0]
+ case lists:member(node(), Losers) of
+ true -> Continue({winner_is, Winner});
+ false -> {leader_waiting, Losers}
+ end
+ end
end;
%% [0] If we are a loser we will never receive this message - but it
%% won't stick in the mailbox as we are restarting anyway
handle_msg({request_start, Node},
State, _Partitions) ->
- rabbit_log:info("Autoheal request received from ~p when in state ~p; "
- "ignoring~n", [Node, State]),
+ rabbit_log:info("Autoheal request received from ~p when healing; "
+ "ignoring~n", [Node]),
State;
handle_msg({become_winner, Losers},
not_healing, _Partitions) ->
rabbit_log:info("Autoheal: I am the winner, waiting for ~p to stop~n",
[Losers]),
- filter_already_down_losers(Losers, Losers);
-
-handle_msg({become_winner, Losers},
- {winner_waiting, WaitFor, Notify}, _Partitions) ->
- rabbit_log:info("Autoheal: I am the winner, waiting additionally for "
- "~p to stop~n", [Losers]),
- filter_already_down_losers(lists:usort(Losers ++ WaitFor),
- lists:usort(Losers ++ Notify));
+ %% The leader said everything was ready - do we agree? If not then
+ %% give up.
+ Down = Losers -- rabbit_node_monitor:alive_rabbit_nodes(Losers),
+ case Down of
+ [] -> [send(L, {winner_is, node()}) || L <- Losers],
+ {winner_waiting, Losers, Losers};
+ _ -> abort(Down, Losers)
+ end;
handle_msg({winner_is, Winner},
not_healing, _Partitions) ->
@@ -212,11 +212,21 @@ partition_value(Partition) ->
%% We have our local understanding of what partitions exist; but we
%% only know which nodes we have been partitioned from, not which
%% nodes are partitioned from each other.
-all_partitions(PartitionedWith) ->
+check_other_nodes(LocalPartitions) ->
Nodes = rabbit_mnesia:cluster_nodes(all),
- Partitions = [{node(), PartitionedWith} |
- rabbit_node_monitor:partitions(Nodes -- [node()])],
- all_partitions(Partitions, [Nodes]).
+ {Results, Bad} = rabbit_node_monitor:status(Nodes -- [node()]),
+ RemotePartitions = [{Node, proplists:get_value(partitions, Res)}
+ || {Node, Res} <- Results],
+ RemoteDown = [{Node, Down}
+ || {Node, Res} <- Results,
+ Down <- [Nodes -- proplists:get_value(nodes, Res)],
+ Down =/= []],
+ case {Bad, RemoteDown} of
+ {[], []} -> Partitions = [{node(), LocalPartitions} | RemotePartitions],
+ {ok, all_partitions(Partitions, [Nodes])};
+ {[], _} -> {error, {remote_down, RemoteDown}};
+ {_, _} -> {error, {nodes_down, Bad}}
+ end.
all_partitions([], Partitions) ->
Partitions;
@@ -232,25 +242,7 @@ all_partitions([{Node, CantSee} | Rest], Partitions) ->
end,
all_partitions(Rest, Partitions1).
-%% We could have received and ignored DOWN messages from some losers
-%% before becoming the winner - check for already down nodes.
-filter_already_down_losers(WantStopped, Notify) ->
- Down = WantStopped -- rabbit_node_monitor:alive_nodes(WantStopped),
- case Down of
- [] ->
- Running = rabbit_node_monitor:alive_rabbit_nodes(WantStopped),
- AlreadyStopped = WantStopped -- Running,
- case AlreadyStopped of
- [] -> ok;
- _ -> rabbit_log:info(
- "Autoheal: ~p already down~n", [AlreadyStopped])
- end,
- case Running of
- [] -> rabbit_log:info(
- "Autoheal: final node has stopped, starting...~n",[]),
- winner_finish(Notify);
- _ -> {winner_waiting, Running, Notify}
- end;
- _ ->
- abort(Down, Notify)
- end.
+fmt_error({remote_down, RemoteDown}) ->
+ rabbit_misc:format("Remote nodes disconnected:~n ~p", [RemoteDown]);
+fmt_error({nodes_down, NodesDown}) ->
+ rabbit_misc:format("Local nodes down: ~p", [NodesDown]).
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index a948115d..e6069387 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -24,7 +24,7 @@
write_cluster_status/1, read_cluster_status/0,
update_cluster_status/0, reset_cluster_status/0]).
-export([notify_node_up/0, notify_joined_cluster/0, notify_left_cluster/1]).
--export([partitions/0, partitions/1, subscribe/1]).
+-export([partitions/0, partitions/1, status/1, subscribe/1]).
-export([pause_minority_guard/0]).
%% gen_server callbacks
@@ -62,6 +62,7 @@
-spec(partitions/0 :: () -> [node()]).
-spec(partitions/1 :: ([node()]) -> [{node(), [node()]}]).
+-spec(status/1 :: ([node()]) -> {[{node(), [node()]}], [node()]}).
-spec(subscribe/1 :: (pid()) -> 'ok').
-spec(pause_minority_guard/0 :: () -> 'ok' | 'pausing').
@@ -186,6 +187,9 @@ partitions(Nodes) ->
{Replies, _} = gen_server:multi_call(Nodes, ?SERVER, partitions, infinity),
Replies.
+status(Nodes) ->
+ gen_server:multi_call(Nodes, ?SERVER, status, infinity).
+
subscribe(Pid) ->
gen_server:cast(?SERVER, {subscribe, Pid}).
@@ -252,6 +256,10 @@ init([]) ->
handle_call(partitions, _From, State = #state{partitions = Partitions}) ->
{reply, Partitions, State};
+handle_call(status, _From, State = #state{partitions = Partitions}) ->
+ {reply, [{partitions, Partitions},
+ {nodes, [node() | nodes()]}], State};
+
handle_call(_Request, _From, State) ->
{noreply, State}.
@@ -528,8 +536,16 @@ run_outside_applications(Fun) ->
%% Ensure only one such process at a time, the
%% exit(badarg) is harmless if one is already running
try register(rabbit_outside_app_process, self()) of
- true -> Fun()
- catch error:badarg -> ok
+ true ->
+ try
+ Fun()
+ catch _:E ->
+ rabbit_log:error(
+ "rabbit_outside_app_process:~n~p~n~p~n",
+ [E, erlang:get_stacktrace()])
+ end
+ catch error:badarg ->
+ ok
end
end).