summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/rabbit_autoheal.erl29
1 files changed, 29 insertions, 0 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index 90458741..b5d64992 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -21,6 +21,8 @@
%% The named process we are running in.
-define(SERVER, rabbit_node_monitor).
+-define(MNESIA_STOPPED_PING_INTERNAL, 200).
+
%%----------------------------------------------------------------------------
%% In order to autoheal we want to:
@@ -194,9 +196,36 @@ abort(Down, Notify) ->
winner_finish(Notify).
winner_finish(Notify) ->
+ %% There is a race in Mnesia causing a starting loser to hang
+ %% forever if another loser stops at the same time: the starting
+ %% node connects to the other node, negotiates the protocol and
+ %% attempts to acquire a write lock on the schema on the other node.
+ %% If the other node stops between the protocol negotiation and lock
+ %% request, the starting node never gets an answer to its lock
+ %% request.
+ %%
+ %% To work around the problem, we make sure Mnesia is stopped on all
+ %% losing nodes before sending the "autoheal_safe_to_start" signal.
+ wait_for_mnesia_shutdown(Notify),
[{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify],
not_healing.
+wait_for_mnesia_shutdown([Node | Rest] = AllNodes) ->
+ case rpc:call(Node, mnesia, system_info, [is_running]) of
+ no ->
+ wait_for_mnesia_shutdown(Rest);
+ Running when
+ Running =:= yes orelse
+ Running =:= starting orelse
+ Running =:= stopping ->
+ timer:sleep(?MNESIA_STOPPED_PING_INTERNAL),
+ wait_for_mnesia_shutdown(AllNodes);
+ _ ->
+ wait_for_mnesia_shutdown(Rest)
+ end;
+wait_for_mnesia_shutdown([]) ->
+ ok.
+
make_decision(AllPartitions) ->
Sorted = lists:sort([{partition_value(P), P} || P <- AllPartitions]),
[[Winner | _] | Rest] = lists:reverse([P || {_, P} <- Sorted]),