Autoheal: Make sure Mnesia is stopped on all losers before they restart

This works around a race in Mnesia where a starting loser would hang forever. This happens when a starting loser connects to another loser, negotiates the Mnesia protocol and attempts to acquire a write lock on the other node's schema. If the other node stops right between the protocol negotiation and the lock request, the starting node never receives an answer to its request. Before this fix, the hang occurred after at most 30 minutes of looping on the partitions:autoheal test in rabbitmq-test. With the fix, RabbitMQ survived an all-night-long run.
author: Jean-Sebastien Pedron <jean-sebastien@rabbitmq.com> 2014-12-10 10:55:12 +0100
committer: Jean-Sebastien Pedron <jean-sebastien@rabbitmq.com> 2014-12-10 10:55:12 +0100
commit: 2a44901be5e4a70dad2523555996c5f552a9fbf7 (patch)
tree: 142ad75a83e56ac586d47d84521516ff081126f6
parent: 1749f8c8e0c65a4c09df720ade033706c6d37468 (diff)
download: rabbitmq-server-2a44901be5e4a70dad2523555996c5f552a9fbf7.tar.gz
1 files changed, 29 insertions, 0 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index 90458741..a4ec86bf 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -21,6 +21,8 @@
 %% The named process we are running in.
 -define(SERVER, rabbit_node_monitor).
 
+-define(MNESIA_STOPPED_PING_INTERNAL, 200).
+
 %%----------------------------------------------------------------------------
 
 %% In order to autoheal we want to:
@@ -194,9 +196,36 @@ abort(Down, Notify) ->
     winner_finish(Notify).
 
 winner_finish(Notify) ->
+    %% There is a race in Mnesia causing a starting loser to hang
+    %% forever if another loser stops at the same time: the starting
+    %% node connects to the other node, negotiates the protocol and
+    %% attempts to acquire a write lock on the schema on the other node.
+    %% If the other node stops between the protocol negotiation and lock
+    %% request, the starting node never gets an answer to its lock
+    %% request.
+    %%
+    %% To work around the problem, we make sure Mnesia is stopped on all
+    %% losing nodes before sending the "autoheal_safe_to_start" signal.
+    wait_for_mnesia_shutdown(Notify),
     [{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify],
     not_healing.
 
+wait_for_mnesia_shutdown([Node | Rest] = AllNodes) ->
+    case rpc:call(Node, mnesia, system_info, [is_running]) of
+        no ->
+            wait_for_mnesia_shutdown(Rest);
+        Running when
+        Running =:= yes orelse
+        Running =:= starting orelse
+        Running =:= stopping ->
+            timer:sleep(?MNESIA_STOPPED_PING_INTERNAL),
+            wait_for_mnesia_shutdown(AllNodes);
+        _ ->
+            wait_for_mnesia_shutdown(Rest)
+    end;
+wait_for_mnesia_shutdown([]) ->
+    ok.
+
 make_decision(AllPartitions) ->
     Sorted = lists:sort([{partition_value(P), P} || P <- AllPartitions]),
     [[Winner | _] | Rest] = lists:reverse([P || {_, P} <- Sorted]),
author	Jean-Sebastien Pedron <jean-sebastien@rabbitmq.com>	2014-12-10 10:55:12 +0100
committer	Jean-Sebastien Pedron <jean-sebastien@rabbitmq.com>	2014-12-10 10:55:12 +0100
commit	2a44901be5e4a70dad2523555996c5f552a9fbf7 (patch)
tree	142ad75a83e56ac586d47d84521516ff081126f6
parent	1749f8c8e0c65a4c09df720ade033706c6d37468 (diff)
download	rabbitmq-server-2a44901be5e4a70dad2523555996c5f552a9fbf7.tar.gz