diff options
author | Jean-Sebastien Pedron <jean-sebastien@rabbitmq.com> | 2014-12-10 10:55:12 +0100 |
---|---|---|
committer | Jean-Sebastien Pedron <jean-sebastien@rabbitmq.com> | 2014-12-10 10:55:12 +0100 |
commit | 2a44901be5e4a70dad2523555996c5f552a9fbf7 (patch) | |
tree | 142ad75a83e56ac586d47d84521516ff081126f6 | |
parent | 1749f8c8e0c65a4c09df720ade033706c6d37468 (diff) | |
download | rabbitmq-server-2a44901be5e4a70dad2523555996c5f552a9fbf7.tar.gz |
Autoheal: Make sure Mnesia is stopped on all losers before they restart
This works around a race in Mnesia where a starting loser would hang
forever. This happens when a starting loser connects to another loser,
negotiates the Mnesia protocol and attempts to acquire a write lock on
the other node's schema. If the other node stops right between the
protocol negotiation and the lock request, the starting node never
receives an answer to its request.
Before this fix, the hang occurred after at most 30 minutes looping on
the partitions:autoheal test in rabbitmq-test. With the fix, RabbitMQ
survived an all night long run.
-rw-r--r-- | src/rabbit_autoheal.erl | 29 |
1 files changed, 29 insertions, 0 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 90458741..a4ec86bf 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -21,6 +21,8 @@ %% The named process we are running in. -define(SERVER, rabbit_node_monitor). +-define(MNESIA_STOPPED_PING_INTERNAL, 200). + %%---------------------------------------------------------------------------- %% In order to autoheal we want to: @@ -194,9 +196,36 @@ abort(Down, Notify) -> winner_finish(Notify). winner_finish(Notify) -> + %% There is a race in Mnesia causing a starting loser to hang + %% forever if another loser stops at the same time: the starting + %% node connects to the other node, negotiates the protocol and + %% attempts to acquire a write lock on the schema on the other node. + %% If the other node stops between the protocol negotiation and lock + %% request, the starting node never gets an answer to its lock + %% request. + %% + %% To work around the problem, we make sure Mnesia is stopped on all + %% losing nodes before sending the "autoheal_safe_to_start" signal. + wait_for_mnesia_shutdown(Notify), [{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify], not_healing. +wait_for_mnesia_shutdown([Node | Rest] = AllNodes) -> + case rpc:call(Node, mnesia, system_info, [is_running]) of + no -> + wait_for_mnesia_shutdown(Rest); + Running when + Running =:= yes orelse + Running =:= starting orelse + Running =:= stopping -> + timer:sleep(?MNESIA_STOPPED_PING_INTERNAL), + wait_for_mnesia_shutdown(AllNodes); + _ -> + wait_for_mnesia_shutdown(Rest) + end; +wait_for_mnesia_shutdown([]) -> + ok. + make_decision(AllPartitions) -> Sorted = lists:sort([{partition_value(P), P} || P <- AllPartitions]), [[Winner | _] | Rest] = lists:reverse([P || {_, P} <- Sorted]), |