diff options
author | Simon MacMullen <simon@rabbitmq.com> | 2014-10-14 15:16:34 +0100 |
---|---|---|
committer | Simon MacMullen <simon@rabbitmq.com> | 2014-10-14 15:16:34 +0100 |
commit | 6178f678ef85a7a94bc00368d36343163703dea1 (patch) | |
tree | 0ab928780eae02065aa5fe72607ee031b257b345 /src/rabbit_autoheal.erl | |
parent | 110d418c2f7bcf8935a3e8319d032c3355a3767c (diff) | |
download | rabbitmq-server-6178f678ef85a7a94bc00368d36343163703dea1.tar.gz |
Switch to having the winner inform the losers that they need to stop, rather than having the leader do it. This fixes the race where the leader tells them to stop before the partition has healed from the winner's POV. So it should be simpler and more correct.
Diffstat (limited to 'src/rabbit_autoheal.erl')
-rw-r--r-- | src/rabbit_autoheal.erl | 33 |
1 files changed, 8 insertions, 25 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 5a6cd48f..259e6ec2 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -127,7 +127,6 @@ handle_msg({request_start, Node}, " * Winner: ~p~n" " * Losers: ~p~n", [AllPartitions, Winner, Losers]), - [send(L, {winner_is, Winner}) || L <- Losers], Continue = fun(Msg) -> handle_msg(Msg, not_healing, Partitions) end, @@ -153,7 +152,14 @@ handle_msg({become_winner, Losers}, not_healing, _Partitions) -> rabbit_log:info("Autoheal: I am the winner, waiting for ~p to stop~n", [Losers]), - filter_already_down_losers(Losers, Losers); + %% The leader said everything was ready - do we agree? If not then + %% give up. + Down = Losers -- rabbit_node_monitor:alive_rabbit_nodes(Losers), + case Down of + [] -> [send(L, {winner_is, node()}) || L <- Losers], + {winner_waiting, Losers, Losers}; + _ -> abort(Down, Losers) + end; handle_msg({winner_is, Winner}, not_healing, _Partitions) -> @@ -224,26 +230,3 @@ all_partitions([{Node, CantSee} | Rest], Partitions) -> _ -> [A, B | Others] end, all_partitions(Rest, Partitions1). - -%% We could have received and ignored DOWN messages from some losers -%% before becoming the winner - check for already down nodes. -filter_already_down_losers(WantStopped, Notify) -> - Down = WantStopped -- rabbit_node_monitor:alive_nodes(WantStopped), - case Down of - [] -> - Running = rabbit_node_monitor:alive_rabbit_nodes(WantStopped), - AlreadyStopped = WantStopped -- Running, - case AlreadyStopped of - [] -> ok; - _ -> rabbit_log:info( - "Autoheal: ~p already down~n", [AlreadyStopped]) - end, - case Running of - [] -> rabbit_log:info( - "Autoheal: final node has stopped, starting...~n",[]), - winner_finish(Notify); - _ -> {winner_waiting, Running, Notify} - end; - _ -> - abort(Down, Notify) - end. |