summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean-Sebastien Pedron <jean-sebastien@rabbitmq.com>2014-12-24 15:16:27 +0100
committerJean-Sebastien Pedron <jean-sebastien@rabbitmq.com>2014-12-24 15:16:27 +0100
commitf9fbda2cd963704af13708b0b2f320fbdc7f398f (patch)
treeae9ec17ce26a740078eac0e55b9f41884843cae5
parent35d332b0d2f723fadc86c86c98d31db5dfac05d0 (diff)
downloadrabbitmq-server-bug26465.tar.gz
How to recover from partitioning after 'pause_if_all_down' is configurablebug26465
Now that 'pause_if_all_down' accepts a list of preferred nodes, it is possible that these nodes are spread across multiple partitions. For example, suppose we have nodes A and B in datacenter #1 and nodes C and D in datacenter #2, and we set {pause_if_all_down, [A, C]}. If the link between both datacenters is lost, A/B and C/D form two partitions. RabbitMQ continues to run at both sites because all nodes see at least one node from the preferred nodes list. When the link comes back, we need to handle the recovery. Therefore, a user can specify the strategy: o {pause_if_all_down, [...], ignore} (default) o {pause_if_all_down, [...], autoheal} This third parameter is mandatory. If the strategy is 'ignore', RabbitMQ is started again on paused nodes, as soon as they see another node from the preferred nodes list. This is the default behaviour. If the strategy is 'autoheal', RabbitMQ is started again, like in 'ignore' mode, but when all nodes are up, autohealing kicks in as well. Compared to plain 'autoheal' mode, the chance of losing data is low because paused nodes never drifted away from the cluster. When they start again, they join the cluster and resume operations as any starting node.
-rw-r--r--src/rabbit_autoheal.erl9
-rw-r--r--src/rabbit_node_monitor.erl12
2 files changed, 15 insertions, 6 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index 90458741..8b292bb5 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -16,7 +16,8 @@
-module(rabbit_autoheal).
--export([init/0, maybe_start/1, rabbit_down/2, node_down/2, handle_msg/3]).
+-export([init/0, enabled/0, maybe_start/1, rabbit_down/2, node_down/2,
+ handle_msg/3]).
%% The named process we are running in.
-define(SERVER, rabbit_node_monitor).
@@ -80,7 +81,11 @@ maybe_start(State) ->
State.
enabled() ->
- {ok, autoheal} =:= application:get_env(rabbit, cluster_partition_handling).
+ case application:get_env(rabbit, cluster_partition_handling) of
+ {ok, autoheal} -> true;
+ {ok, {pause_if_all_down, _, autoheal}} -> true;
+ _ -> false
+ end.
%% This is the winner receiving its last notification that a node has
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index 6845ada4..fedbfd78 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -221,7 +221,7 @@ pause_partition_guard() ->
case M of
pause_minority ->
pause_minority_guard([]);
- {pause_if_all_down, PreferredNodes} ->
+ {pause_if_all_down, PreferredNodes, _} ->
case verify_pause_if_all_down_list(PreferredNodes) of
[] -> put(pause_partition_guard, not_pause_mode),
ok;
@@ -562,7 +562,7 @@ handle_dead_node(Node, State = #state{autoheal = Autoheal}) ->
false -> await_cluster_recovery(fun majority/0)
end,
State;
- {ok, {pause_if_all_down, PreferredNodes}} ->
+ {ok, {pause_if_all_down, PreferredNodes, HowToRecover}} ->
case verify_pause_if_all_down_list(PreferredNodes) of
[] -> ok;
Nodes -> case in_preferred_partition(Nodes) of
@@ -571,7 +571,11 @@ handle_dead_node(Node, State = #state{autoheal = Autoheal}) ->
fun in_preferred_partition/0)
end
end,
- State;
+ case HowToRecover of
+ autoheal -> State#state{autoheal =
+ rabbit_autoheal:node_down(Node, Autoheal)};
+ _ -> State
+ end;
{ok, ignore} ->
State;
{ok, autoheal} ->
@@ -747,7 +751,7 @@ majority() ->
length(alive_nodes(Nodes)) / length(Nodes) > 0.5.
in_preferred_partition() ->
- {ok, {pause_if_all_down, PreferredNodes}} =
+ {ok, {pause_if_all_down, PreferredNodes, _}} =
application:get_env(rabbit, cluster_partition_handling),
in_preferred_partition(PreferredNodes).