summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon MacMullen <simon@rabbitmq.com>2014-10-08 15:41:31 +0100
committerSimon MacMullen <simon@rabbitmq.com>2014-10-08 15:41:31 +0100
commit264fd053a141db698a6b078f3fdf2e65378424c9 (patch)
tree2a9eb73cdebfd9bae6fe9f11fe34f68c6ca043a0
parent26fb4517fe9fdb5918b9f5c2665663a7f89be5e7 (diff)
downloadrabbitmq-server-264fd053a141db698a6b078f3fdf2e65378424c9.tar.gz
Defend against partitions at the wrong time causing badness.
-rw-r--r--src/rabbit_mirror_queue_sync.erl24
1 files changed, 18 insertions, 6 deletions
diff --git a/src/rabbit_mirror_queue_sync.erl b/src/rabbit_mirror_queue_sync.erl
index e3fae4c0..d1ef5f30 100644
--- a/src/rabbit_mirror_queue_sync.erl
+++ b/src/rabbit_mirror_queue_sync.erl
@@ -156,18 +156,30 @@ syncer(Ref, Log, MPid, SPids) ->
%% We wait for a reply from the slaves so that we know they are in
%% a receive block and will thus receive messages we send to them
%% *without* those messages ending up in their gen_server2 pqueue.
- case [SPid || SPid <- SPids,
- receive
- {sync_ready, Ref, SPid} -> true;
- {sync_deny, Ref, SPid} -> false;
- {'DOWN', _, process, SPid, _} -> false
- end] of
+ case await_slaves(Ref, SPids) of
[] -> Log("all slaves already synced", []);
SPids1 -> MPid ! {ready, self()},
Log("mirrors ~p to sync", [[node(SPid) || SPid <- SPids1]]),
syncer_loop(Ref, MPid, SPids1)
end.
+await_slaves(Ref, SPids) ->
+ Nodes = rabbit_mnesia:cluster_nodes(running),
+ [SPid || SPid <- SPids,
+ lists:member(node(SPid), Nodes) andalso %% [0]
+ receive
+ {sync_ready, Ref, SPid} -> true;
+ {sync_deny, Ref, SPid} -> false;
+ {'DOWN', _, process, SPid, _} -> false
+ end].
+%% [0] This check is in case there's been a partition which has then
+%% healed in between the master retrieving the slave pids from Mnesia
+%% and sending 'sync_start' over GM. If so there might be slaves on the
+%% other side of the partition which we can monitor (since they have
+%% rejoined the distributed system with us) but which did not get the
+%% 'sync_start' and so will not reply. We need to act as though they are
+%% down.
+
syncer_loop(Ref, MPid, SPids) ->
MPid ! {next, Ref},
receive