diff options
author | Simon MacMullen <simon@rabbitmq.com> | 2014-06-27 14:49:09 +0100 |
---|---|---|
committer | Simon MacMullen <simon@rabbitmq.com> | 2014-06-27 14:49:09 +0100 |
commit | eff0e28206975dc2a5088e008563ffabdcb2357c (patch) | |
tree | 86c6802f385f0ae88c4ef794f282ee0b60205a17 | |
parent | 678f4713e7a725e02fa5f2727704809e8ab3e104 (diff) | |
download | rabbitmq-server-bug26225.tar.gz |
Separate out responsibilities in the various node state detection functions. Only ping_all/0 is allowed to establish new tcp connections (and thus take significant time for them to time out if necessary). This removes a significant delay while waiting for pause_minority to start.bug26225
-rw-r--r-- | src/rabbit_autoheal.erl | 1 | ||||
-rw-r--r-- | src/rabbit_node_monitor.erl | 26 |
2 files changed, 19 insertions, 8 deletions
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 826bfc45..c5237d34 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -118,6 +118,7 @@ node_down(Node, _State) -> handle_msg({request_start, Node}, not_healing, Partitions) -> rabbit_log:info("Autoheal request received from ~p~n", [Node]), + rabbit_node_monitor:ping_all(), case rabbit_node_monitor:all_rabbit_nodes_up() of false -> not_healing; true -> AllPartitions = all_partitions(Partitions), diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl index 88c7cc2c..22b0c280 100644 --- a/src/rabbit_node_monitor.erl +++ b/src/rabbit_node_monitor.erl @@ -31,7 +31,7 @@ code_change/3]). %% Utils --export([all_rabbit_nodes_up/0, run_outside_applications/1]). +-export([all_rabbit_nodes_up/0, run_outside_applications/1, ping_all/0]). -define(SERVER, ?MODULE). -define(RABBIT_UP_RPC_TIMEOUT, 2000). @@ -63,6 +63,7 @@ -spec(all_rabbit_nodes_up/0 :: () -> boolean()). -spec(run_outside_applications/1 :: (fun (() -> any())) -> pid()). +-spec(ping_all/0 :: () -> 'ok'). -endif. @@ -301,12 +302,11 @@ handle_info(ping_nodes, State) -> %% to ping the nodes that are up, after all. State1 = State#state{down_ping_timer = undefined}, Self = self(), - %% all_nodes_up() both pings all the nodes and tells us if we need to again. - %% %% We ping in a separate process since in a partition it might %% take some noticeable length of time and we don't want to block %% the node monitor for that long. spawn_link(fun () -> + ping_all(), case all_nodes_up() of true -> ok; false -> Self ! ping_again @@ -361,11 +361,10 @@ handle_dead_node(Node, State = #state{autoheal = Autoheal}) -> await_cluster_recovery() -> rabbit_log:warning("Cluster minority status detected - awaiting recovery~n", []), - Nodes = rabbit_mnesia:cluster_nodes(all), run_outside_applications(fun () -> rabbit_networking:killall(), rabbit:stop(), - wait_for_cluster_recovery(Nodes) + wait_for_cluster_recovery() end), ok. @@ -382,11 +381,12 @@ run_outside_applications(Fun) -> end end). -wait_for_cluster_recovery(Nodes) -> +wait_for_cluster_recovery() -> + ping_all(), case majority() of true -> rabbit:start(); false -> timer:sleep(?RABBIT_DOWN_PING_INTERVAL), - wait_for_cluster_recovery(Nodes) + wait_for_cluster_recovery() end. handle_dead_rabbit(Node, State = #state{partitions = Partitions, @@ -454,6 +454,11 @@ del_node(Node, Nodes) -> Nodes -- [Node]. %% functions here. "rabbit" in a function's name implies we test if %% the rabbit application is up, not just the node. +%% As we use these functions to decide what to do in pause_minority +%% state, they *must* be fast, even in the case where TCP connections +%% are timing out. So that means we should be careful about whether we +%% connect to nodes which are currently disconnected. + majority() -> Nodes = rabbit_mnesia:cluster_nodes(all), length(alive_nodes(Nodes)) / length(Nodes) > 0.5. @@ -466,9 +471,14 @@ all_rabbit_nodes_up() -> Nodes = rabbit_mnesia:cluster_nodes(all), length(alive_rabbit_nodes(Nodes)) =:= length(Nodes). -alive_nodes(Nodes) -> [N || N <- Nodes, pong =:= net_adm:ping(N)]. +alive_nodes(Nodes) -> [N || N <- Nodes, lists:member(N, [node()|nodes()])]. alive_rabbit_nodes() -> alive_rabbit_nodes(rabbit_mnesia:cluster_nodes(all)). alive_rabbit_nodes(Nodes) -> [N || N <- alive_nodes(Nodes), rabbit:is_running(N)]. + +%% This one is allowed to connect! +ping_all() -> + [net_adm:ping(N) || N <- rabbit_mnesia:cluster_nodes(all)], + ok. |