summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Michael Klishin <mklishin@pivotal.io> 2020-01-13 20:42:21 +0300
committer GitHub <noreply@github.com> 2020-01-13 20:42:21 +0300
commit 5576cbc781dc1e2eeb1d1d3773ad6d60daacdec7 (patch)
tree 6d3b1b732b3254d7532e0d75766968a25ad04509
parent 5d32f7edc9afbc0f397e37b25e482071cee00c70 (diff)
parent 8ff98742d0d8ed5b64233d1bb9cf63b5a0c0d0e4 (diff)
download rabbitmq-server-git-5576cbc781dc1e2eeb1d1d3773ad6d60daacdec7.tar.gz
Merge pull request #2209 from rabbitmq/tomyouyou-tomyouyou-autoheal-l
Autoheal: the "autoheal_safe_to_start" state transition is not guaranteed to arrive on time
-rw-r--r-- Makefile 1
-rw-r--r-- src/rabbit_autoheal.erl 6
-rw-r--r-- test/partitions_SUITE.erl 5
3 files changed, 10 insertions, 2 deletions
diff --git a/Makefile b/Makefile
index b0b6ecb3f6..420d8b32f0 100644
--- a/Makefile
+++ b/Makefile
@@ -56,6 +56,7 @@ define PROJECT_ENV
{reverse_dns_lookups, false},
{cluster_partition_handling, ignore},
{cluster_keepalive_interval, 10000},
+ {autoheal_state_transition_timeout, 60000},
{tcp_listen_options, [{backlog, 128},
{nodelay, true},
{linger, {true, 0}},
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
index 77165fc26c..316c8c89cb 100644
--- a/src/rabbit_autoheal.erl
+++ b/src/rabbit_autoheal.erl
@@ -372,6 +372,7 @@ wait_for_supervisors(Monitors) ->
restart_loser(State, Winner) ->
rabbit_log:warning(
"Autoheal: we were selected to restart; winner is ~p~n", [Winner]),
+ NextStateTimeout = application:get_env(rabbit, autoheal_state_transition_timeout, 60000),
rabbit_node_monitor:run_outside_applications(
fun () ->
MRef = erlang:monitor(process, {?SERVER, Winner}),
@@ -381,6 +382,11 @@ restart_loser(State, Winner) ->
not_healing;
autoheal_safe_to_start ->
State
+ after NextStateTimeout ->
+ rabbit_log:warning(
+ "Autoheal: timed out waiting for a safe-to-start message from the winner (~p); will retry",
+ [Winner]),
+ not_healing
end,
erlang:demonitor(MRef, [flush]),
%% During the restart, the autoheal state is lost so we
diff --git a/test/partitions_SUITE.erl b/test/partitions_SUITE.erl
index 12a43b9fa6..1c7151d209 100644
--- a/test/partitions_SUITE.erl
+++ b/test/partitions_SUITE.erl
@@ -21,7 +21,7 @@
-compile(export_all).
-%% We set ticktime to 1s and setuptime is 7s so to make sure it
+%% We set ticktime to 1s and setup time is 7s so to make sure it
%% passes...
-define(DELAY, 8000).
@@ -119,7 +119,7 @@ end_per_testcase(Testcase, Config) ->
rabbit_ct_helpers:testcase_finished(Config1, Testcase).
%% -------------------------------------------------------------------
-%% Testcases.
+%% Test cases.
%% -------------------------------------------------------------------
ignore(Config) ->
@@ -400,6 +400,7 @@ partial_pause_minority(Config) ->
ok.
partial_pause_if_all_down(Config) ->
+ rabbit_ct_broker_helpers:rpc_all(Config, application, set_env, [rabbit, autoheal_state_transition_timeout, 3000]),
[A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
set_mode(Config, {pause_if_all_down, [B], ignore}),
block([{A, B}]),