summaryrefslogtreecommitdiff
path: root/deps/rabbit_common/src/supervisor2.erl
diff options
context:
space:
mode:
authorAliaksey Artamonau <aliaksiej.artamonau@gmail.com>2017-05-27 19:06:18 -0700
committerAliaksey Artamonau <aliaksiej.artamonau@gmail.com>2017-05-27 19:54:01 -0700
commitf8c6a2da5ed5c34c51c6e7c97752cb56574549c2 (patch)
treeff5981b971c7cf9a737c3bc48077be0a7200f691 /deps/rabbit_common/src/supervisor2.erl
parent3117d68ce3ef83956c187eabbf417667b0969ab3 (diff)
downloadrabbitmq-server-git-f8c6a2da5ed5c34c51c6e7c97752cb56574549c2.tar.gz
[supervisor2] Mark child restarting when delaying restart.
Otherwise the child might not be restarted on failure under certain circumstances (see the code to reproduce the issue below). Here's the sequence of events leading to the issue. 1. A child crashes multiple times forcing the supervisor2 to delay-restart it. 2. Upon scheduling the delayed restart, the supervisor calls state_del_child/2 which sets child's pid to undefined. 3. After the delay, on restart attempt the child crashes before replying to the supervisor. 4. The supervisor schedules another restart attempt via try_again_restart/3. Before that restart/3 would try to mark the child as being restarted by means of calling restarting/1 on child's pid. But since the pid is undefined, the function keeps it unchanged. 5. When the supervisor handles the try_again_restart cast, it expects the process to be in restarting state. Since it's not the case, it just ignores the message. 6. As a result, the child doesn't get restarted. This patch addresses the issue by marking the child as being restarted in step 2. The only visible behavior change for the user is that when the child is scheduled to be restarted, supervisor2:which_children would return restarting instead of undefined for the child. This seems appropriate and nobody should have relied on the old behavior. The code to reproduce the issue: -module(reproducer). -behavior(supervisor2). -export([start/0, init/1, child/0]). start_link() -> supervisor2:start_link({local, parent}, ?MODULE, []). init([]) -> {ok, {{one_for_one, 1, 10}, [{child, {proc_lib, start_link, [?MODULE, child, []]}, {permanent, 2}, brutal_kill, worker, []}]}}. child() -> io:format("starting child~n"), case application:get_env(kernel, fail_in_init) of undefined -> ok; _ -> io:format("crashing in init~n"), exit(die) end, proc_lib:init_ack({ok, self()}), exit(die). start() -> {ok, _Sup} = start_link(), timer:sleep(500), application:set_env(kernel, fail_in_init, true), timer:sleep(10000), io:format("done~n"), init:stop(). Output without the patch: $ erl -noinput -run reproducer starting child starting child starting child crashing in init done Output with the patch: $ erl -noinput -run reproducer starting child starting child starting child crashing in init starting child crashing in init starting child crashing in init starting child crashing in init starting child crashing in init done
Diffstat (limited to 'deps/rabbit_common/src/supervisor2.erl')
-rw-r--r--deps/rabbit_common/src/supervisor2.erl3
1 files changed, 2 insertions, 1 deletions
diff --git a/deps/rabbit_common/src/supervisor2.erl b/deps/rabbit_common/src/supervisor2.erl
index 8cddee3b74..f3d58e2895 100644
--- a/deps/rabbit_common/src/supervisor2.erl
+++ b/deps/rabbit_common/src/supervisor2.erl
@@ -878,7 +878,8 @@ do_restart_delay({RestartType, Delay}, Reason, Child, State) ->
_TRef = erlang:send_after(trunc(Delay*1000), self(),
{delayed_restart,
{{RestartType, Delay}, Reason, Child}}),
- {ok, state_del_child(Child, State)}
+ OldPid = Child#child.pid,
+ {ok, replace_child(Child#child{pid=restarting(OldPid)}, State)}
end.
restart(Child, State) ->