diff options
author | Aliaksey Artamonau <aliaksiej.artamonau@gmail.com> | 2017-05-27 19:06:18 -0700 |
---|---|---|
committer | Aliaksey Artamonau <aliaksiej.artamonau@gmail.com> | 2017-05-27 19:54:01 -0700 |
commit | f8c6a2da5ed5c34c51c6e7c97752cb56574549c2 (patch) | |
tree | ff5981b971c7cf9a737c3bc48077be0a7200f691 /deps/rabbit_common/src/supervisor2.erl | |
parent | 3117d68ce3ef83956c187eabbf417667b0969ab3 (diff) | |
download | rabbitmq-server-git-f8c6a2da5ed5c34c51c6e7c97752cb56574549c2.tar.gz |
[supervisor2] Mark child restarting when delaying restart.
Otherwise the child might not be restarted on failure under certain
circumstances (see the code to reproduce the issue below).
Here's the sequence of events leading to the issue.
1. A child crashes multiple times forcing the supervisor2 to
delay-restart it.
2. Upon scheduling the delayed restart, the supervisor calls
state_del_child/2 which sets child's pid to undefined.
3. After the delay, on restart attempt the child crashes before
replying to the supervisor.
4. The supervisor schedules another restart attempt via
try_again_restart/3. Before that restart/3 would try to mark the
child as being restarted by means of calling restarting/1 on
child's pid. But since the pid is undefined, the function keeps it
unchanged.
5. When the supervisor handles the try_again_restart cast, it expects
the process to be in restarting state. Since it's not the case, it
just ignores the message.
6. As a result, the child doesn't get restarted.
This patch addresses the issue by marking the child as being restarted
in step 2. The only visible behavior change for the user is that when
the child is scheduled to be restarted, supervisor2:which_children
would return restarting instead of undefined for the child. This seems
appropriate and nobody should have relied on the old behavior.
The code to reproduce the issue:
-module(reproducer).
-behavior(supervisor2).
-export([start/0, init/1, child/0]).
start_link() ->
supervisor2:start_link({local, parent}, ?MODULE, []).
init([]) ->
{ok, {{one_for_one, 1, 10},
[{child, {proc_lib, start_link, [?MODULE, child, []]},
{permanent, 2}, brutal_kill, worker, []}]}}.
child() ->
io:format("starting child~n"),
case application:get_env(kernel, fail_in_init) of
undefined ->
ok;
_ ->
io:format("crashing in init~n"),
exit(die)
end,
proc_lib:init_ack({ok, self()}),
exit(die).
start() ->
{ok, _Sup} = start_link(),
timer:sleep(500),
application:set_env(kernel, fail_in_init, true),
timer:sleep(10000),
io:format("done~n"),
init:stop().
Output without the patch:
$ erl -noinput -run reproducer
starting child
starting child
starting child
crashing in init
done
Output with the patch:
$ erl -noinput -run reproducer
starting child
starting child
starting child
crashing in init
starting child
crashing in init
starting child
crashing in init
starting child
crashing in init
starting child
crashing in init
done
Diffstat (limited to 'deps/rabbit_common/src/supervisor2.erl')
-rw-r--r-- | deps/rabbit_common/src/supervisor2.erl | 3 |
1 files changed, 2 insertions, 1 deletions
diff --git a/deps/rabbit_common/src/supervisor2.erl b/deps/rabbit_common/src/supervisor2.erl index 8cddee3b74..f3d58e2895 100644 --- a/deps/rabbit_common/src/supervisor2.erl +++ b/deps/rabbit_common/src/supervisor2.erl @@ -878,7 +878,8 @@ do_restart_delay({RestartType, Delay}, Reason, Child, State) -> _TRef = erlang:send_after(trunc(Delay*1000), self(), {delayed_restart, {{RestartType, Delay}, Reason, Child}}), - {ok, state_del_child(Child, State)} + OldPid = Child#child.pid, + {ok, replace_child(Child#child{pid=restarting(OldPid)}, State)} end. restart(Child, State) -> |