diff options
author | Rickard Green <rickard@erlang.org> | 2022-02-09 17:03:55 +0100 |
---|---|---|
committer | Rickard Green <rickard@erlang.org> | 2022-02-22 07:13:49 +0100 |
commit | 54437747c93b6512375574b17ce0ac60baf7fdfb (patch) | |
tree | c68f896f3129bfd008813eef15a0a9e14c7ea683 | |
parent | 6d9aa305fbf128ae70209cc2dae8db2113b41d00 (diff) | |
download | erlang-54437747c93b6512375574b17ce0ac60baf7fdfb.tar.gz |
Adjust shutdown in peer:stop()
* Default to the new shutdown type 'halt' which performs erlang:halt()
and waits for dist connection to go down.
* Graceful shutdown using init:stop() also waits for dist connection to go
down.
* Old shutdown type 'brutal_kill' renamed to 'close', since it just close
connection used for supervision and then return.
-rw-r--r-- | lib/stdlib/doc/src/peer.xml | 79 | ||||
-rw-r--r-- | lib/stdlib/src/peer.erl | 146 | ||||
-rw-r--r-- | lib/stdlib/test/peer_SUITE.erl | 77 |
3 files changed, 279 insertions, 23 deletions
diff --git a/lib/stdlib/doc/src/peer.xml b/lib/stdlib/doc/src/peer.xml index a5b513d810..6ac2c6b278 100644 --- a/lib/stdlib/doc/src/peer.xml +++ b/lib/stdlib/doc/src/peer.xml @@ -379,6 +379,11 @@ is specified instead of a timeout, the peer will send <c>Tag</c> to the requested process.</p></desc> </datatype> + <datatype> + <name name="disconnect_timeout"/> + <desc><p>Disconnect timeout. See + <seemfa marker="#stop/1"><c>stop()</c></seemfa>.</p></desc> + </datatype> </datatypes> <funcs> @@ -525,12 +530,76 @@ <func> <name name="stop" arity="1" since="OTP @OTP-17720@"/> <fsummary>Stop controlling process and terminate peer node.</fsummary> + <type name="disconnect_timeout"/> <desc> - <p>Depending on the <c>shutdown</c> option used to start the peer, - stopping may be graceful (via <c>init:stop()</c>) or less so, - by just closing the control connection, which triggers <c>erlang:halt()</c> - executed by the peer.</p> - <p>A graceful shutdown can be slower, or even block indefinitely.</p> + <p> + Stops a peer node. How the node is stopped depends on the + <seetype marker="#start_options"><c>shutdown</c></seetype> + option passed when starting the peer node. Currently the + following <c>shutdown</c> options are supported: + </p> + <taglist> + <tag><c>halt</c></tag> + <item><p> + This is the default shutdown behavior. It behaves as <c>shutdown</c> + option <c>{halt, DefaultTimeout}</c> where <c>DefaultTimeout</c> + currently equals <c>5000</c>. + </p></item> + <tag><c>{halt, Timeout :: disconnect_timeout()}</c></tag> + <item><p> + Triggers a call to + <seemfa marker="erts:erlang#halt/0"><c>erlang:halt()</c></seemfa> + on the peer node and then waits for the Erlang distribution + connection to the peer node to be taken down. If this connection + has not been taken down after <c>Timeout</c> milliseconds, it will + forcefully be taken down by <c>peer:stop/1</c>. See the + <seeerl marker="#dist_connection_close">warning</seeerl> below for + more info about this. + </p></item> + <tag><c>Timeout :: disconnect_timeout()</c></tag> + <item><p> + Triggers a call to + <seemfa marker="erts:init#stop/0"><c>init:stop()</c></seemfa> + on the peer node and then waits for the Erlang distribution + connection to the peer node to be taken down. If this connection + has not been taken down after <c>Timeout</c> milliseconds, it will + forcefully be taken down by <c>peer:stop/1</c>. See the + <seeerl marker="#dist_connection_close">warning</seeerl> below for + more info about this. + </p></item> + <tag><c>close</c></tag> + <item><p> + Close the <i>control connection</i> to the peer node and + return. This is the fastest way for the caller of + <c>peer:stop/1</c> to stop a peer node. + </p> + <p> + Note that if the Erlang distribution connection is not used as + control connection it might not have been taken down when + <c>peer:stop/1</c> returns. Also note that the + <seeerl marker="#dist_connection_close">warning</seeerl> below + applies when the Erlang distribution connection is used as control + connection. + </p> + </item> + </taglist> + + <marker id="dist_connection_close"/> + <warning> + <p> + In the cases where the Erlang distribution connection is taken + down by <c>peer:stop/1</c>, other code independent of the peer + code might react to the connection loss before the peer node is + stopped which might cause undesirable effects. For example, + <seeerl marker="kernel:global#prevent_overlapping_partitions"><c>global</c></seeerl> + might trigger even more Erlang distribution connections to other + nodes to be taken down. The potential undesirable effects are, + however, not limited to this. It is hard to say what the effects + will be since these effects can be caused by any code with links + or monitors to something on the origin node, or code monitoring + the connection to the origin node. + </p> + </warning> </desc> </func> diff --git a/lib/stdlib/src/peer.erl b/lib/stdlib/src/peer.erl index b0af4cd830..1c04b03b10 100644 --- a/lib/stdlib/src/peer.erl +++ b/lib/stdlib/src/peer.erl @@ -112,8 +112,10 @@ args => [string()], %% additional command line parameters to append env => [{string(), string()}], %% additional environment variables wait_boot => wait_boot(), %% default is synchronous start with 15 sec timeout - shutdown => brutal_kill | %% halt the peer brutally (default) - timeout() %% send init:stop() request and wait up to specified timeout + shutdown => close | %% close supervision channel + halt | %% The default... %% stop node using erlang:halt() wait default timeout for nodedown + {halt, disconnect_timeout()} | %% stop node using erlang:halt() wait timeout() for nodedown + disconnect_timeout() %% send init:stop() request and wait up to specified timeout for nodedown }. %% Peer node states @@ -122,9 +124,22 @@ -export_type([ start_options/0, peer_state/0, - exec/0 + exec/0, + disconnect_timeout/0 ]). +%% Maximum integer timeout value in a receive... +-define (MAX_INT_TIMEOUT, 4294967295). + +%% Default time we wait for distributed connection to be removed, +%% when shutdown type is halt, before we disconnect from the node... +-define (DEFAULT_HALT_DISCONNECT_TIMEOUT, 5000). + +%% Minimum time we wait for distributed connection to be removed, +%% before we disconnect from the node (except in the shutdown +%% close case)... +-define (MIN_DISCONNECT_TIMEOUT, 1000). + %% Socket connect timeout, for TCP connection. -define (CONNECT_TIMEOUT, 10000). @@ -140,6 +155,8 @@ %% Default timeout for peer node to boot. -define (WAIT_BOOT_TIMEOUT, 15000). +-type disconnect_timeout() :: ?MIN_DISCONNECT_TIMEOUT..?MAX_INT_TIMEOUT | infinity. + %% @doc Creates random node name, using "peer" as prefix. -spec random_name() -> string(). random_name() -> @@ -269,7 +286,16 @@ init([Notify, Options]) -> receive {'EXIT', Port, _} -> undefined end end, - State = #peer_state{options = Options, notify = Notify, args = Args, exec = Exec}, + %% Remove the default 'halt' shutdown option if present; the default is + %% defined in terminate()... + SaveOptions = case maps:find(shutdown, Options) of + {ok, halt} -> + maps:remove(shutdown, Options); + _ -> + Options + end, + + State = #peer_state{options = SaveOptions, notify = Notify, args = Args, exec = Exec}, %% accept TCP connection if requested if ListenSocket =:= undefined -> @@ -386,38 +412,101 @@ handle_info({tcp_closed, Sock}, #peer_state{connection = Sock} = State) -> -spec terminate(Reason :: term(), state()) -> ok. terminate(_Reason, #peer_state{connection = Port, options = Options, node = Node}) -> - case {maps:get(shutdown, Options, brutal_kill), maps:find(connection, Options)} of - {brutal_kill, {ok, standard_io}} -> + case {maps:get(shutdown, Options, {halt, ?DEFAULT_HALT_DISCONNECT_TIMEOUT}), + maps:find(connection, Options)} of + {close, {ok, standard_io}} -> Port /= undefined andalso (catch erlang:port_close(Port)); - {brutal_kill, {ok, _TCP}} -> + {close, {ok, _TCP}} -> Port /= undefined andalso (catch gen_tcp:close(Port)); - {brutal_kill, error} -> - net_kernel:disconnect(Node); + {close, error} -> + _ = erlang:disconnect_node(Node); + {{halt, Timeout}, {ok, standard_io}} -> + Port /= undefined andalso (catch erlang:port_close(Port)), + wait_disconnected(Node, {timeout, Timeout}); + {{halt, Timeout}, {ok, _TCP}} -> + Port /= undefined andalso (catch gen_tcp:close(Port)), + wait_disconnected(Node, {timeout, Timeout}); + {{halt, Timeout}, error} -> + try + _ = erpc:call(Node, erlang, halt, [], Timeout), + ok + catch + error:{erpc,noconnection} -> ok; + _:_ -> force_disconnect_node(Node) + end; {Shutdown, error} -> Timeout = shutdown(dist, undefined, Node, Shutdown), - receive {nodedown, Node} -> ok after Timeout -> ok end; + wait_disconnected(Node, {timeout, Timeout}); {Shutdown, {ok, standard_io}} -> Timeout = shutdown(port, Port, Node, Shutdown), + Deadline = deadline(Timeout), receive {'EXIT', Port, _Reason2} -> ok after Timeout -> ok end, - catch erlang:port_close(Port); + catch erlang:port_close(Port), + wait_disconnected(Node, Deadline); {Shutdown, {ok, _TCP}} -> Timeout = shutdown(tcp, Port, Node, Shutdown), + Deadline = deadline(Timeout), receive {tcp_closed, Port} -> ok after Timeout -> ok end, - catch catch gen_tcp:close(Port) + catch catch gen_tcp:close(Port), + wait_disconnected(Node, Deadline) end, ok. %%-------------------------------------------------------------------- %% Internal implementation +deadline(infinity) -> + {timeout, infinity}; +deadline(Timeout) when is_integer(Timeout) -> + {deadline, erlang:monotonic_time(millisecond) + Timeout}. + +wait_disconnected(Node, WaitUntil) -> + %% Should only be called just before we are exiting the caller, so + %% we do not bother disabling nodes monitoring if we enable it and + %% do not flush any nodeup/nodedown messages that we got due to the + %% nodes monitoring... + case lists:member(Node, nodes(connected)) of + false -> + ok; + true -> + _ = net_kernel:monitor_nodes(true, [{node_type, all}]), + %% Need to check connected nodes list again, since it + %% might have disconnected before we enabled nodes + %% monitoring... + case lists:member(Node, nodes(connected)) of + false -> + ok; + true -> + Tmo = case WaitUntil of + {timeout, T} -> + T; + {deadline, T} -> + TL = T - erlang:monotonic_time(millisecond), + if TL < 0 -> 0; + true -> TL + end + end, + receive {nodedown, Node, _} -> ok + after Tmo -> force_disconnect_node(Node) + end + end + end. + +force_disconnect_node(Node) -> + _ = erlang:disconnect_node(Node), + logger:warning("peer:stop() timed out waiting for disconnect from " + "node ~p. The connection was forcefully taken down.", + [Node]). + %% This hack is a temporary workaround for test coverage reports -shutdown(_Type, _Port, _Node, Timeout) when is_integer(Timeout); timeout =:= infinity -> +shutdown(_Type, _Port, Node, Timeout) when is_integer(Timeout); Timeout =:= infinity -> + erpc:cast(Node, init, stop, []), Timeout; -shutdown(dist, undefined, Node, {Timeout, CoverNode}) when is_integer(Timeout); timeout =:= infinity -> +shutdown(dist, undefined, Node, {Timeout, CoverNode}) when is_integer(Timeout); Timeout =:= infinity -> rpc:call(CoverNode, cover, flush, [Node]), erpc:cast(Node, init, stop, []), Timeout; -shutdown(Type, Port, Node, {Timeout, CoverNode}) when is_integer(Timeout); timeout =:= infinity -> +shutdown(Type, Port, Node, {Timeout, CoverNode}) when is_integer(Timeout); Timeout =:= infinity -> rpc:call(CoverNode, cover, flush, [Node]), Port /= undefined andalso origin_to_peer(Type, Port, {cast, init, stop, []}), Timeout. @@ -445,7 +534,34 @@ verify_args(Options) -> ok; {ok, Err} -> error({exec, Err}) + end, + case maps:find(shutdown, Options) of + {ok, close} -> + ok; + {ok, halt} -> + ok; + {ok, {halt, Tmo}} when (is_integer(Tmo) + andalso ?MIN_DISCONNECT_TIMEOUT =< Tmo + andalso Tmo =< ?MAX_INT_TIMEOUT) + orelse Tmo == infinity -> + ok; + {ok, Tmo} when (is_integer(Tmo) + andalso ?MIN_DISCONNECT_TIMEOUT =< Tmo + andalso Tmo =< ?MAX_INT_TIMEOUT) + orelse Tmo == infinity -> + ok; + {ok, {Tmo, Node}} when ((is_integer(Tmo) + andalso ?MIN_DISCONNECT_TIMEOUT =< Tmo + andalso Tmo =< ?MAX_INT_TIMEOUT) + orelse Tmo == infinity) + andalso is_atom(Node) -> + ok; + error -> + ok; + {ok, Err2} -> + error({shutdown, Err2}) end. + make_notify_ref(infinity) -> {self(), make_ref()}; diff --git a/lib/stdlib/test/peer_SUITE.erl b/lib/stdlib/test/peer_SUITE.erl index 185bfdc18d..73b98bec07 100644 --- a/lib/stdlib/test/peer_SUITE.erl +++ b/lib/stdlib/test/peer_SUITE.erl @@ -32,6 +32,11 @@ detached/0, detached/1, dyn_peer/0, dyn_peer/1, stop_peer/0, stop_peer/1, + shutdown_halt/0, shutdown_halt/1, + shutdown_halt_timeout/0, shutdown_halt_timeout/1, + shutdown_stop/0, shutdown_stop/1, + shutdown_stop_timeout/0, shutdown_stop_timeout/1, + shutdown_close/0, shutdown_close/1, init_debug/0, init_debug/1, io_redirect/0, io_redirect/1, multi_node/0, multi_node/1, @@ -45,14 +50,17 @@ suite() -> [{timetrap, {minutes, 1}}]. +shutdown_alternatives() -> + [shutdown_halt, shutdown_halt_timeout, shutdown_stop, shutdown_stop_timeout, shutdown_close]. + alternative() -> - [basic, peer_states, cast, detached, dyn_peer, stop_peer, io_redirect, - multi_node, duplicate_name]. + [basic, peer_states, cast, detached, dyn_peer, stop_peer, + io_redirect, multi_node, duplicate_name] ++ shutdown_alternatives(). groups() -> [ {dist, [parallel], [errors, dist, peer_down_crash, peer_down_continue, peer_down_boot, - dist_up_down, dist_localhost]}, + dist_up_down, dist_localhost] ++ shutdown_alternatives()}, {dist_seq, [], [dist_io_redirect]}, %% Cannot be run in parallel in dist group {tcp, [parallel], alternative()}, {standard_io, [parallel], [init_debug | alternative()]}, @@ -328,6 +336,69 @@ stop_peer(Config) when is_list(Config) -> %% shutdown node peer:stop(Peer). +shutdown_halt() -> + [{doc, "Test that peer shutdown halt wait until node connection is down"}]. +shutdown_halt(Config) when is_list(Config) -> + false = shutdown_test(Config, ?FUNCTION_NAME, halt, 500, false, 1000), + ok. + +shutdown_halt_timeout() -> + [{doc, "Test that peer shutdown halt forcefully takes down connection on timeout"}]. +shutdown_halt_timeout(Config) when is_list(Config) -> + false = shutdown_test(Config, ?FUNCTION_NAME, {halt, 1000}, 5000, true, 1500), + ok. + +shutdown_stop() -> + [{doc, "Test that peer shutdown stop wait until node connection is down"}]. +shutdown_stop(Config) when is_list(Config) -> + false = shutdown_test(Config, ?FUNCTION_NAME, infinity, 500, false, 2000), + ok. + +shutdown_stop_timeout() -> + [{doc, "Test that peer shutdown stop forcefully takes down connection on timeout"}]. +shutdown_stop_timeout(Config) when is_list(Config) -> + false = shutdown_test(Config, ?FUNCTION_NAME, 1000, 5000, true, 2500), + ok. + +shutdown_close() -> + [{doc, "Test that peer shutdown close does not wait for dist connection"}]. +shutdown_close(Config) when is_list(Config) -> + _ = shutdown_test(Config, ?FUNCTION_NAME, close, 5000, true, 200), + ok. + +shutdown_test(Config, TC, Shutdown, BlockTime, StopWhileBlocked, MaxWaitTime) -> + Options0 = #{name => ?CT_PEER_NAME(TC), + shutdown => Shutdown, + args => ["-hidden", "-pa", filename:dirname(code:which(?MODULE)), + "-setcookie", atom_to_list(erlang:get_cookie())]}, + Options = case proplists:get_value(connection, Config) of + undefined -> Options0; + Conn -> maps:put(connection, Conn, Options0) + end, + {ok, Peer, Node} = peer:start_link(Options), + EnsureBlockedWait = 500, + BlockStart = erlang:monotonic_time(millisecond), + erpc:cast(Node, + fun () -> + erts_debug:set_internal_state(available_internal_state, true), + erts_debug:set_internal_state(block, BlockTime+EnsureBlockedWait) + end), + receive after EnsureBlockedWait -> ok end, %% Ensure blocked... + Start = erlang:monotonic_time(millisecond), + peer:stop(Peer), + End = erlang:monotonic_time(millisecond), + WaitTime = End - Start, + BlockTimeLeft = BlockTime + EnsureBlockedWait - (Start - BlockStart), + Connected = lists:member(Node, nodes(connected)), + ct:pal("Connected = ~p~nWaitTime = ~p~nBlockTimeLeft = ~p~n", + [Connected, WaitTime, BlockTimeLeft]), + true = WaitTime =< MaxWaitTime, + case StopWhileBlocked of + true -> ok; + false -> true = WaitTime >= BlockTimeLeft, ok + end, + Connected. + init_debug() -> [{doc, "Test that debug messages in init work"}]. |