summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRickard Green <rickard@erlang.org>2022-02-09 17:03:55 +0100
committerRickard Green <rickard@erlang.org>2022-02-22 07:13:49 +0100
commit54437747c93b6512375574b17ce0ac60baf7fdfb (patch)
treec68f896f3129bfd008813eef15a0a9e14c7ea683
parent6d9aa305fbf128ae70209cc2dae8db2113b41d00 (diff)
downloaderlang-54437747c93b6512375574b17ce0ac60baf7fdfb.tar.gz
Adjust shutdown in peer:stop()
* Default to the new shutdown type 'halt' which performs erlang:halt() and waits for dist connection to go down. * Graceful shutdown using init:stop() also waits for dist connection to go down. * Old shutdown type 'brutal_kill' renamed to 'close', since it just close connection used for supervision and then return.
-rw-r--r--lib/stdlib/doc/src/peer.xml79
-rw-r--r--lib/stdlib/src/peer.erl146
-rw-r--r--lib/stdlib/test/peer_SUITE.erl77
3 files changed, 279 insertions, 23 deletions
diff --git a/lib/stdlib/doc/src/peer.xml b/lib/stdlib/doc/src/peer.xml
index a5b513d810..6ac2c6b278 100644
--- a/lib/stdlib/doc/src/peer.xml
+++ b/lib/stdlib/doc/src/peer.xml
@@ -379,6 +379,11 @@
is specified instead of a timeout, the peer will send <c>Tag</c> to the
requested process.</p></desc>
</datatype>
+ <datatype>
+ <name name="disconnect_timeout"/>
+ <desc><p>Disconnect timeout. See
+ <seemfa marker="#stop/1"><c>stop()</c></seemfa>.</p></desc>
+ </datatype>
</datatypes>
<funcs>
@@ -525,12 +530,76 @@
<func>
<name name="stop" arity="1" since="OTP @OTP-17720@"/>
<fsummary>Stop controlling process and terminate peer node.</fsummary>
+ <type name="disconnect_timeout"/>
<desc>
- <p>Depending on the <c>shutdown</c> option used to start the peer,
- stopping may be graceful (via <c>init:stop()</c>) or less so,
- by just closing the control connection, which triggers <c>erlang:halt()</c>
- executed by the peer.</p>
- <p>A graceful shutdown can be slower, or even block indefinitely.</p>
+ <p>
+ Stops a peer node. How the node is stopped depends on the
+ <seetype marker="#start_options"><c>shutdown</c></seetype>
+ option passed when starting the peer node. Currently the
+ following <c>shutdown</c> options are supported:
+ </p>
+ <taglist>
+ <tag><c>halt</c></tag>
+ <item><p>
+ This is the default shutdown behavior. It behaves as <c>shutdown</c>
+ option <c>{halt, DefaultTimeout}</c> where <c>DefaultTimeout</c>
+ currently equals <c>5000</c>.
+ </p></item>
+ <tag><c>{halt, Timeout :: disconnect_timeout()}</c></tag>
+ <item><p>
+ Triggers a call to
+ <seemfa marker="erts:erlang#halt/0"><c>erlang:halt()</c></seemfa>
+ on the peer node and then waits for the Erlang distribution
+ connection to the peer node to be taken down. If this connection
+ has not been taken down after <c>Timeout</c> milliseconds, it will
+ forcefully be taken down by <c>peer:stop/1</c>. See the
+ <seeerl marker="#dist_connection_close">warning</seeerl> below for
+ more info about this.
+ </p></item>
+ <tag><c>Timeout :: disconnect_timeout()</c></tag>
+ <item><p>
+ Triggers a call to
+ <seemfa marker="erts:init#stop/0"><c>init:stop()</c></seemfa>
+ on the peer node and then waits for the Erlang distribution
+ connection to the peer node to be taken down. If this connection
+ has not been taken down after <c>Timeout</c> milliseconds, it will
+ forcefully be taken down by <c>peer:stop/1</c>. See the
+ <seeerl marker="#dist_connection_close">warning</seeerl> below for
+ more info about this.
+ </p></item>
+ <tag><c>close</c></tag>
+ <item><p>
+ Close the <i>control connection</i> to the peer node and
+ return. This is the fastest way for the caller of
+ <c>peer:stop/1</c> to stop a peer node.
+ </p>
+ <p>
+ Note that if the Erlang distribution connection is not used as
+ control connection it might not have been taken down when
+ <c>peer:stop/1</c> returns. Also note that the
+ <seeerl marker="#dist_connection_close">warning</seeerl> below
+ applies when the Erlang distribution connection is used as control
+ connection.
+ </p>
+ </item>
+ </taglist>
+
+ <marker id="dist_connection_close"/>
+ <warning>
+ <p>
+ In the cases where the Erlang distribution connection is taken
+ down by <c>peer:stop/1</c>, other code independent of the peer
+ code might react to the connection loss before the peer node is
+ stopped which might cause undesirable effects. For example,
+ <seeerl marker="kernel:global#prevent_overlapping_partitions"><c>global</c></seeerl>
+ might trigger even more Erlang distribution connections to other
+ nodes to be taken down. The potential undesirable effects are,
+ however, not limited to this. It is hard to say what the effects
+ will be since these effects can be caused by any code with links
+ or monitors to something on the origin node, or code monitoring
+ the connection to the origin node.
+ </p>
+ </warning>
</desc>
</func>
diff --git a/lib/stdlib/src/peer.erl b/lib/stdlib/src/peer.erl
index b0af4cd830..1c04b03b10 100644
--- a/lib/stdlib/src/peer.erl
+++ b/lib/stdlib/src/peer.erl
@@ -112,8 +112,10 @@
args => [string()], %% additional command line parameters to append
env => [{string(), string()}], %% additional environment variables
wait_boot => wait_boot(), %% default is synchronous start with 15 sec timeout
- shutdown => brutal_kill | %% halt the peer brutally (default)
- timeout() %% send init:stop() request and wait up to specified timeout
+ shutdown => close | %% close supervision channel
+ halt | %% The default... %% stop node using erlang:halt() wait default timeout for nodedown
+ {halt, disconnect_timeout()} | %% stop node using erlang:halt() wait timeout() for nodedown
+ disconnect_timeout() %% send init:stop() request and wait up to specified timeout for nodedown
}.
%% Peer node states
@@ -122,9 +124,22 @@
-export_type([
start_options/0,
peer_state/0,
- exec/0
+ exec/0,
+ disconnect_timeout/0
]).
+%% Maximum integer timeout value in a receive...
+-define (MAX_INT_TIMEOUT, 4294967295).
+
+%% Default time we wait for distributed connection to be removed,
+%% when shutdown type is halt, before we disconnect from the node...
+-define (DEFAULT_HALT_DISCONNECT_TIMEOUT, 5000).
+
+%% Minimum time we wait for distributed connection to be removed,
+%% before we disconnect from the node (except in the shutdown
+%% close case)...
+-define (MIN_DISCONNECT_TIMEOUT, 1000).
+
%% Socket connect timeout, for TCP connection.
-define (CONNECT_TIMEOUT, 10000).
@@ -140,6 +155,8 @@
%% Default timeout for peer node to boot.
-define (WAIT_BOOT_TIMEOUT, 15000).
+-type disconnect_timeout() :: ?MIN_DISCONNECT_TIMEOUT..?MAX_INT_TIMEOUT | infinity.
+
%% @doc Creates random node name, using "peer" as prefix.
-spec random_name() -> string().
random_name() ->
@@ -269,7 +286,16 @@ init([Notify, Options]) ->
receive {'EXIT', Port, _} -> undefined end
end,
- State = #peer_state{options = Options, notify = Notify, args = Args, exec = Exec},
+ %% Remove the default 'halt' shutdown option if present; the default is
+ %% defined in terminate()...
+ SaveOptions = case maps:find(shutdown, Options) of
+ {ok, halt} ->
+ maps:remove(shutdown, Options);
+ _ ->
+ Options
+ end,
+
+ State = #peer_state{options = SaveOptions, notify = Notify, args = Args, exec = Exec},
%% accept TCP connection if requested
if ListenSocket =:= undefined ->
@@ -386,38 +412,101 @@ handle_info({tcp_closed, Sock}, #peer_state{connection = Sock} = State) ->
-spec terminate(Reason :: term(), state()) -> ok.
terminate(_Reason, #peer_state{connection = Port, options = Options, node = Node}) ->
- case {maps:get(shutdown, Options, brutal_kill), maps:find(connection, Options)} of
- {brutal_kill, {ok, standard_io}} ->
+ case {maps:get(shutdown, Options, {halt, ?DEFAULT_HALT_DISCONNECT_TIMEOUT}),
+ maps:find(connection, Options)} of
+ {close, {ok, standard_io}} ->
Port /= undefined andalso (catch erlang:port_close(Port));
- {brutal_kill, {ok, _TCP}} ->
+ {close, {ok, _TCP}} ->
Port /= undefined andalso (catch gen_tcp:close(Port));
- {brutal_kill, error} ->
- net_kernel:disconnect(Node);
+ {close, error} ->
+ _ = erlang:disconnect_node(Node);
+ {{halt, Timeout}, {ok, standard_io}} ->
+ Port /= undefined andalso (catch erlang:port_close(Port)),
+ wait_disconnected(Node, {timeout, Timeout});
+ {{halt, Timeout}, {ok, _TCP}} ->
+ Port /= undefined andalso (catch gen_tcp:close(Port)),
+ wait_disconnected(Node, {timeout, Timeout});
+ {{halt, Timeout}, error} ->
+ try
+ _ = erpc:call(Node, erlang, halt, [], Timeout),
+ ok
+ catch
+ error:{erpc,noconnection} -> ok;
+ _:_ -> force_disconnect_node(Node)
+ end;
{Shutdown, error} ->
Timeout = shutdown(dist, undefined, Node, Shutdown),
- receive {nodedown, Node} -> ok after Timeout -> ok end;
+ wait_disconnected(Node, {timeout, Timeout});
{Shutdown, {ok, standard_io}} ->
Timeout = shutdown(port, Port, Node, Shutdown),
+ Deadline = deadline(Timeout),
receive {'EXIT', Port, _Reason2} -> ok after Timeout -> ok end,
- catch erlang:port_close(Port);
+ catch erlang:port_close(Port),
+ wait_disconnected(Node, Deadline);
{Shutdown, {ok, _TCP}} ->
Timeout = shutdown(tcp, Port, Node, Shutdown),
+ Deadline = deadline(Timeout),
receive {tcp_closed, Port} -> ok after Timeout -> ok end,
- catch catch gen_tcp:close(Port)
+ catch catch gen_tcp:close(Port),
+ wait_disconnected(Node, Deadline)
end,
ok.
%%--------------------------------------------------------------------
%% Internal implementation
+deadline(infinity) ->
+ {timeout, infinity};
+deadline(Timeout) when is_integer(Timeout) ->
+ {deadline, erlang:monotonic_time(millisecond) + Timeout}.
+
+wait_disconnected(Node, WaitUntil) ->
+ %% Should only be called just before we are exiting the caller, so
+ %% we do not bother disabling nodes monitoring if we enable it and
+ %% do not flush any nodeup/nodedown messages that we got due to the
+ %% nodes monitoring...
+ case lists:member(Node, nodes(connected)) of
+ false ->
+ ok;
+ true ->
+ _ = net_kernel:monitor_nodes(true, [{node_type, all}]),
+ %% Need to check connected nodes list again, since it
+ %% might have disconnected before we enabled nodes
+ %% monitoring...
+ case lists:member(Node, nodes(connected)) of
+ false ->
+ ok;
+ true ->
+ Tmo = case WaitUntil of
+ {timeout, T} ->
+ T;
+ {deadline, T} ->
+ TL = T - erlang:monotonic_time(millisecond),
+ if TL < 0 -> 0;
+ true -> TL
+ end
+ end,
+ receive {nodedown, Node, _} -> ok
+ after Tmo -> force_disconnect_node(Node)
+ end
+ end
+ end.
+
+force_disconnect_node(Node) ->
+ _ = erlang:disconnect_node(Node),
+ logger:warning("peer:stop() timed out waiting for disconnect from "
+ "node ~p. The connection was forcefully taken down.",
+ [Node]).
+
%% This hack is a temporary workaround for test coverage reports
-shutdown(_Type, _Port, _Node, Timeout) when is_integer(Timeout); timeout =:= infinity ->
+shutdown(_Type, _Port, Node, Timeout) when is_integer(Timeout); Timeout =:= infinity ->
+ erpc:cast(Node, init, stop, []),
Timeout;
-shutdown(dist, undefined, Node, {Timeout, CoverNode}) when is_integer(Timeout); timeout =:= infinity ->
+shutdown(dist, undefined, Node, {Timeout, CoverNode}) when is_integer(Timeout); Timeout =:= infinity ->
rpc:call(CoverNode, cover, flush, [Node]),
erpc:cast(Node, init, stop, []),
Timeout;
-shutdown(Type, Port, Node, {Timeout, CoverNode}) when is_integer(Timeout); timeout =:= infinity ->
+shutdown(Type, Port, Node, {Timeout, CoverNode}) when is_integer(Timeout); Timeout =:= infinity ->
rpc:call(CoverNode, cover, flush, [Node]),
Port /= undefined andalso origin_to_peer(Type, Port, {cast, init, stop, []}),
Timeout.
@@ -445,7 +534,34 @@ verify_args(Options) ->
ok;
{ok, Err} ->
error({exec, Err})
+ end,
+ case maps:find(shutdown, Options) of
+ {ok, close} ->
+ ok;
+ {ok, halt} ->
+ ok;
+ {ok, {halt, Tmo}} when (is_integer(Tmo)
+ andalso ?MIN_DISCONNECT_TIMEOUT =< Tmo
+ andalso Tmo =< ?MAX_INT_TIMEOUT)
+ orelse Tmo == infinity ->
+ ok;
+ {ok, Tmo} when (is_integer(Tmo)
+ andalso ?MIN_DISCONNECT_TIMEOUT =< Tmo
+ andalso Tmo =< ?MAX_INT_TIMEOUT)
+ orelse Tmo == infinity ->
+ ok;
+ {ok, {Tmo, Node}} when ((is_integer(Tmo)
+ andalso ?MIN_DISCONNECT_TIMEOUT =< Tmo
+ andalso Tmo =< ?MAX_INT_TIMEOUT)
+ orelse Tmo == infinity)
+ andalso is_atom(Node) ->
+ ok;
+ error ->
+ ok;
+ {ok, Err2} ->
+ error({shutdown, Err2})
end.
+
make_notify_ref(infinity) ->
{self(), make_ref()};
diff --git a/lib/stdlib/test/peer_SUITE.erl b/lib/stdlib/test/peer_SUITE.erl
index 185bfdc18d..73b98bec07 100644
--- a/lib/stdlib/test/peer_SUITE.erl
+++ b/lib/stdlib/test/peer_SUITE.erl
@@ -32,6 +32,11 @@
detached/0, detached/1,
dyn_peer/0, dyn_peer/1,
stop_peer/0, stop_peer/1,
+ shutdown_halt/0, shutdown_halt/1,
+ shutdown_halt_timeout/0, shutdown_halt_timeout/1,
+ shutdown_stop/0, shutdown_stop/1,
+ shutdown_stop_timeout/0, shutdown_stop_timeout/1,
+ shutdown_close/0, shutdown_close/1,
init_debug/0, init_debug/1,
io_redirect/0, io_redirect/1,
multi_node/0, multi_node/1,
@@ -45,14 +50,17 @@
suite() ->
[{timetrap, {minutes, 1}}].
+shutdown_alternatives() ->
+ [shutdown_halt, shutdown_halt_timeout, shutdown_stop, shutdown_stop_timeout, shutdown_close].
+
alternative() ->
- [basic, peer_states, cast, detached, dyn_peer, stop_peer, io_redirect,
- multi_node, duplicate_name].
+ [basic, peer_states, cast, detached, dyn_peer, stop_peer,
+ io_redirect, multi_node, duplicate_name] ++ shutdown_alternatives().
groups() ->
[
{dist, [parallel], [errors, dist, peer_down_crash, peer_down_continue, peer_down_boot,
- dist_up_down, dist_localhost]},
+ dist_up_down, dist_localhost] ++ shutdown_alternatives()},
{dist_seq, [], [dist_io_redirect]}, %% Cannot be run in parallel in dist group
{tcp, [parallel], alternative()},
{standard_io, [parallel], [init_debug | alternative()]},
@@ -328,6 +336,69 @@ stop_peer(Config) when is_list(Config) ->
%% shutdown node
peer:stop(Peer).
+shutdown_halt() ->
+ [{doc, "Test that peer shutdown halt wait until node connection is down"}].
+shutdown_halt(Config) when is_list(Config) ->
+ false = shutdown_test(Config, ?FUNCTION_NAME, halt, 500, false, 1000),
+ ok.
+
+shutdown_halt_timeout() ->
+ [{doc, "Test that peer shutdown halt forcefully takes down connection on timeout"}].
+shutdown_halt_timeout(Config) when is_list(Config) ->
+ false = shutdown_test(Config, ?FUNCTION_NAME, {halt, 1000}, 5000, true, 1500),
+ ok.
+
+shutdown_stop() ->
+ [{doc, "Test that peer shutdown stop wait until node connection is down"}].
+shutdown_stop(Config) when is_list(Config) ->
+ false = shutdown_test(Config, ?FUNCTION_NAME, infinity, 500, false, 2000),
+ ok.
+
+shutdown_stop_timeout() ->
+ [{doc, "Test that peer shutdown stop forcefully takes down connection on timeout"}].
+shutdown_stop_timeout(Config) when is_list(Config) ->
+ false = shutdown_test(Config, ?FUNCTION_NAME, 1000, 5000, true, 2500),
+ ok.
+
+shutdown_close() ->
+ [{doc, "Test that peer shutdown close does not wait for dist connection"}].
+shutdown_close(Config) when is_list(Config) ->
+ _ = shutdown_test(Config, ?FUNCTION_NAME, close, 5000, true, 200),
+ ok.
+
+shutdown_test(Config, TC, Shutdown, BlockTime, StopWhileBlocked, MaxWaitTime) ->
+ Options0 = #{name => ?CT_PEER_NAME(TC),
+ shutdown => Shutdown,
+ args => ["-hidden", "-pa", filename:dirname(code:which(?MODULE)),
+ "-setcookie", atom_to_list(erlang:get_cookie())]},
+ Options = case proplists:get_value(connection, Config) of
+ undefined -> Options0;
+ Conn -> maps:put(connection, Conn, Options0)
+ end,
+ {ok, Peer, Node} = peer:start_link(Options),
+ EnsureBlockedWait = 500,
+ BlockStart = erlang:monotonic_time(millisecond),
+ erpc:cast(Node,
+ fun () ->
+ erts_debug:set_internal_state(available_internal_state, true),
+ erts_debug:set_internal_state(block, BlockTime+EnsureBlockedWait)
+ end),
+ receive after EnsureBlockedWait -> ok end, %% Ensure blocked...
+ Start = erlang:monotonic_time(millisecond),
+ peer:stop(Peer),
+ End = erlang:monotonic_time(millisecond),
+ WaitTime = End - Start,
+ BlockTimeLeft = BlockTime + EnsureBlockedWait - (Start - BlockStart),
+ Connected = lists:member(Node, nodes(connected)),
+ ct:pal("Connected = ~p~nWaitTime = ~p~nBlockTimeLeft = ~p~n",
+ [Connected, WaitTime, BlockTimeLeft]),
+ true = WaitTime =< MaxWaitTime,
+ case StopWhileBlocked of
+ true -> ok;
+ false -> true = WaitTime >= BlockTimeLeft, ok
+ end,
+ Connected.
+
init_debug() ->
[{doc, "Test that debug messages in init work"}].