summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichal Kuratczyk <mkuratczyk@vmware.com>2021-11-02 13:37:08 +0100
committermergify-bot <noreply@mergify.io>2021-11-03 09:21:00 +0000
commita92a5327d5619fca4ef18805b09ad9b6faf3463e (patch)
treec936c41d0b25d51c55915f480add04a0c7a803df
parent054b7e4c5ac3bad6140dfb7df147991d973aa3a0 (diff)
downloadrabbitmq-server-git-a92a5327d5619fca4ef18805b09ad9b6faf3463e.tar.gz
Use erlang:system_info(creation) as GUID
Node GUID makes it possible to differentiate between different incarnations of a node. However, since rabbit may take some time to start (many queues/bindings, etc.), there could be a significant delay between the Erlang VM being up and responding to RPC requests and the new GUID being announced. During that time, the node monitor could incorrectly assume there was a network partition, while in fact the node had simply been restarted. With this change, as soon as the Erlang VM is up, we can tell whether it was restarted and avoid false positives. Additionally, we now log if any queues were deleted on behalf of the restarted node. This can take quite a long time if there are many transient queues (e.g. auto-delete queues). The longer this takes, the higher the odds of the restarted node being up again by the time check_partial_partition is called. We may need to reconsider this logic as well, but for now we just log this activity. Co-authored-by: Loïc Hoguin <lhoguin@vmware.com>
-rw-r--r--deps/rabbit/src/rabbit_amqqueue.erl6
-rw-r--r--deps/rabbit/src/rabbit_node_monitor.erl18
2 files changed, 19 insertions, 5 deletions
diff --git a/deps/rabbit/src/rabbit_amqqueue.erl b/deps/rabbit/src/rabbit_amqqueue.erl
index 410105e310..1706ca434c 100644
--- a/deps/rabbit/src/rabbit_amqqueue.erl
+++ b/deps/rabbit/src/rabbit_amqqueue.erl
@@ -1905,7 +1905,11 @@ maybe_clear_recoverable_node(Node, Q) ->
-spec on_node_down(node()) -> 'ok'.
on_node_down(Node) ->
- {QueueNames, QueueDeletions} = delete_queues_on_node_down(Node),
+ {Time, {QueueNames, QueueDeletions}} = timer:tc(fun() -> delete_queues_on_node_down(Node) end),
+ case length(QueueNames) of
+ 0 -> ok;
+ _ -> rabbit_log:info("~p transient queues from an old incarnation of node ~p deleted in ~fs", [length(QueueNames), Node, Time/1000000])
+ end,
notify_queue_binding_deletions(QueueDeletions),
rabbit_core_metrics:queues_deleted(QueueNames),
notify_queues_deleted(QueueNames),
diff --git a/deps/rabbit/src/rabbit_node_monitor.erl b/deps/rabbit/src/rabbit_node_monitor.erl
index 4de4e4f1ca..55b3bcb9eb 100644
--- a/deps/rabbit/src/rabbit_node_monitor.erl
+++ b/deps/rabbit/src/rabbit_node_monitor.erl
@@ -366,7 +366,7 @@ init([]) ->
{ok, ensure_keepalive_timer(#state{monitors = Monitors,
subscribers = pmon:new(),
partitions = [],
- guid = rabbit_guid:gen(),
+ guid = erlang:system_info(creation),
node_guids = maps:new(),
autoheal = rabbit_autoheal:init()})}.
@@ -416,6 +416,13 @@ handle_cast(notify_node_up, State = #state{guid = GUID}) ->
%% disconnected, it would become a minority, pause, realise it's not
%% in a minority any more, and come back, still partitioned (albeit no
%% longer partially).
+%%
+%% UPDATE: The GUID is actually not a GUID anymore - it is the value
+%% returned by erlang:system_info(creation). This prevents false positives
+%% in a situation where a node is restarted (Erlang VM is up) but the rabbit
+%% app is not yet up. The GUID was only generated and announced upon rabbit
+%% startup; creation is available immediately. Therefore we can tell that
+%% the node was restarted, before it announces the new value.
%% ----------------------------------------------------------------------------
handle_cast({node_up, Node, NodeType, GUID},
@@ -435,15 +442,18 @@ handle_cast({check_partial_partition, Node, Rep, NodeGUID, MyGUID, RepGUID},
maps:find(Node, GUIDs) =:= {ok, NodeGUID} of
true -> spawn_link( %%[1]
fun () ->
- case rpc:call(Node, rabbit, is_running, []) of
+ case rpc:call(Node, erlang, system_info, [creation]) of
{badrpc, _} -> ok;
- _ ->
+ NodeGUID ->
rabbit_log:warning("Received a 'DOWN' message"
" from ~p but still can"
" communicate with it ",
[Node]),
cast(Rep, {partial_partition,
- Node, node(), RepGUID})
+ Node, node(), RepGUID});
+ _ ->
+ rabbit_log:warning("Node ~p was restarted", [Node]),
+ ok
end
end);
false -> ok