diff options
author | Michal Kuratczyk <mkuratczyk@vmware.com> | 2021-11-02 13:37:08 +0100 |
---|---|---|
committer | mergify-bot <noreply@mergify.io> | 2021-11-03 09:21:00 +0000 |
commit | a92a5327d5619fca4ef18805b09ad9b6faf3463e (patch) | |
tree | c936c41d0b25d51c55915f480add04a0c7a803df | |
parent | 054b7e4c5ac3bad6140dfb7df147991d973aa3a0 (diff) | |
download | rabbitmq-server-git-a92a5327d5619fca4ef18805b09ad9b6faf3463e.tar.gz |
Use erlang:system_info(creation) as GUID
The node GUID makes it possible to differentiate between different incarnations of a node.
However, since rabbit may take some time to start (many queues/bindings, etc.),
there could be a significant delay between the Erlang VM being up and
responding to RPC requests and the new GUID being announced. During that
time, the node monitor could incorrectly assume there was a network
partition, while in fact the node was simply restarted. With this change,
as soon as the Erlang VM is up, we can tell whether it was restarted and
avoid false positives.
Additionally, we now log if any queues were deleted on behalf of the
restarted node. This can take quite a long time if there are many transient
queues (e.g. auto-delete queues). The longer this took, the higher the
odds were of a restarted node being up again by the time
check_partial_partition was called. We may need to reconsider this logic
as well, but for now we just log this activity.
Co-authored-by: Loïc Hoguin <lhoguin@vmware.com>
-rw-r--r-- | deps/rabbit/src/rabbit_amqqueue.erl | 6 | ||||
-rw-r--r-- | deps/rabbit/src/rabbit_node_monitor.erl | 18 |
2 files changed, 19 insertions, 5 deletions
diff --git a/deps/rabbit/src/rabbit_amqqueue.erl b/deps/rabbit/src/rabbit_amqqueue.erl index 410105e310..1706ca434c 100644 --- a/deps/rabbit/src/rabbit_amqqueue.erl +++ b/deps/rabbit/src/rabbit_amqqueue.erl @@ -1905,7 +1905,11 @@ maybe_clear_recoverable_node(Node, Q) -> -spec on_node_down(node()) -> 'ok'. on_node_down(Node) -> - {QueueNames, QueueDeletions} = delete_queues_on_node_down(Node), + {Time, {QueueNames, QueueDeletions}} = timer:tc(fun() -> delete_queues_on_node_down(Node) end), + case length(QueueNames) of + 0 -> ok; + _ -> rabbit_log:info("~p transient queues from an old incarnation of node ~p deleted in ~fs", [length(QueueNames), Node, Time/1000000]) + end, notify_queue_binding_deletions(QueueDeletions), rabbit_core_metrics:queues_deleted(QueueNames), notify_queues_deleted(QueueNames), diff --git a/deps/rabbit/src/rabbit_node_monitor.erl b/deps/rabbit/src/rabbit_node_monitor.erl index 4de4e4f1ca..55b3bcb9eb 100644 --- a/deps/rabbit/src/rabbit_node_monitor.erl +++ b/deps/rabbit/src/rabbit_node_monitor.erl @@ -366,7 +366,7 @@ init([]) -> {ok, ensure_keepalive_timer(#state{monitors = Monitors, subscribers = pmon:new(), partitions = [], - guid = rabbit_guid:gen(), + guid = erlang:system_info(creation), node_guids = maps:new(), autoheal = rabbit_autoheal:init()})}. @@ -416,6 +416,13 @@ handle_cast(notify_node_up, State = #state{guid = GUID}) -> %% disconnected, it would become a minority, pause, realise it's not %% in a minority any more, and come back, still partitioned (albeit no %% longer partially). +%% +%% UPDATE: The GUID is actually not a GUID anymore - it is the value +%% returned by erlang:system_info(creation). This prevent false-positives +%% in a situation when a node is restarted (Erlang VM is up) but the rabbit +%% app is not yet up. The GUID was only generated and announced upon rabbit +%% startup; creation is available immediately. Therefore we can tell that +%% the node was restarted, before it announces the new value. 
%% ---------------------------------------------------------------------------- handle_cast({node_up, Node, NodeType, GUID}, @@ -435,15 +442,18 @@ handle_cast({check_partial_partition, Node, Rep, NodeGUID, MyGUID, RepGUID}, maps:find(Node, GUIDs) =:= {ok, NodeGUID} of true -> spawn_link( %%[1] fun () -> - case rpc:call(Node, rabbit, is_running, []) of + case rpc:call(Node, erlang, system_info, [creation]) of {badrpc, _} -> ok; - _ -> + NodeGUID -> rabbit_log:warning("Received a 'DOWN' message" " from ~p but still can" " communicate with it ", [Node]), cast(Rep, {partial_partition, - Node, node(), RepGUID}) + Node, node(), RepGUID}); + _ -> + rabbit_log:warning("Node ~p was restarted", [Node]), + ok end end); false -> ok |