From 5f4b08236c8806d3fd2d56f0f11ddc8309b6e506 Mon Sep 17 00:00:00 2001 From: Jean-Sebastien Pedron Date: Mon, 24 Nov 2014 20:40:21 +0100 Subject: Add plugin paths to the beginning of the code path The goal is to make sure the right application is picked, in case the same one is available as both a RabbitMQ plugin and an external Erlang application. For instance, this could be the case with the Cowboy application: a version is available in the plugins, but the user could add an incompatible version to Erlang/OTP libdir or set ERL_LIBS to point to it. There's one exception currently: eldap. This application used to be available as a 3rd party one. But since Erlang R15B01, it's included in the standard library. We trust this version to have a stable API. Therefore, if the node runs on R15B01 or later and eldap version is 1.0+, we use this one. In all other cases, we don't trust it and prefer the RabbitMQ plugin. --- src/rabbit_plugins.erl | 59 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/src/rabbit_plugins.erl b/src/rabbit_plugins.erl index e290fb53..fe614b6a 100644 --- a/src/rabbit_plugins.erl +++ b/src/rabbit_plugins.erl @@ -90,7 +90,7 @@ list(PluginsDir) -> EZs = [{ez, EZ} || EZ <- filelib:wildcard("*.ez", PluginsDir)], FreeApps = [{app, App} || App <- filelib:wildcard("*/ebin/*.app", PluginsDir)], - {Plugins, Problems} = + {AvailablePlugins, Problems} = lists:foldl(fun ({error, EZ, Reason}, {Plugins1, Problems1}) -> {Plugins1, [{EZ, Reason} | Problems1]}; (Plugin = #plugin{}, {Plugins1, Problems1}) -> @@ -102,6 +102,7 @@ list(PluginsDir) -> _ -> rabbit_log:warning( "Problem reading some plugins: ~p~n", [Problems]) end, + Plugins = lists:filter(fun keep_plugin/1, AvailablePlugins), ensure_dependencies(Plugins). %% @doc Read the list of enabled plugins from the supplied term file. @@ -132,6 +133,55 @@ dependencies(Reverse, Sources, AllPlugins) -> true = digraph:delete(G), Dests. +%% For a few known cases, an externally provided plugin can be trusted. +%% In this special case, it overrides the plugin. +keep_plugin(#plugin{name = App} = Plugin) -> + case application:load(App) of + {error, {already_loaded, _}} -> + not plugin_provided_by_otp(Plugin); + ok -> + Ret = not plugin_provided_by_otp(Plugin), + application:unload(App), + Ret; + _ -> + true + end. + +plugin_provided_by_otp(#plugin{name = eldap, version = PluginVsn}) -> + %% eldap was added to Erlang/OTP R15B01. We prefer this version + %% to the plugin. Before, eldap always advertised version "1". In + %% R15B01, it got proper versionning starting from "1.0". If eldap's + %% version is "1", we keep using the plugin, otherwise we take the + %% OTP application. As an extra check, we look at ERTS version to be + %% sure we're on R15B01. + case application:get_key(eldap, vsn) of + {ok, PluginVsn} -> + %% The plugin itself; the plugin was previously added to the + %% code path. + false; + {ok, "1"} -> + %% The version available on GitHub, not part of OTP. + false; + {ok, Vsn} -> + try rabbit_misc:version_compare(Vsn, "1.0", gte) of + true -> + %% Probably part of OTP. Let's check ERTS version to + %% be sure. + rabbit_misc:version_compare( + erlang:system_info(version), "5.9.1", gte); + false -> + %% Probably not part of OTP. Use the plugin to be safe. + false + catch + _:_ -> + %% Couldn't parse the version. It's not the OTP + %% application. + false + end + end; +plugin_provided_by_otp(_) -> + false. + %% Make sure we don't list OTP apps in here, and also that we detect %% missing dependencies. ensure_dependencies(Plugins) -> @@ -158,7 +208,7 @@ is_loadable(App) -> ok -> application:unload(App), true; _ -> false - end. + end. %%---------------------------------------------------------------------------- @@ -197,8 +247,9 @@ clean_plugin(Plugin, ExpandDir) -> delete_recursively(rabbit_misc:format("~s/~s", [ExpandDir, Plugin])). prepare_dir_plugin(PluginAppDescPath) -> - code:add_path(filename:dirname(PluginAppDescPath)), - list_to_atom(filename:basename(PluginAppDescPath, ".app")). + rabbit_log:info("Plugins: adding \"~s\" to the beginning of the code path.~n", + [filename:dirname(PluginAppDescPath)]), + code:add_patha(filename:dirname(PluginAppDescPath)). %%---------------------------------------------------------------------------- -- cgit v1.2.1 From 6c026c7445f76b0a7a20cd4d0af21298f20fc930 Mon Sep 17 00:00:00 2001 From: Simon MacMullen Date: Wed, 26 Nov 2014 14:34:58 +0000 Subject: deb / RPM changelogs --- packaging/RPMS/Fedora/rabbitmq-server.spec | 3 +++ packaging/debs/Debian/debian/changelog | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/packaging/RPMS/Fedora/rabbitmq-server.spec b/packaging/RPMS/Fedora/rabbitmq-server.spec index f363bce6..c77d7e9d 100644 --- a/packaging/RPMS/Fedora/rabbitmq-server.spec +++ b/packaging/RPMS/Fedora/rabbitmq-server.spec @@ -130,6 +130,9 @@ done rm -rf %{buildroot} %changelog +* Wed Nov 26 2014 simon@rabbitmq.com 3.4.2-1 +- New Upstream Release + * Wed Oct 29 2014 simon@rabbitmq.com 3.4.1-1 - New Upstream Release diff --git a/packaging/debs/Debian/debian/changelog b/packaging/debs/Debian/debian/changelog index a47caa0e..5e3744fd 100644 --- a/packaging/debs/Debian/debian/changelog +++ b/packaging/debs/Debian/debian/changelog @@ -1,3 +1,9 @@ +rabbitmq-server (3.4.2-1) unstable; urgency=low + + * New Upstream Release + + -- Simon MacMullen Wed, 26 Nov 2014 12:11:12 +0000 + rabbitmq-server (3.4.1-1) unstable; urgency=low * New Upstream Release -- cgit v1.2.1 From 15bc16942ca5a7132c289df204258ec234abc129 Mon Sep 17 00:00:00 2001 From: Simon MacMullen Date: Wed, 26 Nov 2014 16:45:25 +0000 Subject: Don't halt execution if RABBITMQ_CTL_ERL_ARGS is set. --- scripts/rabbitmq-env | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/rabbitmq-env b/scripts/rabbitmq-env index 63cfda3c..5a3e73bc 100755 --- a/scripts/rabbitmq-env +++ b/scripts/rabbitmq-env @@ -111,3 +111,6 @@ DEFAULT_NODE_PORT=5672 [ "x" = "x$RABBITMQ_CTL_ERL_ARGS" ] && RABBITMQ_CTL_ERL_ARGS=${CTL_ERL_ARGS} ##--- End of overridden variables + +# Since we source this elsewhere, don't accidentally stop execution +true -- cgit v1.2.1 From f8466d59675c57c89935c7b2466d7e3177440f40 Mon Sep 17 00:00:00 2001 From: Jean-Sebastien Pedron Date: Wed, 26 Nov 2014 18:21:44 +0100 Subject: Only check ERTS version to determine if we accept an external eldap --- src/rabbit_plugins.erl | 34 +++------------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/src/rabbit_plugins.erl b/src/rabbit_plugins.erl index fe614b6a..fd6acb5c 100644 --- a/src/rabbit_plugins.erl +++ b/src/rabbit_plugins.erl @@ -148,37 +148,9 @@ keep_plugin(#plugin{name = App} = Plugin) -> end. plugin_provided_by_otp(#plugin{name = eldap, version = PluginVsn}) -> - %% eldap was added to Erlang/OTP R15B01. We prefer this version - %% to the plugin. Before, eldap always advertised version "1". In - %% R15B01, it got proper versionning starting from "1.0". If eldap's - %% version is "1", we keep using the plugin, otherwise we take the - %% OTP application. As an extra check, we look at ERTS version to be - %% sure we're on R15B01. - case application:get_key(eldap, vsn) of - {ok, PluginVsn} -> - %% The plugin itself; the plugin was previously added to the - %% code path. - false; - {ok, "1"} -> - %% The version available on GitHub, not part of OTP. - false; - {ok, Vsn} -> - try rabbit_misc:version_compare(Vsn, "1.0", gte) of - true -> - %% Probably part of OTP. Let's check ERTS version to - %% be sure. - rabbit_misc:version_compare( - erlang:system_info(version), "5.9.1", gte); - false -> - %% Probably not part of OTP. Use the plugin to be safe. - false - catch - _:_ -> - %% Couldn't parse the version. It's not the OTP - %% application. - false - end - end; + %% eldap was added to Erlang/OTP R15B01 (ERTS 5.9.1). In this case, + %% we prefer this version to the plugin. + rabbit_misc:version_compare(erlang:system_info(version), "5.9.1", gte); plugin_provided_by_otp(_) -> false. -- cgit v1.2.1 From 6276d667177d915aff5c5b86e895a9b38b7ea2ae Mon Sep 17 00:00:00 2001 From: Jean-Sebastien Pedron Date: Thu, 27 Nov 2014 12:13:09 +0100 Subject: Check cluster consistency before joining a cluster Before this change, the node would join the cluster and the consistency would be only checked during application restart (rabbitmqctl start_app). This could lead to crash during the join due to Erlang, Mnesia or RabbitMQ incompatibilities. Now, the join_cluster command reports the problem. Here's an example: $ rabbitmqctl join_cluster Clustering node with ... Error: {inconsistent_cluster, "OTP version mismatch: local node is R16B03-1, remote node 17.3"} --- src/rabbit_mnesia.erl | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl index fa51dd70..b55d53b6 100644 --- a/src/rabbit_mnesia.erl +++ b/src/rabbit_mnesia.erl @@ -174,18 +174,25 @@ join_cluster(DiscoveryNode, NodeType) -> {ClusterNodes, _, _} = discover_cluster([DiscoveryNode]), case me_in_nodes(ClusterNodes) of false -> - %% reset the node. this simplifies things and it will be needed in - %% this case - we're joining a new cluster with new nodes which - %% are not in synch with the current node. I also lifts the burden - %% of reseting the node from the user. - reset_gracefully(), - - %% Join the cluster - rabbit_log:info("Clustering with ~p as ~p node~n", - [ClusterNodes, NodeType]), - ok = init_db_with_mnesia(ClusterNodes, NodeType, true, true), - rabbit_node_monitor:notify_joined_cluster(), - ok; + case check_cluster_consistency(DiscoveryNode, false) of + {ok, _} -> + %% reset the node. this simplifies things and it + %% will be needed in this case - we're joining a new + %% cluster with new nodes which are not in synch + %% with the current node. It also lifts the burden + %% of resetting the node from the user. + reset_gracefully(), + + %% Join the cluster + rabbit_log:info("Clustering with ~p as ~p node~n", + [ClusterNodes, NodeType]), + ok = init_db_with_mnesia(ClusterNodes, NodeType, + true, true), + rabbit_node_monitor:notify_joined_cluster(), + ok; + {error, Reason} -> + {error, Reason} + end; true -> rabbit_log:info("Already member of cluster: ~p~n", [ClusterNodes]), {ok, already_member} @@ -545,7 +552,7 @@ maybe_force_load() -> check_cluster_consistency() -> %% We want to find 0 or 1 consistent nodes. case lists:foldl( - fun (Node, {error, _}) -> check_cluster_consistency(Node); + fun (Node, {error, _}) -> check_cluster_consistency(Node, true); (_Node, {ok, Status}) -> {ok, Status} end, {error, not_found}, nodes_excl_me(cluster_nodes(all))) of @@ -575,17 +582,22 @@ check_cluster_consistency() -> throw(E) end. -check_cluster_consistency(Node) -> +check_cluster_consistency(Node, AlreadyFormed) -> case rpc:call(Node, rabbit_mnesia, node_info, []) of {badrpc, _Reason} -> {error, not_found}; {_OTP, _Rabbit, {error, _}} -> {error, not_found}; - {OTP, Rabbit, {ok, Status}} -> + {OTP, Rabbit, {ok, Status}} when AlreadyFormed -> case check_consistency(OTP, Rabbit, Node, Status) of {error, _} = E -> E; {ok, Res} -> {ok, Res} end; + {OTP, Rabbit, {ok, Status}} -> + case check_consistency(OTP, Rabbit) of + {error, _} = E -> E; + ok -> {ok, Status} + end; {_OTP, Rabbit, _Hash, _Status} -> %% delegate hash checking implies version mismatch version_error("Rabbit", rabbit_misc:version(), Rabbit) -- cgit v1.2.1 From f6cfb1a6195aca0af66652b918160dcce02ce34b Mon Sep 17 00:00:00 2001 From: Simon MacMullen Date: Fri, 28 Nov 2014 10:41:59 +0000 Subject: Rename this to indicate what it does, not the current circumstances in which it is called. --- src/rabbit_mnesia.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl index b55d53b6..80fbcd68 100644 --- a/src/rabbit_mnesia.erl +++ b/src/rabbit_mnesia.erl @@ -582,13 +582,13 @@ check_cluster_consistency() -> throw(E) end. -check_cluster_consistency(Node, AlreadyFormed) -> +check_cluster_consistency(Node, CheckNodesConsistency) -> case rpc:call(Node, rabbit_mnesia, node_info, []) of {badrpc, _Reason} -> {error, not_found}; {_OTP, _Rabbit, {error, _}} -> {error, not_found}; - {OTP, Rabbit, {ok, Status}} when AlreadyFormed -> + {OTP, Rabbit, {ok, Status}} when CheckNodesConsistency -> case check_consistency(OTP, Rabbit, Node, Status) of {error, _} = E -> E; {ok, Res} -> {ok, Res} -- cgit v1.2.1 From 57d48705f822d41ca7f13b3c72c94f1b4dd7b91b Mon Sep 17 00:00:00 2001 From: Jean-Sebastien Pedron Date: Fri, 28 Nov 2014 12:33:18 +0100 Subject: Remove "Plugins: adding to code path" log message --- src/rabbit_plugins.erl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/rabbit_plugins.erl b/src/rabbit_plugins.erl index fd6acb5c..3aad48d1 100644 --- a/src/rabbit_plugins.erl +++ b/src/rabbit_plugins.erl @@ -219,8 +219,6 @@ clean_plugin(Plugin, ExpandDir) -> delete_recursively(rabbit_misc:format("~s/~s", [ExpandDir, Plugin])). prepare_dir_plugin(PluginAppDescPath) -> - rabbit_log:info("Plugins: adding \"~s\" to the beginning of the code path.~n", - [filename:dirname(PluginAppDescPath)]), code:add_patha(filename:dirname(PluginAppDescPath)). %%---------------------------------------------------------------------------- -- cgit v1.2.1 From d6798de8bf063780787c87dc7c0d934d27452cab Mon Sep 17 00:00:00 2001 From: Jean-Sebastien Pedron Date: Fri, 28 Nov 2014 12:45:14 +0100 Subject: No need to load/unload the plugin around the plugin_provided_by_otp/1 check While here, remove the unused PluginVsn variable. --- src/rabbit_plugins.erl | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/src/rabbit_plugins.erl b/src/rabbit_plugins.erl index 3aad48d1..dfde5289 100644 --- a/src/rabbit_plugins.erl +++ b/src/rabbit_plugins.erl @@ -102,7 +102,8 @@ list(PluginsDir) -> _ -> rabbit_log:warning( "Problem reading some plugins: ~p~n", [Problems]) end, - Plugins = lists:filter(fun keep_plugin/1, AvailablePlugins), + Plugins = lists:filter(fun(P) -> not plugin_provided_by_otp(P) end, + AvailablePlugins), ensure_dependencies(Plugins). %% @doc Read the list of enabled plugins from the supplied term file. @@ -135,19 +136,7 @@ dependencies(Reverse, Sources, AllPlugins) -> %% For a few known cases, an externally provided plugin can be trusted. %% In this special case, it overrides the plugin. -keep_plugin(#plugin{name = App} = Plugin) -> - case application:load(App) of - {error, {already_loaded, _}} -> - not plugin_provided_by_otp(Plugin); - ok -> - Ret = not plugin_provided_by_otp(Plugin), - application:unload(App), - Ret; - _ -> - true - end. - -plugin_provided_by_otp(#plugin{name = eldap, version = PluginVsn}) -> +plugin_provided_by_otp(#plugin{name = eldap}) -> %% eldap was added to Erlang/OTP R15B01 (ERTS 5.9.1). In this case, %% we prefer this version to the plugin. rabbit_misc:version_compare(erlang:system_info(version), "5.9.1", gte); -- cgit v1.2.1 From bca1e2ea0903c8ce1c130a08a16f86eff5fb0e44 Mon Sep 17 00:00:00 2001 From: Jean-Sebastien Pedron Date: Wed, 3 Dec 2014 15:43:57 +0100 Subject: Throw an error if at least one plugin's module can't be loaded This prevents a plugin from being enabled if it won't be able to actually run later. A use case for this is a plugin built with Erlang version N, but executed on Erlang version M, where M isn't capable of running the bytecode from N. This was the case with Eralng R14B vs. R15B. The "rabbitmq-plugins enable " command reports the error and the plugin remains disabled. A node reports the error too and refuses to start, exactly as if the plugin was missing. --- src/rabbit_plugins.erl | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/rabbit_plugins.erl b/src/rabbit_plugins.erl index dfde5289..55f7359b 100644 --- a/src/rabbit_plugins.erl +++ b/src/rabbit_plugins.erl @@ -208,7 +208,27 @@ clean_plugin(Plugin, ExpandDir) -> delete_recursively(rabbit_misc:format("~s/~s", [ExpandDir, Plugin])). prepare_dir_plugin(PluginAppDescPath) -> - code:add_patha(filename:dirname(PluginAppDescPath)). + PluginEbinDir = filename:dirname(PluginAppDescPath), + Plugin = filename:basename(PluginAppDescPath, ".app"), + code:add_patha(PluginEbinDir), + case filelib:wildcard(PluginEbinDir++ "/*.beam") of + [] -> + ok; + [BeamPath | _] -> + Module = list_to_atom(filename:basename(BeamPath, ".beam")), + case code:ensure_loaded(Module) of + {module, _} -> + ok; + {error, badfile} -> + rabbit_log:error("Failed to enable plugin \"~s\": " + "it may have been built with an " + "incompatible (more recent?) " + "version of Erlang~n", [Plugin]), + throw({plugin_built_with_incompatible_erlang, Plugin}); + Error -> + throw({plugin_module_unloadable, Plugin, Error}) + end + end. %%---------------------------------------------------------------------------- -- cgit v1.2.1 From d7e77822640f57c6b8f6c5318985a604612f91d6 Mon Sep 17 00:00:00 2001 From: Simon MacMullen Date: Fri, 5 Dec 2014 11:24:14 +0000 Subject: Add top_memory_use() and top_binary_refs(). --- src/rabbit_diagnostics.erl | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/src/rabbit_diagnostics.erl b/src/rabbit_diagnostics.erl index bf45b757..3abd05a8 100644 --- a/src/rabbit_diagnostics.erl +++ b/src/rabbit_diagnostics.erl @@ -17,10 +17,11 @@ -module(rabbit_diagnostics). -define(PROCESS_INFO, - [current_stacktrace, initial_call, dictionary, message_queue_len, - links, monitors, monitored_by, heap_size]). + [registered_name, current_stacktrace, initial_call, dictionary, + message_queue_len, links, monitors, monitored_by, heap_size]). --export([maybe_stuck/0, maybe_stuck/1]). +-export([maybe_stuck/0, maybe_stuck/1, top_memory_use/0, top_memory_use/1, + top_binary_refs/0, top_binary_refs/1]). maybe_stuck() -> maybe_stuck(5000). @@ -75,5 +76,32 @@ maybe_stuck_stacktrace({_M, F, _A}) -> _ -> false end. +top_memory_use() -> top_memory_use(30). + +top_memory_use(Count) -> + Pids = processes(), + io:format("Memory use: top ~p of ~p processes.~n", [Count, length(Pids)]), + Procs = [{catch process_info(Pid, memory), catch info(Pid)} || Pid <- Pids], + Sorted = lists:sublist(lists:reverse(lists:sort(Procs)), Count), + io:format("~p~n", [Sorted]). + +top_binary_refs() -> top_binary_refs(30). + +top_binary_refs(Count) -> + Pids = processes(), + io:format("Binary refs: top ~p of ~p processes.~n", [Count, length(Pids)]), + Procs = [{binary_refs(Pid), catch info(Pid)} || Pid <- Pids], + Sorted = lists:sublist(lists:reverse(lists:sort(Procs)), Count), + io:format("~p~n", [Sorted]). + +binary_refs(Pid) -> + Refs = try + {binary, Rs} = process_info(Pid, binary), + Rs + catch _:badarg -> [] + end, + lists:sum([Sz || {_Ptr, Sz} <- lists:usort([{Ptr, Sz} || + {Ptr, Sz, _Cnt} <- Refs])]). + info(Pid) -> [{pid, Pid} | process_info(Pid, ?PROCESS_INFO)]. -- cgit v1.2.1 From 9bf5de793ba9ccc2b6fe437753b42995d3cf89af Mon Sep 17 00:00:00 2001 From: Simon MacMullen Date: Fri, 5 Dec 2014 11:32:17 +0000 Subject: Abstraction: safe process_info(). --- src/rabbit_diagnostics.erl | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/rabbit_diagnostics.erl b/src/rabbit_diagnostics.erl index 3abd05a8..9fc0fabd 100644 --- a/src/rabbit_diagnostics.erl +++ b/src/rabbit_diagnostics.erl @@ -42,13 +42,13 @@ maybe_stuck(Pids, Timeout) -> maybe_stuck(Pids2, Timeout - 500). looks_stuck(Pid) -> - case catch process_info(Pid, status) of + case info(Pid, status, gone) of {status, waiting} -> %% It's tempting to just check for message_queue_len > 0 %% here rather than mess around with stack traces and %% heuristics. But really, sometimes freshly stuck %% processes can have 0 messages... - case catch erlang:process_info(Pid, current_stacktrace) of + case info(Pid, current_stacktrace, gone) of {current_stacktrace, [H|_]} -> maybe_stuck_stacktrace(H); _ -> @@ -81,7 +81,7 @@ top_memory_use() -> top_memory_use(30). top_memory_use(Count) -> Pids = processes(), io:format("Memory use: top ~p of ~p processes.~n", [Count, length(Pids)]), - Procs = [{catch process_info(Pid, memory), catch info(Pid)} || Pid <- Pids], + Procs = [{info(Pid, memory, 0), info(Pid)} || Pid <- Pids], Sorted = lists:sublist(lists:reverse(lists:sort(Procs)), Count), io:format("~p~n", [Sorted]). @@ -90,18 +90,24 @@ top_binary_refs() -> top_binary_refs(30). top_binary_refs(Count) -> Pids = processes(), io:format("Binary refs: top ~p of ~p processes.~n", [Count, length(Pids)]), - Procs = [{binary_refs(Pid), catch info(Pid)} || Pid <- Pids], + Procs = [{{binary_refs, binary_refs(Pid)}, info(Pid)} || Pid <- Pids], Sorted = lists:sublist(lists:reverse(lists:sort(Procs)), Count), io:format("~p~n", [Sorted]). binary_refs(Pid) -> - Refs = try - {binary, Rs} = process_info(Pid, binary), - Rs - catch _:badarg -> [] - end, + {binary, Refs} = info(Pid, binary, []), lists:sum([Sz || {_Ptr, Sz} <- lists:usort([{Ptr, Sz} || {Ptr, Sz, _Cnt} <- Refs])]). info(Pid) -> - [{pid, Pid} | process_info(Pid, ?PROCESS_INFO)]. + [{pid, Pid} | info(Pid, ?PROCESS_INFO, [])]. + +info(Pid, Infos, Default) -> + try + process_info(Pid, Infos) + catch + _:_ -> case is_atom(Infos) of + true -> {Infos, Default}; + false -> Default + end + end. -- cgit v1.2.1 From 2a44901be5e4a70dad2523555996c5f552a9fbf7 Mon Sep 17 00:00:00 2001 From: Jean-Sebastien Pedron Date: Wed, 10 Dec 2014 10:55:12 +0100 Subject: Autoheal: Make sure Mnesia is stopped on all losers before they restart This works around a race in Mnesia where a starting loser would hang forever. This happens when a starting loser connects to another loser, negotiates the Mnesia protocol and attempts to acquire a write lock on the other node's schema. If the other nodes stops right between the protocol negotiation and the lock request, the starting node never receives an answer to its request. Before this fix, the hang occurred after at most 30 minutes looping on the partitions:autoheal test in rabbitmq-test. With the fix, RabbitMQ survived an all night long run. --- src/rabbit_autoheal.erl | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index 90458741..a4ec86bf 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -21,6 +21,8 @@ %% The named process we are running in. -define(SERVER, rabbit_node_monitor). +-define(MNESIA_STOPPED_PING_INTERNAL, 200). + %%---------------------------------------------------------------------------- %% In order to autoheal we want to: @@ -194,9 +196,36 @@ abort(Down, Notify) -> winner_finish(Notify). winner_finish(Notify) -> + %% There is a race in Mnesia causing a starting loser to hang + %% forever if another loser stops at the same time: the starting + %% node connects to the other node, negotiates the protocol and + %% attemps to acquire a write lock on the schema on the other node. + %% If the other node stops between the protocol negotiation and lock + %% request, the starting node never gets and answer to its lock + %% request. + %% + %% To workaround the problem, we make sure Mnesia is stopped on all + %% loosing nodes before sending the "autoheal_safe_to_start" signal. + wait_for_mnesia_shutdown(Notify), [{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify], not_healing. +wait_for_mnesia_shutdown([Node | Rest] = AllNodes) -> + case rpc:call(Node, mnesia, system_info, [is_running]) of + no -> + wait_for_mnesia_shutdown(Rest); + Running when + Running =:= yes orelse + Running =:= starting orelse + Running =:= stopping -> + timer:sleep(?MNESIA_STOPPED_PING_INTERNAL), + wait_for_mnesia_shutdown(AllNodes); + _ -> + wait_for_mnesia_shutdown(Rest) + end; +wait_for_mnesia_shutdown([]) -> + ok. + make_decision(AllPartitions) -> Sorted = lists:sort([{partition_value(P), P} || P <- AllPartitions]), [[Winner | _] | Rest] = lists:reverse([P || {_, P} <- Sorted]), -- cgit v1.2.1 From b1cad59e2ae0f67bda87eeaaeb3fbc8ed5a14c32 Mon Sep 17 00:00:00 2001 From: Simon MacMullen Date: Wed, 10 Dec 2014 15:28:42 +0000 Subject: Minor language corrections. --- src/rabbit_autoheal.erl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl index a4ec86bf..b5d64992 100644 --- a/src/rabbit_autoheal.erl +++ b/src/rabbit_autoheal.erl @@ -199,13 +199,13 @@ winner_finish(Notify) -> %% There is a race in Mnesia causing a starting loser to hang %% forever if another loser stops at the same time: the starting %% node connects to the other node, negotiates the protocol and - %% attemps to acquire a write lock on the schema on the other node. + %% attempts to acquire a write lock on the schema on the other node. %% If the other node stops between the protocol negotiation and lock - %% request, the starting node never gets and answer to its lock + %% request, the starting node never gets an answer to its lock %% request. %% - %% To workaround the problem, we make sure Mnesia is stopped on all - %% loosing nodes before sending the "autoheal_safe_to_start" signal. + %% To work around the problem, we make sure Mnesia is stopped on all + %% losing nodes before sending the "autoheal_safe_to_start" signal. wait_for_mnesia_shutdown(Notify), [{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify], not_healing. -- cgit v1.2.1