diff options
author | Emile Joubert <emile@rabbitmq.com> | 2012-10-26 13:36:00 +0100 |
---|---|---|
committer | Emile Joubert <emile@rabbitmq.com> | 2012-10-26 13:36:00 +0100 |
commit | 2f30a50915ffa167b18637ec9c60436e825b73b2 (patch) | |
tree | c57032840fc66ab23ac68ecb055b6eb0b4892d8e | |
parent | 839c886755d361caf8b909f958eedd74a42083ee (diff) | |
parent | b9c8164436a44c771b4b456ffbedebb9c042dd2a (diff) | |
download | rabbitmq-server-2f30a50915ffa167b18637ec9c60436e825b73b2.tar.gz |
Merged bug25243 into default
-rw-r--r-- | src/rabbit_amqqueue_process.erl | 8 | ||||
-rw-r--r-- | src/rabbit_mnesia.erl | 154 | ||||
-rw-r--r-- | src/rabbit_runtime_parameters.erl | 12 |
3 files changed, 89 insertions, 85 deletions
diff --git a/src/rabbit_amqqueue_process.erl b/src/rabbit_amqqueue_process.erl index 68f95778..8d05a78c 100644 --- a/src/rabbit_amqqueue_process.erl +++ b/src/rabbit_amqqueue_process.erl @@ -1309,11 +1309,11 @@ handle_info(drop_expired, State) -> noreply(drop_expired_messages(State#q{ttl_timer_ref = undefined})); handle_info(emit_stats, State) -> - %% Do not invoke noreply as it would see no timer and create a new one. emit_stats(State), - State1 = rabbit_event:reset_stats_timer(State, #q.stats_timer), - assert_invariant(State1), - {noreply, State1, hibernate}; + {noreply, State1, Timeout} = noreply(State), + %% Need to reset *after* we've been through noreply/1 so we do not + %% just create another timer always and therefore never hibernate + {noreply, rabbit_event:reset_stats_timer(State1, #q.stats_timer), Timeout}; handle_info({'DOWN', _MonitorRef, process, DownPid, _Reason}, State = #q{q = #amqqueue{exclusive_owner = DownPid}}) -> diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl index 04ac0904..942048f9 100644 --- a/src/rabbit_mnesia.erl +++ b/src/rabbit_mnesia.erl @@ -108,8 +108,24 @@ init() -> ok. init_from_config() -> - {ok, {TryNodes, NodeType}} = - application:get_env(rabbit, cluster_nodes), + {TryNodes, NodeType} = + case application:get_env(rabbit, cluster_nodes) of + {ok, Nodes} when is_list(Nodes) -> + Config = {Nodes -- [node()], case lists:member(node(), Nodes) of + true -> disc; + false -> ram + end}, + error_logger:warning_msg( + "Converting legacy 'cluster_nodes' configuration~n ~w~n" + "to~n ~w.~n~n" + "Please update the configuration to the new format " + "{Nodes, NodeType}, where Nodes contains the nodes that the " + "node will try to cluster with, and NodeType is either " + "'disc' or 'ram'~n", [Nodes, Config]), + Config; + {ok, Config} -> + Config + end, case find_good_node(nodes_excl_me(TryNodes)) of {ok, Node} -> rabbit_log:info("Node '~p' selected for clustering from " @@ -158,7 +174,7 @@ join_cluster(DiscoveryNode, NodeType) -> %% this case - we're joining a new cluster with new nodes which %% are not in synch with the current node. I also lifts the burden %% of reseting the node from the user. - reset(false), + reset_gracefully(), %% Join the cluster rabbit_misc:local_info_msg("Clustering with ~p as ~p node~n", @@ -172,39 +188,35 @@ join_cluster(DiscoveryNode, NodeType) -> %% cluster, has no cluster configuration, no local database, and no %% persisted messages reset() -> + ensure_mnesia_not_running(), rabbit_misc:local_info_msg("Resetting Rabbit~n", []), - reset(false). + reset_gracefully(). force_reset() -> + ensure_mnesia_not_running(), rabbit_misc:local_info_msg("Resetting Rabbit forcefully~n", []), - reset(true). + wipe(). + +reset_gracefully() -> + AllNodes = cluster_nodes(all), + %% Reconnecting so that we will get an up to date nodes. We don't + %% need to check for consistency because we are resetting. + %% Force=true here so that reset still works when clustered with a + %% node which is down. + init_db_with_mnesia(AllNodes, node_type(), false, false), + case is_only_clustered_disc_node() of + true -> e(resetting_only_disc_node); + false -> ok + end, + leave_cluster(), + rabbit_misc:ensure_ok(mnesia:delete_schema([node()]), cannot_delete_schema), + wipe(). -reset(Force) -> - ensure_mnesia_not_running(), - Nodes = case Force of - true -> - nodes(); - false -> - AllNodes = cluster_nodes(all), - %% Reconnecting so that we will get an up to date - %% nodes. We don't need to check for consistency - %% because we are resetting. Force=true here so - %% that reset still works when clustered with a - %% node which is down. - init_db_with_mnesia(AllNodes, node_type(), false, false), - case is_only_clustered_disc_node() of - true -> e(resetting_only_disc_node); - false -> ok - end, - leave_cluster(), - rabbit_misc:ensure_ok(mnesia:delete_schema([node()]), - cannot_delete_schema), - cluster_nodes(all) - end, +wipe() -> %% We need to make sure that we don't end up in a distributed %% Erlang system with nodes while not being in an Mnesia cluster %% with them. We don't handle that well. - [erlang:disconnect_node(N) || N <- Nodes], + [erlang:disconnect_node(N) || N <- cluster_nodes(all)], %% remove persisted messages and any other garbage we find ok = rabbit_file:recursive_delete(filelib:wildcard(dir() ++ "/*")), ok = rabbit_node_monitor:reset_cluster_status(), @@ -221,7 +233,9 @@ change_cluster_node_type(Type) -> {ok, Status} -> Status; {error, _Reason} -> e(cannot_connect_to_cluster) end, - Node = case RunningNodes of + %% We might still be marked as running by a remote node since the + %% information of us going down might not have propagated yet. + Node = case RunningNodes -- [node()] of [] -> e(no_online_cluster_nodes); [Node0|_] -> Node0 end, @@ -276,18 +290,18 @@ forget_cluster_node(Node, RemoveWhenOffline) -> end. remove_node_offline_node(Node) -> - %% We want the running nodes *now*, so we don't call - %% `cluster_nodes(running)' which will just get what's in the cluster status - %% file. - case {running_nodes(cluster_nodes(all)) -- [Node], node_type()} of + %% Here `mnesia:system_info(running_db_nodes)' will RPC, but that's what we + %% want - we need to know the running nodes *now*. If the current node is a + %% RAM node it will return bogus results, but we don't care since we only do + %% this operation from disc nodes. + case {mnesia:system_info(running_db_nodes) -- [Node], node_type()} of {[], disc} -> - %% Note that while we check if the nodes was the last to - %% go down, apart from the node we're removing from, this - %% is still unsafe. Consider the situation in which A and - %% B are clustered. A goes down, and records B as the - %% running node. Then B gets clustered with C, C goes down - %% and B goes down. In this case, C is the second-to-last, - %% but we don't know that and we'll remove B from A + %% Note that while we check if the nodes was the last to go down, + %% apart from the node we're removing from, this is still unsafe. + %% Consider the situation in which A and B are clustered. A goes + %% down, and records B as the running node. Then B gets clustered + %% with C, C goes down and B goes down. In this case, C is the + %% second-to-last, but we don't know that and we'll remove B from A %% anyway, even if that will lead to bad things. case cluster_nodes(running) -- [node(), Node] of [] -> start_mnesia(), @@ -339,7 +353,7 @@ cluster_nodes(WhichNodes) -> cluster_status(WhichNodes). %% This function is the actual source of information, since it gets %% the data from mnesia. Obviously it'll work only when mnesia is %% running. -mnesia_nodes() -> +cluster_status_from_mnesia() -> case mnesia:system_info(is_running) of no -> {error, mnesia_not_running}; @@ -359,39 +373,33 @@ mnesia_nodes() -> disc -> nodes_incl_me(DiscCopies); ram -> DiscCopies end, - {ok, {AllNodes, DiscNodes}}; + %% `mnesia:system_info(running_db_nodes)' is safe since + %% we know that mnesia is running + RunningNodes = mnesia:system_info(running_db_nodes), + {ok, {AllNodes, DiscNodes, RunningNodes}}; false -> {error, tables_not_present} end end. cluster_status(WhichNodes) -> - %% I don't want to call `running_nodes/1' unless if necessary, since it's - %% pretty expensive. - {AllNodes1, DiscNodes1, RunningNodesThunk} = - case mnesia_nodes() of - {ok, {AllNodes, DiscNodes}} -> - {AllNodes, DiscNodes, fun() -> running_nodes(AllNodes) end}; + {AllNodes, DiscNodes, RunningNodes} = Nodes = + case cluster_status_from_mnesia() of + {ok, Nodes0} -> + Nodes0; {error, _Reason} -> - {AllNodes, DiscNodes, RunningNodes} = + {AllNodes0, DiscNodes0, RunningNodes0} = rabbit_node_monitor:read_cluster_status(), %% The cluster status file records the status when the node is %% online, but we know for sure that the node is offline now, so %% we can remove it from the list of running nodes. - {AllNodes, DiscNodes, fun() -> nodes_excl_me(RunningNodes) end} + {AllNodes0, DiscNodes0, nodes_excl_me(RunningNodes0)} end, case WhichNodes of - status -> {AllNodes1, DiscNodes1, RunningNodesThunk()}; - all -> AllNodes1; - disc -> DiscNodes1; - ram -> AllNodes1 -- DiscNodes1; - running -> RunningNodesThunk() - end. - -cluster_status_from_mnesia() -> - case mnesia_nodes() of - {ok, {AllNodes, DiscNodes}} -> {ok, {AllNodes, DiscNodes, - running_nodes(AllNodes)}}; - {error, _} = Err -> Err + status -> Nodes; + all -> AllNodes; + disc -> DiscNodes; + ram -> AllNodes -- DiscNodes; + running -> RunningNodes end. node_info() -> @@ -726,14 +734,6 @@ change_extra_db_nodes(ClusterNodes0, CheckOtherNodes) -> Nodes end. -%% We're not using `mnesia:system_info(running_db_nodes)' directly -%% because if the node is a RAM node it won't know about other nodes -%% when mnesia is stopped -running_nodes(Nodes) -> - {Replies, _BadNodes} = rpc:multicall(Nodes, - rabbit_mnesia, is_running_remote, []), - [Node || {Running, Node} <- Replies, Running]. - is_running_remote() -> {mnesia:system_info(is_running) =:= yes, node()}. check_consistency(OTP, Rabbit) -> @@ -817,21 +817,23 @@ e(Tag) -> throw({error, {Tag, error_description(Tag)}}). error_description(clustering_only_disc_node) -> "You cannot cluster a node if it is the only disc node in its existing " " cluster. If new nodes joined while this node was offline, use " - "\"update_cluster_nodes\" to add them manually."; + "'update_cluster_nodes' to add them manually."; error_description(resetting_only_disc_node) -> "You cannot reset a node when it is the only disc node in a cluster. " "Please convert another node of the cluster to a disc node first."; error_description(already_clustered) -> - "You are already clustered with the nodes you have selected."; + "You are already clustered with the nodes you have selected. If the " + "node you are trying to cluster with is not present in the current " + "node status, use 'update_cluster_nodes'."; error_description(not_clustered) -> "Non-clustered nodes can only be disc nodes."; error_description(cannot_connect_to_cluster) -> "Could not connect to the cluster nodes present in this node's " "status file. If the cluster has changed, you can use the " - "\"update_cluster_nodes\" command to point to the new cluster nodes."; + "'update_cluster_nodes' command to point to the new cluster nodes."; error_description(no_online_cluster_nodes) -> "Could not find any online cluster nodes. If the cluster has changed, " - "you can use the 'recluster' command."; + "you can use the 'update_cluster_nodes' command."; error_description(cannot_connect_to_node) -> "Could not connect to the cluster node provided."; error_description(inconsistent_cluster) -> @@ -846,11 +848,11 @@ error_description(offline_node_no_offline_flag) -> "but can be done with the --offline flag. Please consult the manual " "for rabbitmqctl for more information."; error_description(not_last_node_to_go_down) -> - "The node you're trying to remove from was not the last to go down " + "The node you are trying to remove from was not the last to go down " "(excluding the node you are removing). Please use the the last node " "to go down to remove nodes when the cluster is offline."; error_description(removing_node_from_offline_node) -> - "To remove a node remotely from an offline node, the node you're removing " + "To remove a node remotely from an offline node, the node you are removing " "from must be a disc node and all the other nodes must be offline."; error_description(no_running_cluster_nodes) -> "You cannot leave a cluster if no online nodes are present.". diff --git a/src/rabbit_runtime_parameters.erl b/src/rabbit_runtime_parameters.erl index 4a83e61f..49060409 100644 --- a/src/rabbit_runtime_parameters.erl +++ b/src/rabbit_runtime_parameters.erl @@ -39,14 +39,16 @@ -spec(clear_any/3 :: (rabbit_types:vhost(), binary(), binary()) -> ok_or_error_string()). -spec(list/0 :: () -> [rabbit_types:infos()]). --spec(list/1 :: (rabbit_types:vhost()) -> [rabbit_types:infos()]). --spec(list_strict/1 :: (binary()) -> [rabbit_types:infos()] | 'not_found'). --spec(list/2 :: (rabbit_types:vhost(), binary()) -> [rabbit_types:infos()]). --spec(list_strict/2 :: (rabbit_types:vhost(), binary()) +-spec(list/1 :: (rabbit_types:vhost() | '_') -> [rabbit_types:infos()]). +-spec(list_strict/1 :: (binary() | '_') + -> [rabbit_types:infos()] | 'not_found'). +-spec(list/2 :: (rabbit_types:vhost() | '_', binary() | '_') + -> [rabbit_types:infos()]). +-spec(list_strict/2 :: (rabbit_types:vhost() | '_', binary() | '_') -> [rabbit_types:infos()] | 'not_found'). -spec(list_formatted/1 :: (rabbit_types:vhost()) -> [rabbit_types:infos()]). -spec(lookup/3 :: (rabbit_types:vhost(), binary(), binary()) - -> rabbit_types:infos()). + -> rabbit_types:infos() | 'not_found'). -spec(value/3 :: (rabbit_types:vhost(), binary(), binary()) -> term()). -spec(value/4 :: (rabbit_types:vhost(), binary(), binary(), term()) -> term()). -spec(info_keys/0 :: () -> rabbit_types:info_keys()). |