%% The contents of this file are subject to the Mozilla Public License
%% Version 1.1 (the "License"); you may not use this file except in
%% compliance with the License. You may obtain a copy of the License
%% at https://www.mozilla.org/MPL/
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and
%% limitations under the License.
%%
%% The Original Code is RabbitMQ.
%%
%% The Initial Developer of the Original Code is GoPivotal, Inc.
%% Copyright (c) 2007-2020 Pivotal Software, Inc. All rights reserved.
%%

-module(rabbit_autoheal).

-export([init/0, enabled/0, maybe_start/1, rabbit_down/2, node_down/2,
         handle_msg/3, process_down/2]).

%% The named process we are running in.
-define(SERVER, rabbit_node_monitor).

-define(MNESIA_STOPPED_PING_INTERNAL, 200).

-define(AUTOHEAL_STATE_AFTER_RESTART, rabbit_autoheal_state_after_restart).

%%----------------------------------------------------------------------------

%% In order to autoheal we want to:
%%
%% * Find the winning partition
%% * Stop all nodes in other partitions
%% * Wait for them all to be stopped
%% * Start them again
%%
%% To keep things simple, we assume all nodes are up. We don't start
%% unless all nodes are up, and if a node goes down we abandon the
%% whole process. To further keep things simple we also defer the
%% decision as to the winning node to the "leader" - arbitrarily
%% selected as the first node in the cluster.
%%
%% To coordinate the restarting nodes we pick a special node from the
%% winning partition - the "winner". Restarting nodes then stop, and
%% wait for it to tell them it is safe to start again. The winner
%% determines that a node has stopped just by seeing if its rabbit app
%% stops - if a node stops for any other reason it just gets a message
%% it will ignore, and otherwise we carry on.
%%
%% Meanwhile, the leader may continue to receive new autoheal requests:
%% all of them are ignored. The winner notifies the leader when the
%% current autoheal process is finished (ie. when all losers stopped and
%% were asked to start again) or was aborted. When the leader receives
%% the notification, or if it loses contact with the winner, it can
%% accept new autoheal requests.
%%
%% The winner and the leader are not necessarily the same node.
%%
%% The leader can be a loser and will restart in this case. It remembers
%% there is an autoheal in progress by temporarily saving the autoheal
%% state to the application environment.
%%
%% == Possible states ==
%%
%% not_healing
%%   - the default
%%
%% {winner_waiting, OutstandingStops, Notify}
%%   - we are the winner and are waiting for all losing nodes to stop
%%     before telling them they can restart
%%
%% {leader_waiting, Winner, Notify}
%%   - we are the leader, and have already assigned the winner and losers.
%%     We are waiting for a confirmation from the winner that the autoheal
%%     process has ended. Meanwhile we can ignore autoheal requests.
%%     Because we may be a loser too, this state is saved to the application
%%     environment and restored on startup.
%%
%% restarting
%%   - we are restarting. Of course the node monitor immediately dies
%%     then so this state does not last long. We therefore send the
%%     autoheal_safe_to_start message to the rabbit_outside_app_process
%%     instead.
%%
%% == Message flow ==
%%
%% 1. Any node (leader included) >> {request_start, node()} >> Leader
%%      When Mnesia detects it is running partitioned or
%%      when a remote node starts, rabbit_node_monitor calls
%%      rabbit_autoheal:maybe_start/1. The message above is sent to the
%%      leader so the leader can take a decision.
%%
%% 2. Leader >> {become_winner, Losers} >> Winner
%%      The leader notifies the winner so the latter can proceed with
%%      the autoheal.
%%
%% 3. Winner >> {winner_is, Winner} >> All losers
%%      The winner notifies losers they must stop.
%%
%% 4. Winner >> autoheal_safe_to_start >> All losers
%%      When either all losers stopped or the autoheal process was
%%      aborted, the winner notifies losers they can start again.
%%
%% 5. Leader >> report_autoheal_status >> Winner
%%      The leader asks the winner for the autoheal status. This only
%%      happens when the leader is a loser too. If this is not the case,
%%      this message is never sent.
%%
%% 6. Winner >> {autoheal_finished, Winner} >> Leader
%%      The winner notifies the leader that the autoheal process was
%%      either finished or aborted (ie. autoheal_safe_to_start was sent
%%      to losers).
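%%
%% == Example run (illustrative only) ==
%%
%% The node names below are hypothetical. Suppose a cluster of
%% [rabbit@a, rabbit@b, rabbit@c] partitions into [rabbit@a, rabbit@b]
%% and [rabbit@c]. The leader is rabbit@a, the first node in sorted
%% order. Every node that notices the partition sends
%% {request_start, node()} to rabbit@a. Assuming the
%% [rabbit@a, rabbit@b] side holds more client connections,
%% make_decision/1 picks the winner from it - say rabbit@a - which
%% enters {winner_waiting, [rabbit@c], [rabbit@c]}. rabbit@c is told
%% {winner_is, rabbit@a}, stops its rabbit application, and starts
%% again once it receives autoheal_safe_to_start.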
%%----------------------------------------------------------------------------

init() ->
    %% We check the application environment for an autoheal state saved
    %% during a restart. If this node is a leader, it is used to
    %% determine if it needs to ask the winner to report about the
    %% autoheal progress.
    State = case application:get_env(rabbit, ?AUTOHEAL_STATE_AFTER_RESTART) of
                {ok, S}   -> S;
                undefined -> not_healing
            end,
    ok = application:unset_env(rabbit, ?AUTOHEAL_STATE_AFTER_RESTART),
    case State of
        {leader_waiting, Winner, _} ->
            rabbit_log:info(
              "Autoheal: in progress, requesting report from ~p~n", [Winner]),
            send(Winner, report_autoheal_status);
        _ ->
            ok
    end,
    State.

maybe_start(not_healing) ->
    case enabled() of
        true  -> Leader = leader(),
                 send(Leader, {request_start, node()}),
                 rabbit_log:info("Autoheal request sent to ~p~n", [Leader]),
                 not_healing;
        false -> not_healing
    end;
maybe_start(State) ->
    State.

enabled() ->
    case application:get_env(rabbit, cluster_partition_handling) of
        {ok, autoheal}                         -> true;
        {ok, {pause_if_all_down, _, autoheal}} -> true;
        _                                      -> false
    end.
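%% For illustration only: enabled/0 above reads the
%% cluster_partition_handling key of the 'rabbit' application
%% environment. In the classic Erlang-term configuration format this
%% corresponds to, for example (the node list is hypothetical):
%%
%%   [{rabbit, [{cluster_partition_handling, autoheal}]}].
%%
%% or, for the pause_if_all_down variant with autoheal recovery:
%%
%%   [{rabbit, [{cluster_partition_handling,
%%               {pause_if_all_down, ['rabbit@node1'], autoheal}}]}].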
leader() ->
    [Leader | _] = lists:usort(rabbit_mnesia:cluster_nodes(all)),
    Leader.

%% This is the winner receiving its last notification that a node has
%% stopped - all nodes can now start again
rabbit_down(Node, {winner_waiting, [Node], Notify}) ->
    rabbit_log:info("Autoheal: final node has stopped, starting...~n",[]),
    winner_finish(Notify);

rabbit_down(Node, {winner_waiting, WaitFor, Notify}) ->
    {winner_waiting, WaitFor -- [Node], Notify};

rabbit_down(Winner, {leader_waiting, Winner, Losers}) ->
    abort([Winner], Losers);

rabbit_down(_Node, State) ->
    %% Ignore. Either:
    %%     o  we already cancelled the autoheal process;
    %%     o  we are still waiting for the winner's report.
    State.

node_down(_Node, not_healing) ->
    not_healing;

node_down(Node, {winner_waiting, _, Notify}) ->
    abort([Node], Notify);

node_down(Node, {leader_waiting, Node, _Notify}) ->
    %% The winner went down, we don't know what to do so we simply abort.
    rabbit_log:info("Autoheal: aborting - winner ~p went down~n", [Node]),
    not_healing;

node_down(Node, {leader_waiting, _, _} = St) ->
    %% If it is a partial partition, the winner might continue with the
    %% healing process. If it is a full partition, the winner will also
    %% see it and abort. Let's wait for it.
    rabbit_log:info("Autoheal: ~p went down, waiting for winner decision ~n",
                    [Node]),
    St;

node_down(Node, _State) ->
    rabbit_log:info("Autoheal: aborting - ~p went down~n", [Node]),
    not_healing.

%% If the process that has to restart the node crashes for an unexpected
%% reason, we go back to a not healing state so the node is able to recover.
process_down({'EXIT', Pid, Reason}, {restarting, Pid}) when Reason =/= normal ->
    rabbit_log:info("Autoheal: aborting - the process responsible for restarting the "
                    "node terminated with reason: ~p~n", [Reason]),
    not_healing;

process_down(_, State) ->
    State.

%% By receiving this message we become the leader
%% TODO should we try to debounce this?
handle_msg({request_start, Node}, not_healing, Partitions) ->
    rabbit_log:info("Autoheal request received from ~p~n", [Node]),
    case check_other_nodes(Partitions) of
        {error, E} ->
            rabbit_log:info("Autoheal request denied: ~s~n", [fmt_error(E)]),
            not_healing;
        {ok, AllPartitions} ->
            {Winner, Losers} = make_decision(AllPartitions),
            rabbit_log:info("Autoheal decision~n"
                            "  * Partitions: ~p~n"
                            "  * Winner:     ~p~n"
                            "  * Losers:     ~p~n",
                            [AllPartitions, Winner, Losers]),
            case node() =:= Winner of
                true  -> handle_msg({become_winner, Losers},
                                    not_healing, Partitions);
                false -> send(Winner, {become_winner, Losers}),
                         {leader_waiting, Winner, Losers}
            end
    end;

handle_msg({request_start, Node}, State, _Partitions) ->
    rabbit_log:info("Autoheal request received from ~p when healing; "
                    "ignoring~n", [Node]),
    State;

handle_msg({become_winner, Losers}, not_healing, _Partitions) ->
    rabbit_log:info("Autoheal: I am the winner, waiting for ~p to stop~n",
                    [Losers]),
    stop_partition(Losers);

handle_msg({become_winner, Losers}, {winner_waiting, _, Losers}, _Partitions) ->
    %% The leader has aborted the healing, might have seen us down but
    %% we didn't see the same. Let's try again as it is the same partition.
    rabbit_log:info("Autoheal: I am the winner and received a duplicated "
                    "request, waiting again for ~p to stop~n", [Losers]),
    stop_partition(Losers);

handle_msg({become_winner, _}, {winner_waiting, _, Losers}, _Partitions) ->
    %% Something has happened to the leader, it might have seen us down but we
    %% are still alive. Partitions have changed, cannot continue.
    rabbit_log:info("Autoheal: I am the winner and received another healing "
                    "request, partitions have changed to ~p. Aborting ~n",
                    [Losers]),
    winner_finish(Losers),
    not_healing;

handle_msg({winner_is, Winner}, State = not_healing, _Partitions) ->
    %% This node is a loser, nothing else.
    Pid = restart_loser(State, Winner),
    {restarting, Pid};
handle_msg({winner_is, Winner}, State = {leader_waiting, Winner, _},
           _Partitions) ->
    %% This node is the leader and a loser at the same time.
    Pid = restart_loser(State, Winner),
    {restarting, Pid};

handle_msg(Request, {restarting, Pid} = St, _Partitions) ->
    %% ignore, we can contribute no further
    rabbit_log:info("Autoheal: Received the request ~p while waiting for ~p "
                    "to restart the node. Ignoring it ~n", [Request, Pid]),
    St;

handle_msg(report_autoheal_status, not_healing, _Partitions) ->
    %% The leader is asking us (the winner) about the autoheal status.
    %% This happens when the leader is a loser and it just restarted.
    %% We are in the "not_healing" state, so the previous autoheal
    %% process ended: let's tell this to the leader.
    send(leader(), {autoheal_finished, node()}),
    not_healing;

handle_msg(report_autoheal_status, State, _Partitions) ->
    %% Like above, the leader is asking about the autoheal status. We
    %% are not finished with it. There is no need to send anything yet
    %% to the leader: we will send the notification when it is over.
    State;

handle_msg({autoheal_finished, Winner}, {leader_waiting, Winner, _},
           _Partitions) ->
    %% The winner is finished with the autoheal process and notified us
    %% (the leader). We can transition to the "not_healing" state and
    %% accept new requests.
    rabbit_log:info("Autoheal finished according to winner ~p~n", [Winner]),
    not_healing;

handle_msg({autoheal_finished, Winner}, not_healing, _Partitions)
  when Winner =:= node() ->
    %% We are the leader and the winner. The state already transitioned
    %% to "not_healing" at the end of the autoheal process.
    rabbit_log:info("Autoheal finished according to winner ~p~n", [node()]),
    not_healing;

handle_msg({autoheal_finished, Winner}, not_healing, _Partitions) ->
    %% We might have seen the winner down during a partial partition and
    %% transitioned to not_healing. However, the winner was still able
    %% to finish. Let it pass.
    rabbit_log:info("Autoheal finished according to winner ~p."
                    " Unexpected, I might have previously seen the winner down~n",
                    [Winner]),
    not_healing.

%%----------------------------------------------------------------------------

send(Node, Msg) -> {?SERVER, Node} ! {autoheal_msg, Msg}.

abort(Down, Notify) ->
    rabbit_log:info("Autoheal: aborting - ~p down~n", [Down]),
    %% Make sure any nodes waiting for us start - it won't necessarily
    %% heal the partition but at least they won't get stuck.
    %% If we are executing this, we are not stopping. Thus, don't wait
    %% for ourselves!
    winner_finish(Notify -- [node()]).

winner_finish(Notify) ->
    %% There is a race in Mnesia causing a starting loser to hang
    %% forever if another loser stops at the same time: the starting
    %% node connects to the other node, negotiates the protocol and
    %% attempts to acquire a write lock on the schema on the other node.
    %% If the other node stops between the protocol negotiation and lock
    %% request, the starting node never gets an answer to its lock
    %% request.
    %%
    %% To work around the problem, we make sure Mnesia is stopped on all
    %% losing nodes before sending the "autoheal_safe_to_start" signal.
    wait_for_mnesia_shutdown(Notify),
    [{rabbit_outside_app_process, N} ! autoheal_safe_to_start || N <- Notify],
    send(leader(), {autoheal_finished, node()}),
    not_healing.

%% This improves on the previous implementation, but could still potentially
%% enter an infinite loop. It is also possible that, by the time it finishes,
%% some of the nodes have been manually restarted; we can't do much about that
%% (apart from stopping them again), so let it continue and notify all the
%% losers to restart.
wait_for_mnesia_shutdown(AllNodes) ->
    Monitors = lists:foldl(fun(Node, Monitors0) ->
                                   pmon:monitor({mnesia_sup, Node}, Monitors0)
                           end, pmon:new(), AllNodes),
    wait_for_supervisors(Monitors).

wait_for_supervisors(Monitors) ->
    case pmon:is_empty(Monitors) of
        true ->
            ok;
        false ->
            receive
                {'DOWN', _MRef, process, {mnesia_sup, _} = I, _Reason} ->
                    wait_for_supervisors(pmon:erase(I, Monitors))
            after
                60000 ->
                    AliveLosers = [Node || {_, Node} <- pmon:monitored(Monitors)],
                    rabbit_log:info("Autoheal: Mnesia is still up on nodes ~p; "
                                    "sending the winner notification to them "
                                    "again~n", [AliveLosers]),
                    [send(L, {winner_is, node()}) || L <- AliveLosers],
                    wait_for_mnesia_shutdown(AliveLosers)
            end
    end.

restart_loser(State, Winner) ->
    rabbit_log:warning(
      "Autoheal: we were selected to restart; winner is ~p~n", [Winner]),
    NextStateTimeout = application:get_env(rabbit,
                                           autoheal_state_transition_timeout,
                                           60000),
    rabbit_node_monitor:run_outside_applications(
      fun () ->
              MRef = erlang:monitor(process, {?SERVER, Winner}),
              rabbit:stop(),
              NextState =
                  receive
                      {'DOWN', MRef, process, {?SERVER, Winner}, _Reason} ->
                          not_healing;
                      autoheal_safe_to_start ->
                          State
                  after NextStateTimeout ->
                          rabbit_log:warning(
                            "Autoheal: timed out waiting for a safe-to-start "
                            "message from the winner (~p); will retry",
                            [Winner]),
                          not_healing
                  end,
              erlang:demonitor(MRef, [flush]),
              %% During the restart, the autoheal state is lost so we
              %% store it in the application environment temporarily so
              %% init/0 can pick it up.
              %%
              %% This is useful to the leader which is a loser at the
              %% same time: because the leader is restarting, there is
              %% a good chance it will miss the "autoheal finished!"
              %% notification from the winner. Thanks to the saved
              %% state, it knows it needs to ask the winner whether the
              %% autoheal process is finished or not.
              application:set_env(rabbit,
                                  ?AUTOHEAL_STATE_AFTER_RESTART, NextState),
              rabbit:start()
      end, true).

make_decision(AllPartitions) ->
    Sorted = lists:sort([{partition_value(P), P} || P <- AllPartitions]),
    [[Winner | _] | Rest] = lists:reverse([P || {_, P} <- Sorted]),
    {Winner, lists:append(Rest)}.

partition_value(Partition) ->
    Connections = [Res || Node <- Partition,
                          Res <- [rpc:call(Node, rabbit_networking,
                                           connections_local, [])],
                          is_list(Res)],
    {length(lists:append(Connections)), length(Partition)}.
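%% Illustrative only (hypothetical node names and numbers): with
%% AllPartitions = [[rabbit@a, rabbit@b], [rabbit@c]], where the first
%% partition serves 10 client connections and the second serves 2,
%% partition_value/1 yields {10, 2} and {2, 1} respectively. Tuples are
%% compared element by element, so the partition with more connections
%% (and, on a tie, more nodes) sorts highest and make_decision/1
%% returns {rabbit@a, [rabbit@c]}: rabbit@a becomes the winner and
%% rabbit@c is the only loser.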
%% We have our local understanding of what partitions exist; but we
%% only know which nodes we have been partitioned from, not which
%% nodes are partitioned from each other.
check_other_nodes(LocalPartitions) ->
    Nodes = rabbit_mnesia:cluster_nodes(all),
    {Results, Bad} = rabbit_node_monitor:status(Nodes -- [node()]),
    RemotePartitions = [{Node, proplists:get_value(partitions, Res)}
                        || {Node, Res} <- Results],
    RemoteDown = [{Node, Down}
                  || {Node, Res} <- Results,
                     Down <- [Nodes -- proplists:get_value(nodes, Res)],
                     Down =/= []],
    case {Bad, RemoteDown} of
        {[], []} -> Partitions = [{node(), LocalPartitions} | RemotePartitions],
                    {ok, all_partitions(Partitions, [Nodes])};
        {[], _}  -> {error, {remote_down, RemoteDown}};
        {_,  _}  -> {error, {nodes_down, Bad}}
    end.

all_partitions([], Partitions) ->
    Partitions;
all_partitions([{Node, CantSee} | Rest], Partitions) ->
    {[Containing], Others} =
        lists:partition(fun (Part) -> lists:member(Node, Part) end, Partitions),
    A = Containing -- CantSee,
    B = Containing -- A,
    Partitions1 = case {A, B} of
                      {[], _}  -> Partitions;
                      {_,  []} -> Partitions;
                      _        -> [A, B | Others]
                  end,
    all_partitions(Rest, Partitions1).
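%% Illustrative only (hypothetical node names): starting from
%% Partitions = [[rabbit@a, rabbit@b, rabbit@c]], suppose rabbit@a
%% reports it cannot see [rabbit@c]. The partition containing rabbit@a
%% is split into A = [rabbit@a, rabbit@b] and B = [rabbit@c], giving
%% [[rabbit@a, rabbit@b], [rabbit@c]]. The remaining reports - rabbit@b
%% cannot see [rabbit@c], rabbit@c cannot see [rabbit@a, rabbit@b] -
%% split nothing further, so that is also the final result.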
fmt_error({remote_down, RemoteDown}) ->
    rabbit_misc:format("Remote nodes disconnected:~n ~p", [RemoteDown]);
fmt_error({nodes_down, NodesDown}) ->
    rabbit_misc:format("Local nodes down: ~p", [NodesDown]).

stop_partition(Losers) ->
    %% The leader said everything was ready - do we agree? If not then
    %% give up.
    Down = Losers -- rabbit_node_monitor:alive_rabbit_nodes(Losers),
    case Down of
        [] -> [send(L, {winner_is, node()}) || L <- Losers],
              {winner_waiting, Losers, Losers};
        _  -> abort(Down, Losers)
    end.
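%% For orientation only: this module runs no process of its own; it is a
%% set of callbacks driven by rabbit_node_monitor (the ?SERVER process).
%% A rough sketch of the expected call pattern, with made-up variable
%% names, is:
%%
%%   AState0 = rabbit_autoheal:init(),
%%   %% on a detected partition or remote node start-up:
%%   AState1 = rabbit_autoheal:maybe_start(AState0),
%%   %% on receiving {autoheal_msg, Msg} (see send/2 above):
%%   AState2 = rabbit_autoheal:handle_msg(Msg, AState1, Partitions),
%%   %% plus rabbit_down/2, node_down/2 and process_down/2 on the
%%   %% corresponding down notifications.
%%
%% The real wiring lives in rabbit_node_monitor; this sketch only shows
%% how the exported functions and the opaque autoheal state fit together.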