Diffstat (limited to 'deps/rabbit/src/gm.erl')
-rw-r--r-- | deps/rabbit/src/gm.erl | 1650 |
1 file changed, 1650 insertions, 0 deletions
diff --git a/deps/rabbit/src/gm.erl b/deps/rabbit/src/gm.erl
new file mode 100644
index 0000000000..af24a2958a
--- /dev/null
+++ b/deps/rabbit/src/gm.erl
@@ -0,0 +1,1650 @@
+%% This Source Code Form is subject to the terms of the Mozilla Public
+%% License, v. 2.0. If a copy of the MPL was not distributed with this
+%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
+%%
+%% Copyright (c) 2007-2020 VMware, Inc. or its affiliates. All rights reserved.
+%%
+
+-module(gm).
+
+%% Guaranteed Multicast
+%% ====================
+%%
+%% This module provides the ability to create named groups of
+%% processes to which members can be dynamically added and removed,
+%% and for messages to be broadcast within the group that are
+%% guaranteed to reach all members of the group during the lifetime of
+%% the message. The lifetime of a message is defined as being, at a
+%% minimum, the time from which the message is first sent to any
+%% member of the group, up until the time at which it is known by the
+%% member who published the message that the message has reached all
+%% group members.
+%%
+%% The guarantee given is that provided a message, once sent, makes it
+%% to members who do not all leave the group, the message will
+%% continue to propagate to all group members.
+%%
+%% Another way of stating the guarantee is that if member P publishes
+%% messages m and m', then for all members P', if P' is a member of
+%% the group prior to the publication of m, and P' receives m', then
+%% P' will receive m.
+%%
+%% Note that only local ordering is enforced: i.e. if member P sends
+%% message m and then message m', then for all members P', if P'
+%% receives m and m', then they will receive m' after m. Causal
+%% ordering is _not_ enforced. I.e. if member P receives message m
+%% and as a result publishes message m', there is no guarantee that
+%% other members P' will receive m before m'.
+%%
+%%
+%% API Use
+%% -------
+%%
+%% Mnesia must be started. Use the idempotent create_tables/0 function
+%% to create the tables required.
+%%
+%% start_link/4
+%% Provide the group name, the callback module name, any arguments
+%% you wish to be passed into the callback module's functions, and a
+%% function to be used for executing mnesia transactions. The
+%% joined/2 function will be called when we have joined the group,
+%% with the arguments passed to start_link and a list of the current
+%% members of the group. See the callback specs and the comments
+%% below for further details of the callback functions.
+%%
+%% leave/1
+%% Provide the Pid. Removes the Pid from the group. The callback
+%% handle_terminate/2 function will be called.
+%%
+%% broadcast/2
+%% Provide the Pid and a Message. The message will be sent to all
+%% members of the group as per the guarantees given above. This is a
+%% cast and the function call will return immediately. There is no
+%% guarantee that the message will reach any member of the group.
+%%
+%% confirmed_broadcast/2
+%% Provide the Pid and a Message. As per broadcast/2 except that this
+%% is a call, not a cast, and only returns 'ok' once the Message has
+%% reached every member of the group. Do not call
+%% confirmed_broadcast/2 directly from the callback module otherwise
+%% you will deadlock the entire group.
+%%
+%% info/1
+%% Provide the Pid. Returns a proplist with various facts, including
+%% the group name and the current group members.
+%%
+%% validate_members/2
+%% Check whether a given member list agrees with the chosen member's
+%% view. Any differences will be communicated via the members_changed
+%% callback. If there are no differences then there will be no reply.
+%% Note that members will not necessarily share the same view.
+%%
+%% forget_group/1
+%% Provide the group name. Removes its mnesia record. Makes no attempt
+%% to ensure the group is empty.
+%%
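%% For illustration only: a minimal callback module sketch showing the
%% shape of the API described above. The module name gm_demo and the
%% printed output are invented for this example; the four callbacks and
%% their return values follow the -callback specs further down.
-module(gm_demo).
-behaviour(gm).

-export([joined/2, members_changed/3, handle_msg/3, handle_terminate/2]).

joined(Args, Members) ->
    io:format("joined (~p): ~p~n", [Args, Members]),
    ok.

members_changed(_Args, Births, Deaths) ->
    io:format("births ~p, deaths ~p~n", [Births, Deaths]),
    ok.

handle_msg(_Args, From, Msg) ->
    %% Called for every broadcast, including our own publications.
    io:format("msg ~p from ~p~n", [Msg, From]),
    ok.

handle_terminate(_Args, Reason) ->
    io:format("terminating: ~p~n", [Reason]),
    ok.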
+%% Implementation Overview
+%% -----------------------
+%%
+%% One possible means of implementation would be a fan-out from the
+%% sender to every member of the group. This would require that the
+%% group is fully connected, and, in the event that the original
+%% sender of the message disappears from the group before the message
+%% has made it to every member of the group, raises questions as to
+%% who is responsible for sending on the message to new group members.
+%% In particular, the issue is with [ Pid ! Msg || Pid <- Members ] -
+%% if the sender dies part way through, who is responsible for
+%% ensuring that the remaining Members receive the Msg? In the event
+%% that within the group, messages sent are broadcast from a subset of
+%% the members, the fan-out arrangement has the potential to
+%% substantially impact the CPU and network workload of such members,
+%% as such members would have to accommodate the cost of sending each
+%% message to every group member.
+%%
+%% Instead, if the members of the group are arranged in a chain, then
+%% it becomes easier to reason about who within the group has received
+%% each message and who has not. It eases issues of responsibility: in
+%% the event of a group member disappearing, the nearest upstream
+%% member of the chain is responsible for ensuring that messages
+%% continue to propagate down the chain. It also results in equal
+%% distribution of sending and receiving workload, even if all
+%% messages are being sent from just a single group member. This
+%% configuration has the further advantage that it is not necessary
+%% for every group member to know of every other group member, and
+%% even that a group member does not have to be accessible from all
+%% other group members.
+%%
+%% Performance is kept high by permitting pipelining; all
+%% communication between joined group members is asynchronous. In the
+%% chain A -> B -> C -> D, if A sends a message to the group, it will
+%% not directly contact C or D. However, it must know that D receives
+%% the message (in addition to B and C) before it can consider the
+%% message fully sent. A simplistic implementation would require that
+%% D replies to C, C replies to B and B then replies to A. This would
+%% result in a propagation delay of twice the length of the chain. It
+%% would also require, in the event of the failure of C, that D knows
+%% to directly contact B and issue the necessary replies. Instead, the
+%% chain forms a ring: D sends the message on to A: D does not
+%% distinguish A as the sender, merely as the next member (downstream)
+%% within the chain (which has now become a ring). When A receives
+%% from D messages that A sent, it knows that all members have
+%% received the message. However, the message is not dead yet: if C
+%% died as B was sending to C, then B would need to detect the death
+%% of C and forward the message on to D instead: thus every node has
+%% to remember every message published until it is told that it can
+%% forget about the message. This is essential not just for dealing
+%% with failure of members, but also for the addition of new members.
+%%
+%% Thus once A receives the message back again, it then sends to B an
+%% acknowledgement for the message, indicating that B can now forget
+%% about the message. B does so, and forwards the ack to C. C forgets
+%% the message, and forwards the ack to D, which forgets the message
+%% and finally forwards the ack back to A. At this point, A takes no
+%% further action: the message and its acknowledgement have made it to
+%% every member of the group. The message is now dead, and any new
+%% member joining the group at this point will not receive the
+%% message.
+%%
+%% We therefore have two roles:
+%%
+%% 1. The sender, who upon receiving their own messages back, must
+%% then send out acknowledgements, and upon receiving their own
+%% acknowledgements back perform no further action.
+%%
+%% 2. The other group members, who upon receiving messages and
+%% acknowledgements must update their own internal state accordingly
+%% (the sending member must also do this in order to be able to
+%% accommodate failures), and forward messages on to their downstream
+%% neighbours.
+%%
+%%
+%% Implementation: It gets trickier
+%% --------------------------------
+%%
+%% Chain A -> B -> C -> D
+%%
+%% A publishes a message which B receives. A now dies. B and D will
+%% detect the death of A, and will link up, thus the chain is now B ->
+%% C -> D. B forwards A's message on to C, who forwards it to D, who
+%% forwards it to B. Thus B is now responsible for A's messages - both
+%% publications and acknowledgements that were in flight at the point
+%% at which A died. Even worse is that this is transitive: after B
+%% forwards A's message to C, B dies as well. Now C is not only
+%% responsible for B's in-flight messages, but is also responsible for
+%% A's in-flight messages.
+%%
+%% Lemma 1: A member can only determine which dead members they have
+%% inherited responsibility for if there is a total ordering on the
+%% conflicting additions and subtractions of members from the group.
+%%
+%% Consider the simultaneous death of B and addition of B' that
+%% transitions a chain from A -> B -> C to A -> B' -> C. Either B' or
+%% C is responsible for in-flight messages from B. It is easy to
+%% ensure that at least one of them thinks they have inherited B, but
+%% if we do not ensure that exactly one of them inherits B, then we
+%% could have B' converting publishes to acks, which then will crash C
+%% as C does not believe it has issued acks for those messages.
+%%
+%% More complex scenarios are easy to concoct: A -> B -> C -> D -> E
+%% becoming A -> C' -> E. Who has inherited which of B, C and D?
+%%
+%% However, for non-conflicting membership changes, only a partial
+%% ordering is required. For example, A -> B -> C becoming A -> A' ->
+%% B. The addition of A', between A and B, can have no conflicts with
+%% the death of C: it is clear that A has inherited C's messages.
+%%
+%% For ease of implementation, we adopt the simple solution of
+%% imposing a total order on all membership changes.
+%%
+%% On the death of a member, it is ensured that the dead member's
+%% neighbours become aware of the death, and the upstream neighbour
+%% now sends to its new downstream neighbour its state, including the
+%% messages pending acknowledgement. The downstream neighbour can then
+%% use this to calculate which publishes and acknowledgements it has
+%% missed out on, due to the death of its old upstream. Thus the
+%% downstream can catch up, and continues the propagation of messages
+%% through the group.
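%% A loose sketch of the two roles above, assuming a hypothetical
%% #demo{} record holding our own id and downstream neighbour, and
%% hypothetical send/2 and remember_pending/3 helpers. This is an
%% illustration only; the real logic lives in handle_msg/2 and
%% calculate_activity/5 further down.
on_publication(Sender, Msg, State = #demo{self = Self, right = Right}) ->
    case Sender =:= Self of
        true  -> %% our own message has completed the ring: start the ack
                 send(Right, {ack, Self, Msg}),
                 State;
        false -> %% someone else's message: remember it and pass it on
                 send(Right, {pub, Sender, Msg}),
                 remember_pending(Sender, Msg, State)
    end.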
+%%
+%% Lemma 2: When a member is joining, it must synchronously
+%% communicate with its upstream member in order to receive its
+%% starting state atomically with its addition to the group.
+%%
+%% New members must start with the same state as their nearest
+%% upstream neighbour. This ensures that they are not surprised by
+%% acknowledgements they are sent, and that should their downstream
+%% neighbour die, they are able to send the correct state to their new
+%% downstream neighbour to ensure it can catch up. Thus in the
+%% transition A -> B -> C becomes A -> A' -> B -> C becomes A -> A' ->
+%% C, A' must start with the state of A, so that it can send C the
+%% correct state when B dies, allowing C to detect any missed
+%% messages.
+%%
+%% If A' starts by adding itself to the group membership, A could then
+%% die, without A' having received the necessary state from A. This
+%% would leave A' responsible for in-flight messages from A, but
+%% having the least knowledge of all of those messages. Thus A' must
+%% start by synchronously calling A, which then immediately sends A'
+%% back its state. A then adds A' to the group. If A dies at this
+%% point then A' will be able to see this (as A' will fail to appear
+%% in the group membership), and thus A' will ignore the state it
+%% receives from A, and will simply repeat the process, trying to now
+%% join downstream from some other member. This ensures that should
+%% the upstream die as soon as the new member has been joined, the new
+%% member is guaranteed to receive the correct state, allowing it to
+%% correctly process messages inherited due to the death of its
+%% upstream neighbour.
+%%
+%% The canonical definition of the group membership is held by a
+%% distributed database. Whilst this allows the total ordering of
+%% changes to be achieved, it is nevertheless undesirable to have to
+%% query this database for the current view, upon receiving each
+%% message. Instead, we wish for members to be able to cache a view of
+%% the group membership, which then requires a cache invalidation
+%% mechanism. Each member maintains its own view of the group
+%% membership. Thus when the group's membership changes, members may
+%% need to become aware of such changes in order to be able to
+%% accurately process messages they receive. Because of the
+%% requirement of a total ordering of conflicting membership changes,
+%% it is not possible to use the guaranteed broadcast mechanism to
+%% communicate these changes: to achieve the necessary ordering, it
+%% would be necessary for such messages to be published by exactly one
+%% member, which cannot be guaranteed given that such a member could
+%% die.
+%%
+%% The total ordering we enforce on membership changes gives rise to a
+%% view version number: every change to the membership creates a
+%% different view, and the total ordering permits a simple
+%% monotonically increasing view version number.
+%%
+%% Lemma 3: If a message is sent from a member that holds view version
+%% N, it can be correctly processed by any member receiving the
+%% message with a view version >= N.
+%%
+%% Initially, let us suppose that each view contains the ordering of
+%% every member that was ever part of the group. Dead members are
+%% marked as such. Thus we have a ring of members, some of which are
+%% dead, and are thus inherited by the nearest alive downstream
+%% member.
+%%
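%% A sketch of the version check this scheme implies: every message
%% carries the sender's view version, and a receiver refreshes its
%% cached view from the database before interpreting the message if
%% its own version is older. maybe_refresh_view/3 is a hypothetical
%% helper written for this illustration; compare needs_view_update/2
%% and the {?TAG, ReqVer, Msg} handling further down.
maybe_refresh_view(ReqVer, {Ver, _} = _View, GroupName) when Ver < ReqVer ->
    %% our cached view is stale: rebuild it from the canonical copy
    group_to_view(dirty_read_group(GroupName));
maybe_refresh_view(_ReqVer, View, _GroupName) ->
    View.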
+%% In the chain A -> B -> C, all three members initially have view
+%% version 1, which reflects reality. B publishes a message, which is
+%% forwarded by C to A. B now dies, which A notices very quickly. Thus
+%% A updates the view, creating version 2. It now forwards B's
+%% publication, sending that message to its new downstream neighbour,
+%% C. This happens before C is aware of the death of B. C must become
+%% aware of the view change before it interprets the message it
+%% received, otherwise it will fail to learn of the death of B, and
+%% thus will not realise it has inherited B's messages (and will
+%% likely crash).
+%%
+%% Thus very simply, we have that each subsequent view contains more
+%% information than the preceding view.
+%%
+%% However, to avoid the views growing indefinitely, we need to be
+%% able to delete members which have died _and_ for which no messages
+%% are in-flight. This requires that upon inheriting a dead member, we
+%% know the last publication sent by the dead member (this is easy: we
+%% inherit a member because we are the nearest downstream member,
+%% which implies that we know at least as much as everyone else about
+%% the publications of the dead member), and we know the earliest
+%% message for which the acknowledgement is still in flight.
+%%
+%% In the chain A -> B -> C, when B dies, A will send to C its state
+%% (as C is the new downstream from A), allowing C to calculate which
+%% messages it has missed out on (described above). At this point, C
+%% also inherits B's messages. If that state from A also includes the
+%% last message published by B for which an acknowledgement has been
+%% seen, then C knows exactly which further acknowledgements it must
+%% receive (also including issuing acknowledgements for publications
+%% still in-flight that it receives), after which it is known there
+%% are no more messages in flight for B, thus all evidence that B was
+%% ever part of the group can be safely removed from the canonical
+%% group membership.
+%%
+%% Thus, for every message that a member sends, it includes with that
+%% message its view version. When a member receives a message it will
+%% update its view from the canonical copy, should its view be older
+%% than the view version included in the message it has received.
+%%
+%% The state held by each member therefore includes the messages from
+%% each publisher pending acknowledgement, the last publication seen
+%% from that publisher, and the last acknowledgement from that
+%% publisher. In the case of the member's own publications or
+%% inherited members, this last acknowledgement seen state indicates
+%% the last acknowledgement retired, rather than sent.
+%%
+%%
+%% Proof sketch
+%% ------------
+%%
+%% We need to prove that with the provided operational semantics, we
+%% can never reach a state that is not well formed from a well-formed
+%% starting state.
+%%
+%% Operational semantics (small step): straight-forward message
+%% sending, process monitoring, state updates.
+%%
+%% Well formed state: dead members inherited by exactly one non-dead
+%% member; for every entry in anyone's pending-acks, either (the
+%% publication of the message is in-flight downstream from the member
+%% and upstream from the publisher) or (the acknowledgement of the
+%% message is in-flight downstream from the publisher and upstream
+%% from the member).
+%%
+%% Proof by induction on the applicable operational semantics.
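%% The per-publisher state described above maps directly onto the
%% #member{} record defined below. For example (an invented worked
%% case, messages elided), a member that has seen publications 1..5
%% from some publisher, with acknowledgements applied up to 3, holds:
%%
%%   #member { pending_ack = queue of {4, Msg4}, {5, Msg5},
%%             last_pub    = 5,
%%             last_ack    = 3 }
%%
%% and a dead publisher can be erased once last_pub =:= last_ack
%% (see can_erase_view_member/4 further down).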
+%%
+%%
+%% Related work
+%% ------------
+%%
+%% The ring configuration and double traversal of messages around the
+%% ring is similar (though developed independently) to the LCR
+%% protocol by [Levy 2008]. However, LCR differs in several
+%% ways. Firstly, by using vector clocks, it enforces a total order of
+%% message delivery, which is unnecessary for our purposes. More
+%% significantly, it is built on top of a "group communication system"
+%% which performs the group management functions, taking
+%% responsibility away from the protocol as to how to cope with safely
+%% adding and removing members. When membership changes do occur, the
+%% protocol stipulates that every member must perform communication
+%% with every other member of the group, to ensure all outstanding
+%% deliveries complete, before the entire group transitions to the new
+%% view. This, in total, requires two sets of all-to-all synchronous
+%% communications.
+%%
+%% This is not only rather inefficient, but also does not explain what
+%% happens upon the failure of a member during this process. It does
+%% though entirely avoid the need for inheritance of responsibility of
+%% dead members that our protocol incorporates.
+%%
+%% In [Marandi et al 2010], a Paxos-based protocol is described. This
+%% work explicitly focuses on the efficiency of communication. LCR
+%% (and our protocol too) is more efficient, but at the cost of
+%% higher latency. The Ring-Paxos protocol is itself built on top of
+%% IP-multicast, which rules it out for many applications where
+%% point-to-point communication is all that can be assumed. They also
+%% have an excellent related work section which I really ought to
+%% read...
+%%
+%%
+%% [Levy 2008] The Complexity of Reliable Distributed Storage, 2008.
+%% [Marandi et al 2010] Ring Paxos: A High-Throughput Atomic Broadcast
+%% Protocol
+
+
+-behaviour(gen_server2).
+
+-export([create_tables/0, start_link/4, leave/1, broadcast/2, broadcast/3,
+         confirmed_broadcast/2, info/1, validate_members/2, forget_group/1]).
+
+-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2,
+         code_change/3, prioritise_info/3]).
+
+%% For INSTR_MOD callbacks
+-export([call/3, cast/2, monitor/1, demonitor/1]).
+
+-export([table_definitions/0]).
+
+-define(GROUP_TABLE, gm_group).
+-define(MAX_BUFFER_SIZE, 100000000). %% 100MB
+-define(BROADCAST_TIMER, 25).
+-define(FORCE_GC_TIMER, 250).
+-define(VERSION_START, 0).
+-define(SETS, ordsets).
+%% Inter-member communication is dispatched through this macro so that
+%% tests can substitute an instrumented module; by default it resolves
+%% to the uninstrumented versions exported above and defined at the
+%% bottom of this file. (The define is required for the ?INSTR_MOD
+%% call sites below to compile.)
+-define(INSTR_MOD, ?MODULE).
+
+-record(state,
+        { self,
+          left,
+          right,
+          group_name,
+          module,
+          view,
+          pub_count,
+          members_state,
+          callback_args,
+          confirms,
+          broadcast_buffer,
+          broadcast_buffer_sz,
+          broadcast_timer,
+          force_gc_timer,
+          txn_executor,
+          shutting_down
+        }).
+
+-record(gm_group, { name, version, members }).
+
+-record(view_member, { id, aliases, left, right }).
+
+-record(member, { pending_ack, last_pub, last_ack }).
+
+-define(TABLE, {?GROUP_TABLE, [{record_name, gm_group},
+                               {attributes, record_info(fields, gm_group)}]}).
+-define(TABLE_MATCH, {match, #gm_group { _ = '_' }}).
+
+-define(TAG, '$gm').
+
+-export_type([group_name/0]).
+
+-type group_name() :: any().
+-type txn_fun() :: fun((fun(() -> any())) -> any()).
+
+%% The joined, members_changed and handle_msg callbacks can all return
+%% any of the following terms:
+%%
+%% 'ok' - the callback function returns normally
+%%
+%% {'stop', Reason} - the callback indicates the member should stop
+%% with reason Reason and should leave the group.
+%%
+%% {'become', Module, Args} - the callback indicates that the callback
+%% module should be changed to Module and that the callback functions
+%% should now be passed the arguments Args. This allows the callback
+%% module to be dynamically changed.
+
+%% Called when we've successfully joined the group. Supplied with Args
+%% provided in start_link, plus current group members.
+-callback joined(Args :: term(), Members :: [pid()]) ->
+    ok | {stop, Reason :: term()} | {become, Module :: atom(), Args :: any()}.
+
+%% Supplied with Args provided in start_link, the list of new members
+%% and the list of members previously known to us that have since
+%% died. Note that if a member joins and dies very quickly, it's
+%% possible that we will never see that member appear in either births
+%% or deaths. However we are guaranteed that (1) we will see a member
+%% joining either in the births here, or in the members passed to
+%% joined/2 before receiving any messages from it; and (2) we will not
+%% see members die that we have not seen born (or supplied in the
+%% members to joined/2).
+-callback members_changed(Args :: term(),
+                          Births :: [pid()], Deaths :: [pid()]) ->
+    ok | {stop, Reason :: term()} | {become, Module :: atom(), Args :: any()}.
+
+%% Supplied with Args provided in start_link, the sender, and the
+%% message. This does get called for messages injected by this member,
+%% however, in such cases, there is no special significance of this
+%% invocation: it does not indicate that the message has made it to
+%% any other members, let alone all other members.
+-callback handle_msg(Args :: term(), From :: pid(), Message :: term()) ->
+    ok | {stop, Reason :: term()} | {become, Module :: atom(), Args :: any()}.
+
+%% Called on gm member termination as per rules in gen_server, with
+%% the Args provided in start_link plus the termination Reason.
+-callback handle_terminate(Args :: term(), Reason :: term()) ->
+    ok | term().
+
+-spec create_tables() -> 'ok' | {'aborted', any()}.
+
+create_tables() ->
+    create_tables([?TABLE]).
+
+create_tables([]) ->
+    ok;
+create_tables([{Table, Attributes} | Tables]) ->
+    case mnesia:create_table(Table, Attributes) of
+        {atomic, ok}                       -> create_tables(Tables);
+        {aborted, {already_exists, Table}} -> create_tables(Tables);
+        Err                                -> Err
+    end.
+
+table_definitions() ->
+    {Name, Attributes} = ?TABLE,
+    [{Name, [?TABLE_MATCH | Attributes]}].
+
+-spec start_link(group_name(), atom(), any(), txn_fun()) ->
+          rabbit_types:ok_pid_or_error().
+
+start_link(GroupName, Module, Args, TxnFun) ->
+    gen_server2:start_link(?MODULE, [GroupName, Module, Args, TxnFun],
+                           [{spawn_opt, [{fullsweep_after, 0}]}]).
+
+-spec leave(pid()) -> 'ok'.
+
+leave(Server) ->
+    gen_server2:cast(Server, leave).
+
+-spec broadcast(pid(), any()) -> 'ok'.
+
+broadcast(Server, Msg) -> broadcast(Server, Msg, 0).
+
+broadcast(Server, Msg, SizeHint) ->
+    gen_server2:cast(Server, {broadcast, Msg, SizeHint}).
+
+-spec confirmed_broadcast(pid(), any()) -> 'ok'.
+
+confirmed_broadcast(Server, Msg) ->
+    gen_server2:call(Server, {confirmed_broadcast, Msg}, infinity).
+
+-spec info(pid()) -> rabbit_types:infos().
+
+info(Server) ->
+    gen_server2:call(Server, info, infinity).
+
+-spec validate_members(pid(), [pid()]) -> 'ok'.
+
+validate_members(Server, Members) ->
+    gen_server2:cast(Server, {validate_members, Members}).
+
+-spec forget_group(group_name()) -> 'ok'.
+
+forget_group(GroupName) ->
+    {atomic, ok} = mnesia:sync_transaction(
+                     fun () ->
+                             mnesia:delete({?GROUP_TABLE, GroupName})
+                     end),
+    ok.
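%% For illustration: bringing up a member of the gm_demo callback
%% module sketched earlier and exercising the public API. This assumes
%% mnesia is not yet running, and that a transaction executor which
%% simply runs the supplied fun in a synchronous mnesia transaction is
%% acceptable (RabbitMQ passes its own executor here).
demo() ->
    ok = mnesia:start(),
    ok = gm:create_tables(),
    TxnFun = fun (F) ->
                     {atomic, Result} = mnesia:sync_transaction(F),
                     Result
             end,
    {ok, Pid} = gm:start_link(my_group, gm_demo, [], TxnFun),
    ok = gm:broadcast(Pid, hello),               %% fire-and-forget
    ok = gm:confirmed_broadcast(Pid, important), %% blocks until all members have it
    gm:leave(Pid).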
+ +init([GroupName, Module, Args, TxnFun]) -> + put(process_name, {?MODULE, GroupName}), + Self = make_member(GroupName), + gen_server2:cast(self(), join), + {ok, #state { self = Self, + left = {Self, undefined}, + right = {Self, undefined}, + group_name = GroupName, + module = Module, + view = undefined, + pub_count = -1, + members_state = undefined, + callback_args = Args, + confirms = queue:new(), + broadcast_buffer = [], + broadcast_buffer_sz = 0, + broadcast_timer = undefined, + force_gc_timer = undefined, + txn_executor = TxnFun, + shutting_down = false }}. + + +handle_call({confirmed_broadcast, _Msg}, _From, + State = #state { shutting_down = {true, _} }) -> + reply(shutting_down, State); + +handle_call({confirmed_broadcast, _Msg}, _From, + State = #state { members_state = undefined }) -> + reply(not_joined, State); + +handle_call({confirmed_broadcast, Msg}, _From, + State = #state { self = Self, + right = {Self, undefined}, + module = Module, + callback_args = Args }) -> + handle_callback_result({Module:handle_msg(Args, get_pid(Self), Msg), + ok, State}); + +handle_call({confirmed_broadcast, Msg}, From, State) -> + {Result, State1 = #state { pub_count = PubCount, confirms = Confirms }} = + internal_broadcast(Msg, 0, State), + Confirms1 = queue:in({PubCount, From}, Confirms), + handle_callback_result({Result, flush_broadcast_buffer( + State1 #state { confirms = Confirms1 })}); + +handle_call(info, _From, + State = #state { members_state = undefined }) -> + reply(not_joined, State); + +handle_call(info, _From, State = #state { group_name = GroupName, + module = Module, + view = View }) -> + reply([{group_name, GroupName}, + {module, Module}, + {group_members, get_pids(alive_view_members(View))}], State); + +handle_call({add_on_right, _NewMember}, _From, + State = #state { members_state = undefined }) -> + reply(not_ready, State); + +handle_call({add_on_right, NewMember}, _From, + State = #state { self = Self, + group_name = GroupName, + members_state = MembersState, + txn_executor = TxnFun }) -> + try + Group = record_new_member_in_group( + NewMember, Self, GroupName, TxnFun), + View1 = group_to_view(check_membership(Self, Group)), + MembersState1 = remove_erased_members(MembersState, View1), + ok = send_right(NewMember, View1, + {catchup, Self, prepare_members_state(MembersState1)}), + {Result, State1} = change_view(View1, State #state { + members_state = MembersState1 }), + handle_callback_result({Result, {ok, Group}, State1}) + catch + lost_membership -> + {stop, shutdown, State} + end. + +%% add_on_right causes a catchup to be sent immediately from the left, +%% so we can never see this from the left neighbour. However, it's +%% possible for the right neighbour to send us a check_neighbours +%% immediately before that. We can't possibly handle it, but if we're +%% in this state we know a catchup is coming imminently anyway. So +%% just ignore it. 
+handle_cast({?TAG, _ReqVer, check_neighbours}, + State = #state { members_state = undefined }) -> + noreply(State); + +handle_cast({?TAG, ReqVer, Msg}, + State = #state { view = View, + self = Self, + members_state = MembersState, + group_name = GroupName }) -> + try + {Result, State1} = + case needs_view_update(ReqVer, View) of + true -> + View1 = group_to_view( + check_membership(Self, + dirty_read_group(GroupName))), + MemberState1 = remove_erased_members(MembersState, View1), + change_view(View1, State #state { + members_state = MemberState1 }); + false -> {ok, State} + end, + handle_callback_result( + if_callback_success( + Result, fun handle_msg_true/3, fun handle_msg_false/3, Msg, State1)) + catch + lost_membership -> + {stop, shutdown, State} + end; + +handle_cast({broadcast, _Msg, _SizeHint}, + State = #state { shutting_down = {true, _} }) -> + noreply(State); + +handle_cast({broadcast, _Msg, _SizeHint}, + State = #state { members_state = undefined }) -> + noreply(State); + +handle_cast({broadcast, Msg, _SizeHint}, + State = #state { self = Self, + right = {Self, undefined}, + module = Module, + callback_args = Args }) -> + handle_callback_result({Module:handle_msg(Args, get_pid(Self), Msg), + State}); + +handle_cast({broadcast, Msg, SizeHint}, State) -> + {Result, State1} = internal_broadcast(Msg, SizeHint, State), + handle_callback_result({Result, maybe_flush_broadcast_buffer(State1)}); + +handle_cast(join, State = #state { self = Self, + group_name = GroupName, + members_state = undefined, + module = Module, + callback_args = Args, + txn_executor = TxnFun }) -> + try + View = join_group(Self, GroupName, TxnFun), + MembersState = + case alive_view_members(View) of + [Self] -> blank_member_state(); + _ -> undefined + end, + State1 = check_neighbours(State #state { view = View, + members_state = MembersState }), + handle_callback_result( + {Module:joined(Args, get_pids(all_known_members(View))), State1}) + catch + lost_membership -> + {stop, shutdown, State} + end; + +handle_cast({validate_members, OldMembers}, + State = #state { view = View, + module = Module, + callback_args = Args }) -> + NewMembers = get_pids(all_known_members(View)), + Births = NewMembers -- OldMembers, + Deaths = OldMembers -- NewMembers, + case {Births, Deaths} of + {[], []} -> noreply(State); + _ -> Result = Module:members_changed(Args, Births, Deaths), + handle_callback_result({Result, State}) + end; + +handle_cast(leave, State) -> + {stop, normal, State}. + + +handle_info(force_gc, State) -> + garbage_collect(), + noreply(State #state { force_gc_timer = undefined }); + +handle_info(flush, State) -> + noreply( + flush_broadcast_buffer(State #state { broadcast_timer = undefined })); + +handle_info(timeout, State) -> + noreply(flush_broadcast_buffer(State)); + +handle_info({'DOWN', _MRef, process, _Pid, _Reason}, + State = #state { shutting_down = + {true, {shutdown, ring_shutdown}} }) -> + noreply(State); +handle_info({'DOWN', MRef, process, _Pid, Reason}, + State = #state { self = Self, + left = Left, + right = Right, + group_name = GroupName, + confirms = Confirms, + txn_executor = TxnFun }) -> + try + check_membership(GroupName), + Member = case {Left, Right} of + {{Member1, MRef}, _} -> Member1; + {_, {Member1, MRef}} -> Member1; + _ -> undefined + end, + case {Member, Reason} of + {undefined, _} -> + noreply(State); + {_, {shutdown, ring_shutdown}} -> + noreply(State); + _ -> + %% In the event of a partial partition we could see another member + %% go down and then remove them from Mnesia. 
While they can + %% recover from this they'd have to restart the queue - not + %% ideal. So let's sleep here briefly just in case this was caused + %% by a partial partition; in which case by the time we record the + %% member death in Mnesia we will probably be in a full + %% partition and will not be assassinating another member. + timer:sleep(100), + View1 = group_to_view(record_dead_member_in_group(Self, + Member, GroupName, TxnFun, true)), + handle_callback_result( + case alive_view_members(View1) of + [Self] -> maybe_erase_aliases( + State #state { + members_state = blank_member_state(), + confirms = purge_confirms(Confirms) }, + View1); + _ -> change_view(View1, State) + end) + end + catch + lost_membership -> + {stop, shutdown, State} + end; +handle_info(_, State) -> + %% Discard any unexpected messages, such as late replies from neighbour_call/2 + %% TODO: For #gm_group{} related info messages, it could be worthwhile to + %% change_view/2, as this might reflect an alteration in the gm group, meaning + %% we now need to update our state. see rabbitmq-server#914. + noreply(State). + +terminate(Reason, #state { module = Module, callback_args = Args }) -> + Module:handle_terminate(Args, Reason). + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + +prioritise_info(flush, _Len, _State) -> + 1; +%% DOWN messages should not overtake initial catchups; if they do we +%% will receive a DOWN we do not know what to do with. +prioritise_info({'DOWN', _MRef, process, _Pid, _Reason}, _Len, + #state { members_state = undefined }) -> + 0; +%% We should not prioritise DOWN messages from our left since +%% otherwise the DOWN can overtake any last activity from the left, +%% causing that activity to be lost. +prioritise_info({'DOWN', _MRef, process, LeftPid, _Reason}, _Len, + #state { left = {{_LeftVer, LeftPid}, _MRef2} }) -> + 0; +%% But prioritise all other DOWNs - we want to make sure we are not +%% sending activity into the void for too long because our right is +%% down but we don't know it. +prioritise_info({'DOWN', _MRef, process, _Pid, _Reason}, _Len, _State) -> + 1; +prioritise_info(_, _Len, _State) -> + 0. 
+ + +handle_msg(check_neighbours, State) -> + %% no-op - it's already been done by the calling handle_cast + {ok, State}; + +handle_msg({catchup, Left, MembersStateLeft}, + State = #state { self = Self, + left = {Left, _MRefL}, + right = {Right, _MRefR}, + view = View, + members_state = undefined }) -> + ok = send_right(Right, View, {catchup, Self, MembersStateLeft}), + MembersStateLeft1 = build_members_state(MembersStateLeft), + {ok, State #state { members_state = MembersStateLeft1 }}; + +handle_msg({catchup, Left, MembersStateLeft}, + State = #state { self = Self, + left = {Left, _MRefL}, + view = View, + members_state = MembersState }) + when MembersState =/= undefined -> + MembersStateLeft1 = build_members_state(MembersStateLeft), + AllMembers = lists:usort(maps:keys(MembersState) ++ + maps:keys(MembersStateLeft1)), + {MembersState1, Activity} = + lists:foldl( + fun (Id, MembersStateActivity) -> + #member { pending_ack = PALeft, last_ack = LA } = + find_member_or_blank(Id, MembersStateLeft1), + with_member_acc( + fun (#member { pending_ack = PA } = Member, Activity1) -> + case is_member_alias(Id, Self, View) of + true -> + {_AcksInFlight, Pubs, _PA1} = + find_prefix_common_suffix(PALeft, PA), + {Member #member { last_ack = LA }, + activity_cons(Id, pubs_from_queue(Pubs), + [], Activity1)}; + false -> + {Acks, _Common, Pubs} = + find_prefix_common_suffix(PA, PALeft), + {Member, + activity_cons(Id, pubs_from_queue(Pubs), + acks_from_queue(Acks), + Activity1)} + end + end, Id, MembersStateActivity) + end, {MembersState, activity_nil()}, AllMembers), + handle_msg({activity, Left, activity_finalise(Activity)}, + State #state { members_state = MembersState1 }); + +handle_msg({catchup, _NotLeft, _MembersState}, State) -> + {ok, State}; + +handle_msg({activity, Left, Activity}, + State = #state { self = Self, + group_name = GroupName, + left = {Left, _MRefL}, + view = View, + members_state = MembersState, + confirms = Confirms }) + when MembersState =/= undefined -> + try + %% If we have to stop, do it asap so we avoid any ack confirmation + %% Membership must be checked again by erase_members_in_group, as the + %% node can be marked as dead on the meanwhile + check_membership(GroupName), + {MembersState1, {Confirms1, Activity1}} = + calculate_activity(MembersState, Confirms, Activity, Self, View), + State1 = State #state { members_state = MembersState1, + confirms = Confirms1 }, + Activity3 = activity_finalise(Activity1), + ok = maybe_send_activity(Activity3, State1), + {Result, State2} = maybe_erase_aliases(State1, View), + if_callback_success( + Result, fun activity_true/3, fun activity_false/3, Activity3, State2) + catch + lost_membership -> + {{stop, shutdown}, State} + end; + +handle_msg({activity, _NotLeft, _Activity}, State) -> + {ok, State}. + + +noreply(State) -> + {noreply, ensure_timers(State), flush_timeout(State)}. + +reply(Reply, State) -> + {reply, Reply, ensure_timers(State), flush_timeout(State)}. + +ensure_timers(State) -> + ensure_force_gc_timer(ensure_broadcast_timer(State)). + +flush_timeout(#state{broadcast_buffer = []}) -> infinity; +flush_timeout(_) -> 0. + +ensure_force_gc_timer(State = #state { force_gc_timer = TRef }) + when is_reference(TRef) -> + State; +ensure_force_gc_timer(State = #state { force_gc_timer = undefined }) -> + TRef = erlang:send_after(?FORCE_GC_TIMER, self(), force_gc), + State #state { force_gc_timer = TRef }. 
+ +ensure_broadcast_timer(State = #state { broadcast_buffer = [], + broadcast_timer = undefined }) -> + State; +ensure_broadcast_timer(State = #state { broadcast_buffer = [], + broadcast_timer = TRef }) -> + _ = erlang:cancel_timer(TRef), + State #state { broadcast_timer = undefined }; +ensure_broadcast_timer(State = #state { broadcast_timer = undefined }) -> + TRef = erlang:send_after(?BROADCAST_TIMER, self(), flush), + State #state { broadcast_timer = TRef }; +ensure_broadcast_timer(State) -> + State. + +internal_broadcast(Msg, SizeHint, + State = #state { self = Self, + pub_count = PubCount, + module = Module, + callback_args = Args, + broadcast_buffer = Buffer, + broadcast_buffer_sz = BufferSize }) -> + PubCount1 = PubCount + 1, + {Module:handle_msg(Args, get_pid(Self), Msg), + State #state { pub_count = PubCount1, + broadcast_buffer = [{PubCount1, Msg} | Buffer], + broadcast_buffer_sz = BufferSize + SizeHint}}. + +%% The Erlang distribution mechanism has an interesting quirk - it +%% will kill the VM cold with "Absurdly large distribution output data +%% buffer" if you attempt to send a message which serialises out to +%% more than 2^31 bytes in size. It's therefore a very good idea to +%% make sure that we don't exceed that size! +%% +%% Now, we could figure out the size of messages as they come in using +%% size(term_to_binary(Msg)) or similar. The trouble is, that requires +%% us to serialise the message only to throw the serialised form +%% away. Hard to believe that's a sensible thing to do. So instead we +%% accept a size hint from the application, via broadcast/3. This size +%% hint can be the size of anything in the message which we expect +%% could be large, and we just ignore the size of any small bits of +%% the message term. Therefore MAX_BUFFER_SIZE is set somewhat +%% conservatively at 100MB - but the buffer is only to allow us to +%% buffer tiny messages anyway, so 100MB is plenty. + +maybe_flush_broadcast_buffer(State = #state{broadcast_buffer_sz = Size}) -> + case Size > ?MAX_BUFFER_SIZE of + true -> flush_broadcast_buffer(State); + false -> State + end. + +flush_broadcast_buffer(State = #state { broadcast_buffer = [] }) -> + State; +flush_broadcast_buffer(State = #state { self = Self, + members_state = MembersState, + broadcast_buffer = Buffer, + pub_count = PubCount }) -> + [{PubCount, _Msg}|_] = Buffer, %% ASSERTION match on PubCount + Pubs = lists:reverse(Buffer), + Activity = activity_cons(Self, Pubs, [], activity_nil()), + ok = maybe_send_activity(activity_finalise(Activity), State), + MembersState1 = with_member( + fun (Member = #member { pending_ack = PA }) -> + PA1 = queue:join(PA, queue:from_list(Pubs)), + Member #member { pending_ack = PA1, + last_pub = PubCount } + end, Self, MembersState), + State #state { members_state = MembersState1, + broadcast_buffer = [], + broadcast_buffer_sz = 0 }. + +%% --------------------------------------------------------------------------- +%% View construction and inspection +%% --------------------------------------------------------------------------- + +needs_view_update(ReqVer, {Ver, _View}) -> Ver < ReqVer. + +view_version({Ver, _View}) -> Ver. + +is_member_alive({dead, _Member}) -> false; +is_member_alive(_) -> true. + +is_member_alias(Self, Self, _View) -> + true; +is_member_alias(Member, Self, View) -> + ?SETS:is_element(Member, + ((fetch_view_member(Self, View)) #view_member.aliases)). + +dead_member_id({dead, Member}) -> Member. 
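%% A worked example (invented for this illustration, pids abbreviated)
%% of the view helpers defined just below: given
%%
%%   Members = [{0, A}, {dead, {0, B}}, {0, C}]
%%
%% group_to_view/1 links only the alive members into the ring
%%
%%   {0, A} <-> {0, C}
%%
%% and add_aliases/2 records {0, B} in the aliases set of {0, C} - the
%% nearest alive member downstream of B - so that C inherits B's
%% in-flight messages, exactly as described in the overview above.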
+ +store_view_member(VMember = #view_member { id = Id }, {Ver, View}) -> + {Ver, maps:put(Id, VMember, View)}. + +with_view_member(Fun, View, Id) -> + store_view_member(Fun(fetch_view_member(Id, View)), View). + +fetch_view_member(Id, {_Ver, View}) -> maps:get(Id, View). + +find_view_member(Id, {_Ver, View}) -> maps:find(Id, View). + +blank_view(Ver) -> {Ver, maps:new()}. + +alive_view_members({_Ver, View}) -> maps:keys(View). + +all_known_members({_Ver, View}) -> + maps:fold( + fun (Member, #view_member { aliases = Aliases }, Acc) -> + ?SETS:to_list(Aliases) ++ [Member | Acc] + end, [], View). + +group_to_view(#gm_group { members = Members, version = Ver }) -> + Alive = lists:filter(fun is_member_alive/1, Members), + [_|_] = Alive, %% ASSERTION - can't have all dead members + add_aliases(link_view(Alive ++ Alive ++ Alive, blank_view(Ver)), Members). + +link_view([Left, Middle, Right | Rest], View) -> + case find_view_member(Middle, View) of + error -> + link_view( + [Middle, Right | Rest], + store_view_member(#view_member { id = Middle, + aliases = ?SETS:new(), + left = Left, + right = Right }, View)); + {ok, _} -> + View + end; +link_view(_, View) -> + View. + +add_aliases(View, Members) -> + Members1 = ensure_alive_suffix(Members), + {EmptyDeadSet, View1} = + lists:foldl( + fun (Member, {DeadAcc, ViewAcc}) -> + case is_member_alive(Member) of + true -> + {?SETS:new(), + with_view_member( + fun (VMember = + #view_member { aliases = Aliases }) -> + VMember #view_member { + aliases = ?SETS:union(Aliases, DeadAcc) } + end, ViewAcc, Member)}; + false -> + {?SETS:add_element(dead_member_id(Member), DeadAcc), + ViewAcc} + end + end, {?SETS:new(), View}, Members1), + 0 = ?SETS:size(EmptyDeadSet), %% ASSERTION + View1. + +ensure_alive_suffix(Members) -> + queue:to_list(ensure_alive_suffix1(queue:from_list(Members))). + +ensure_alive_suffix1(MembersQ) -> + {{value, Member}, MembersQ1} = queue:out_r(MembersQ), + case is_member_alive(Member) of + true -> MembersQ; + false -> ensure_alive_suffix1(queue:in_r(Member, MembersQ1)) + end. + + +%% --------------------------------------------------------------------------- +%% View modification +%% --------------------------------------------------------------------------- + +join_group(Self, GroupName, TxnFun) -> + join_group(Self, GroupName, dirty_read_group(GroupName), TxnFun). + +join_group(Self, GroupName, {error, not_found}, TxnFun) -> + join_group(Self, GroupName, + prune_or_create_group(Self, GroupName, TxnFun), TxnFun); +join_group(Self, _GroupName, #gm_group { members = [Self] } = Group, _TxnFun) -> + group_to_view(Group); +join_group(Self, GroupName, #gm_group { members = Members } = Group, TxnFun) -> + case lists:member(Self, Members) of + true -> + group_to_view(Group); + false -> + case lists:filter(fun is_member_alive/1, Members) of + [] -> + join_group(Self, GroupName, + prune_or_create_group(Self, GroupName, TxnFun), + TxnFun); + Alive -> + Left = lists:nth(rand:uniform(length(Alive)), Alive), + Handler = + fun () -> + join_group( + Self, GroupName, + record_dead_member_in_group(Self, + Left, GroupName, TxnFun, false), + TxnFun) + end, + try + case neighbour_call(Left, {add_on_right, Self}) of + {ok, Group1} -> group_to_view(Group1); + not_ready -> join_group(Self, GroupName, TxnFun) + end + catch + exit:{R, _} + when R =:= noproc; R =:= normal; R =:= shutdown -> + Handler(); + exit:{{R, _}, _} + when R =:= nodedown; R =:= shutdown -> + Handler() + end + end + end. 
+
+dirty_read_group(GroupName) ->
+    case mnesia:dirty_read(?GROUP_TABLE, GroupName) of
+        []      -> {error, not_found};
+        [Group] -> Group
+    end.
+
+read_group(GroupName) ->
+    case mnesia:read({?GROUP_TABLE, GroupName}) of
+        []      -> {error, not_found};
+        [Group] -> Group
+    end.
+
+write_group(Group) -> mnesia:write(?GROUP_TABLE, Group, write), Group.
+
+prune_or_create_group(Self, GroupName, TxnFun) ->
+    TxnFun(
+      fun () ->
+              GroupNew = #gm_group { name    = GroupName,
+                                     members = [Self],
+                                     version = get_version(Self) },
+              case read_group(GroupName) of
+                  {error, not_found} ->
+                      write_group(GroupNew);
+                  Group = #gm_group { members = Members } ->
+                      case lists:any(fun is_member_alive/1, Members) of
+                          true  -> Group;
+                          false -> write_group(GroupNew)
+                      end
+              end
+      end).
+
+record_dead_member_in_group(Self, Member, GroupName, TxnFun, Verify) ->
+    Fun =
+        fun () ->
+                try
+                    Group = #gm_group { members = Members, version = Ver } =
+                        case Verify of
+                            true ->
+                                check_membership(Self, read_group(GroupName));
+                            false ->
+                                check_group(read_group(GroupName))
+                        end,
+                    case lists:splitwith(
+                           fun (Member1) -> Member1 =/= Member end, Members) of
+                        {_Members1, []} -> %% not found - already recorded dead
+                            Group;
+                        {Members1, [Member | Members2]} ->
+                            Members3 = Members1 ++ [{dead, Member} | Members2],
+                            write_group(Group #gm_group { members = Members3,
+                                                          version = Ver + 1 })
+                    end
+                catch
+                    lost_membership ->
+                        %% The transaction must not be abruptly crashed, but
+                        %% leave the gen_server to stop normally
+                        {error, lost_membership}
+                end
+        end,
+    handle_lost_membership_in_txn(TxnFun, Fun).
+
+handle_lost_membership_in_txn(TxnFun, Fun) ->
+    case TxnFun(Fun) of
+        {error, lost_membership = T} ->
+            throw(T);
+        Any ->
+            Any
+    end.
+
+record_new_member_in_group(NewMember, Left, GroupName, TxnFun) ->
+    Fun =
+        fun () ->
+                try
+                    Group = #gm_group { members = Members, version = Ver } =
+                        check_membership(Left, read_group(GroupName)),
+                    case lists:member(NewMember, Members) of
+                        true ->
+                            %% This avoids duplicates during partial partitions,
+                            %% as inconsistent views might happen during them
+                            rabbit_log:warning("(~p) GM avoiding duplicate of ~p",
+                                               [self(), NewMember]),
+                            Group;
+                        false ->
+                            {Prefix, [Left | Suffix]} =
+                                lists:splitwith(fun (M) -> M =/= Left end,
+                                                Members),
+                            write_group(Group #gm_group {
+                                          members = Prefix ++
+                                              [Left, NewMember | Suffix],
+                                          version = Ver + 1 })
+                    end
+                catch
+                    lost_membership ->
+                        %% The transaction must not be abruptly crashed, but
+                        %% leave the gen_server to stop normally
+                        {error, lost_membership}
+                end
+        end,
+    handle_lost_membership_in_txn(TxnFun, Fun).
+
+erase_members_in_group(Self, Members, GroupName, TxnFun) ->
+    DeadMembers = [{dead, Id} || Id <- Members],
+    Fun =
+        fun () ->
+                try
+                    Group = #gm_group { members = [_|_] = Members1,
+                                        version = Ver } =
+                        check_membership(Self, read_group(GroupName)),
+                    case Members1 -- DeadMembers of
+                        Members1 -> Group;
+                        Members2 -> write_group(
+                                      Group #gm_group { members = Members2,
+                                                        version = Ver + 1 })
+                    end
+                catch
+                    lost_membership ->
+                        %% The transaction must not be abruptly crashed, but
+                        %% leave the gen_server to stop normally
+                        {error, lost_membership}
+                end
+        end,
+    handle_lost_membership_in_txn(TxnFun, Fun).
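%% A worked example (invented, pids abbreviated) of how the
%% transactions above evolve the canonical membership list: starting
%% from [A], B joins on the right of A, then B dies, then C joins on
%% the right of A:
%%
%%   [A]                  version 0
%%   [A, B]               version 1  (record_new_member_in_group)
%%   [A, {dead, B}]       version 2  (record_dead_member_in_group)
%%   [A, C, {dead, B}]    version 3  (record_new_member_in_group)
%%
%% erase_members_in_group/4 later removes {dead, B} once no messages
%% from B remain in flight, bumping the version once more.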
+ +maybe_erase_aliases(State = #state { self = Self, + group_name = GroupName, + members_state = MembersState, + txn_executor = TxnFun }, View) -> + #view_member { aliases = Aliases } = fetch_view_member(Self, View), + {Erasable, MembersState1} + = ?SETS:fold( + fun (Id, {ErasableAcc, MembersStateAcc} = Acc) -> + #member { last_pub = LP, last_ack = LA } = + find_member_or_blank(Id, MembersState), + case can_erase_view_member(Self, Id, LA, LP) of + true -> {[Id | ErasableAcc], + erase_member(Id, MembersStateAcc)}; + false -> Acc + end + end, {[], MembersState}, Aliases), + View1 = case Erasable of + [] -> View; + _ -> group_to_view( + erase_members_in_group(Self, Erasable, GroupName, TxnFun)) + end, + change_view(View1, State #state { members_state = MembersState1 }). + +can_erase_view_member(Self, Self, _LA, _LP) -> false; +can_erase_view_member(_Self, _Id, N, N) -> true; +can_erase_view_member(_Self, _Id, _LA, _LP) -> false. + +neighbour_cast(N, Msg) -> ?INSTR_MOD:cast(get_pid(N), Msg). +neighbour_call(N, Msg) -> ?INSTR_MOD:call(get_pid(N), Msg, infinity). + +%% --------------------------------------------------------------------------- +%% View monitoring and maintenance +%% --------------------------------------------------------------------------- + +ensure_neighbour(_Ver, Self, {Self, undefined}, Self) -> + {Self, undefined}; +ensure_neighbour(Ver, Self, {Self, undefined}, RealNeighbour) -> + ok = neighbour_cast(RealNeighbour, {?TAG, Ver, check_neighbours}), + {RealNeighbour, maybe_monitor(RealNeighbour, Self)}; +ensure_neighbour(_Ver, _Self, {RealNeighbour, MRef}, RealNeighbour) -> + {RealNeighbour, MRef}; +ensure_neighbour(Ver, Self, {RealNeighbour, MRef}, Neighbour) -> + true = ?INSTR_MOD:demonitor(MRef), + Msg = {?TAG, Ver, check_neighbours}, + ok = neighbour_cast(RealNeighbour, Msg), + ok = case Neighbour of + Self -> ok; + _ -> neighbour_cast(Neighbour, Msg) + end, + {Neighbour, maybe_monitor(Neighbour, Self)}. + +maybe_monitor( Self, Self) -> undefined; +maybe_monitor(Other, _Self) -> ?INSTR_MOD:monitor(get_pid(Other)). + +check_neighbours(State = #state { self = Self, + left = Left, + right = Right, + view = View, + broadcast_buffer = Buffer }) -> + #view_member { left = VLeft, right = VRight } + = fetch_view_member(Self, View), + Ver = view_version(View), + Left1 = ensure_neighbour(Ver, Self, Left, VLeft), + Right1 = ensure_neighbour(Ver, Self, Right, VRight), + Buffer1 = case Right1 of + {Self, undefined} -> []; + _ -> Buffer + end, + State1 = State #state { left = Left1, right = Right1, + broadcast_buffer = Buffer1 }, + ok = maybe_send_catchup(Right, State1), + State1. + +maybe_send_catchup(Right, #state { right = Right }) -> + ok; +maybe_send_catchup(_Right, #state { self = Self, + right = {Self, undefined} }) -> + ok; +maybe_send_catchup(_Right, #state { members_state = undefined }) -> + ok; +maybe_send_catchup(_Right, #state { self = Self, + right = {Right, _MRef}, + view = View, + members_state = MembersState }) -> + send_right(Right, View, + {catchup, Self, prepare_members_state(MembersState)}). + + +%% --------------------------------------------------------------------------- +%% Catch_up delta detection +%% --------------------------------------------------------------------------- + +find_prefix_common_suffix(A, B) -> + {Prefix, A1} = find_prefix(A, B, queue:new()), + {Common, Suffix} = find_common(A1, B, queue:new()), + {Prefix, Common, Suffix}. + +%% Returns the elements of A that occur before the first element of B, +%% plus the remainder of A. 
+find_prefix(A, B, Prefix) ->
+    case {queue:out(A), queue:out(B)} of
+        {{{value, Val}, _A1}, {{value, Val}, _B1}} ->
+            {Prefix, A};
+        {{empty, A1}, {{value, _A}, _B1}} ->
+            {Prefix, A1};
+        {{{value, {NumA, _MsgA} = Val}, A1},
+         {{value, {NumB, _MsgB}}, _B1}} when NumA < NumB ->
+            find_prefix(A1, B, queue:in(Val, Prefix));
+        {_, {empty, _B1}} ->
+            {A, Prefix} %% Prefix will be empty here
+    end.
+
+%% A should be a prefix of B. Returns the commonality plus the
+%% remainder of B.
+find_common(A, B, Common) ->
+    case {queue:out(A), queue:out(B)} of
+        {{{value, Val}, A1}, {{value, Val}, B1}} ->
+            find_common(A1, B1, queue:in(Val, Common));
+        {{empty, _A}, _} ->
+            {Common, B};
+        %% Drop value from B.
+        %% Match value to avoid infinite loop, since {empty, B} = queue:out(B).
+        {_, {{value, _}, B1}} ->
+            find_common(A, B1, Common);
+        %% Drop value from A. Empty A should be matched by the second clause.
+        {{{value, _}, A1}, _} ->
+            find_common(A1, B, Common)
+    end.
+
+
+%% ---------------------------------------------------------------------------
+%% Members helpers
+%% ---------------------------------------------------------------------------
+
+with_member(Fun, Id, MembersState) ->
+    store_member(
+      Id, Fun(find_member_or_blank(Id, MembersState)), MembersState).
+
+with_member_acc(Fun, Id, {MembersState, Acc}) ->
+    {MemberState, Acc1} = Fun(find_member_or_blank(Id, MembersState), Acc),
+    {store_member(Id, MemberState, MembersState), Acc1}.
+
+find_member_or_blank(Id, MembersState) ->
+    case maps:find(Id, MembersState) of
+        {ok, Result} -> Result;
+        error        -> blank_member()
+    end.
+
+erase_member(Id, MembersState) -> maps:remove(Id, MembersState).
+
+blank_member() ->
+    #member { pending_ack = queue:new(), last_pub = -1, last_ack = -1 }.
+
+blank_member_state() -> maps:new().
+
+store_member(Id, MemberState, MembersState) ->
+    maps:put(Id, MemberState, MembersState).
+
+prepare_members_state(MembersState) -> maps:to_list(MembersState).
+
+build_members_state(MembersStateList) -> maps:from_list(MembersStateList).
+
+make_member(GroupName) ->
+    {case dirty_read_group(GroupName) of
+         #gm_group { version = Version } -> Version;
+         {error, not_found}              -> ?VERSION_START
+     end, self()}.
+
+remove_erased_members(MembersState, View) ->
+    lists:foldl(fun (Id, MembersState1) ->
+                        store_member(Id, find_member_or_blank(Id, MembersState),
+                                     MembersState1)
+                end, blank_member_state(), all_known_members(View)).
+
+get_version({Version, _Pid}) -> Version.
+
+get_pid({_Version, Pid}) -> Pid.
+
+get_pids(Ids) -> [Pid || {_Version, Pid} <- Ids].
+
+%% ---------------------------------------------------------------------------
+%% Activity assembly
+%% ---------------------------------------------------------------------------
+
+activity_nil() -> queue:new().
+
+activity_cons(   _Id,   [],   [], Tail) -> Tail;
+activity_cons(Sender, Pubs, Acks, Tail) -> queue:in({Sender, Pubs, Acks}, Tail).
+
+activity_finalise(Activity) -> queue:to_list(Activity).
+
+maybe_send_activity([], _State) ->
+    ok;
+maybe_send_activity(Activity, #state { self  = Self,
+                                       right = {Right, _MRefR},
+                                       view  = View }) ->
+    send_right(Right, View, {activity, Self, Activity}).
+
+send_right(Right, View, Msg) ->
+    ok = neighbour_cast(Right, {?TAG, view_version(View), Msg}).
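%% A worked example of the delta detection above, with pending-ack
%% queues shown as lists of publication numbers ({PubNum, Msg} pairs,
%% Msg elided):
%%
%%   A = [1, 2, 3, 4],  B = [3, 4, 5]
%%   find_prefix_common_suffix(A, B) =
%%       {[1, 2],   %% prefix: at the head of A only
%%        [3, 4],   %% common to both
%%        [5]}      %% suffix: at the tail of B only
%%
%% (each returned component is itself a queue). In the catchup
%% handling above, with A the local pending-acks and B the upstream's,
%% the prefix is interpreted as acknowledgements the upstream has
%% already applied and the suffix as publications we have missed.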
+ +calculate_activity(MembersState, Confirms, Activity, Self, View) -> + lists:foldl( + fun ({Id, Pubs, Acks}, MembersStateConfirmsActivity) -> + with_member_acc( + fun (Member = #member { pending_ack = PA, + last_pub = LP, + last_ack = LA }, + {Confirms2, Activity2}) -> + case is_member_alias(Id, Self, View) of + true -> + {ToAck, PA1} = + find_common(queue_from_pubs(Pubs), PA, + queue:new()), + LA1 = last_ack(Acks, LA), + AckNums = acks_from_queue(ToAck), + Confirms3 = maybe_confirm( + Self, Id, Confirms2, AckNums), + {Member #member { pending_ack = PA1, + last_ack = LA1 }, + {Confirms3, + activity_cons( + Id, [], AckNums, Activity2)}}; + false -> + PA1 = apply_acks(Acks, join_pubs(PA, Pubs)), + LA1 = last_ack(Acks, LA), + LP1 = last_pub(Pubs, LP), + {Member #member { pending_ack = PA1, + last_pub = LP1, + last_ack = LA1 }, + {Confirms2, + activity_cons(Id, Pubs, Acks, Activity2)}} + end + end, Id, MembersStateConfirmsActivity) + end, {MembersState, {Confirms, activity_nil()}}, Activity). + +callback(Args, Module, Activity) -> + Result = + lists:foldl( + fun ({Id, Pubs, _Acks}, {Args1, Module1, ok}) -> + lists:foldl(fun ({_PubNum, Pub}, Acc = {Args2, Module2, ok}) -> + case Module2:handle_msg( + Args2, get_pid(Id), Pub) of + ok -> + Acc; + {become, Module3, Args3} -> + {Args3, Module3, ok}; + {stop, _Reason} = Error -> + Error + end; + (_, Error = {stop, _Reason}) -> + Error + end, {Args1, Module1, ok}, Pubs); + (_, Error = {stop, _Reason}) -> + Error + end, {Args, Module, ok}, Activity), + case Result of + {Args, Module, ok} -> ok; + {Args1, Module1, ok} -> {become, Module1, Args1}; + {stop, _Reason} = Error -> Error + end. + +change_view(View, State = #state { view = View0, + module = Module, + callback_args = Args }) -> + OldMembers = all_known_members(View0), + NewMembers = all_known_members(View), + Births = NewMembers -- OldMembers, + Deaths = OldMembers -- NewMembers, + Result = case {Births, Deaths} of + {[], []} -> ok; + _ -> Module:members_changed( + Args, get_pids(Births), get_pids(Deaths)) + end, + {Result, check_neighbours(State #state { view = View })}. + +handle_callback_result({Result, State}) -> + if_callback_success( + Result, fun no_reply_true/3, fun no_reply_false/3, undefined, State); +handle_callback_result({Result, Reply, State}) -> + if_callback_success( + Result, fun reply_true/3, fun reply_false/3, Reply, State). + +no_reply_true (_Result, _Undefined, State) -> noreply(State). +no_reply_false({stop, Reason}, _Undefined, State) -> {stop, Reason, State}. + +reply_true (_Result, Reply, State) -> reply(Reply, State). +reply_false({stop, Reason}, Reply, State) -> {stop, Reason, Reply, State}. + +handle_msg_true (_Result, Msg, State) -> handle_msg(Msg, State). +handle_msg_false(Result, _Msg, State) -> {Result, State}. + +activity_true(_Result, Activity, State = #state { module = Module, + callback_args = Args }) -> + {callback(Args, Module, Activity), State}. +activity_false(Result, _Activity, State) -> + {Result, State}. + +if_callback_success(Result, True, False, Arg, State) -> + {NewResult, NewState} = maybe_stop(Result, State), + if_callback_success1(NewResult, True, False, Arg, NewState). + +if_callback_success1(ok, True, _False, Arg, State) -> + True(ok, Arg, State); +if_callback_success1( + {become, Module, Args} = Result, True, _False, Arg, State) -> + True(Result, Arg, State #state { module = Module, + callback_args = Args }); +if_callback_success1({stop, _Reason} = Result, _True, False, Arg, State) -> + False(Result, Arg, State). 
+
+maybe_stop({stop, Reason}, #state{ shutting_down = false } = State) ->
+    ShuttingDown = {true, Reason},
+    case has_pending_messages(State) of
+        true  -> {ok, State #state{ shutting_down = ShuttingDown }};
+        false -> {{stop, Reason}, State #state{ shutting_down = ShuttingDown }}
+    end;
+maybe_stop(Result, #state{ shutting_down = false } = State) ->
+    {Result, State};
+maybe_stop(Result, #state{ shutting_down = {true, Reason} } = State) ->
+    case has_pending_messages(State) of
+        true  -> {Result, State};
+        false -> {{stop, Reason}, State}
+    end.
+
+has_pending_messages(#state{ broadcast_buffer = Buffer })
+  when Buffer =/= [] ->
+    true;
+has_pending_messages(#state{ members_state = MembersState }) ->
+    MembersWithPubAckMismatches =
+        maps:filter(fun (_Id, #member{last_pub = LP, last_ack = LA}) ->
+                            LP =/= LA
+                    end, MembersState),
+    0 =/= maps:size(MembersWithPubAckMismatches).
+
+maybe_confirm(_Self, _Id, Confirms, []) ->
+    Confirms;
+maybe_confirm(Self, Self, Confirms, [PubNum | PubNums]) ->
+    case queue:out(Confirms) of
+        {empty, _Confirms} ->
+            Confirms;
+        {{value, {PubNum, From}}, Confirms1} ->
+            gen_server2:reply(From, ok),
+            maybe_confirm(Self, Self, Confirms1, PubNums);
+        {{value, {PubNum1, _From}}, _Confirms} when PubNum1 > PubNum ->
+            maybe_confirm(Self, Self, Confirms, PubNums)
+    end;
+maybe_confirm(_Self, _Id, Confirms, _PubNums) ->
+    Confirms.
+
+purge_confirms(Confirms) ->
+    _ = [gen_server2:reply(From, ok) ||
+            {_PubNum, From} <- queue:to_list(Confirms)],
+    queue:new().
+
+
+%% ---------------------------------------------------------------------------
+%% Msg transformation
+%% ---------------------------------------------------------------------------
+
+acks_from_queue(Q) -> [PubNum || {PubNum, _Msg} <- queue:to_list(Q)].
+
+pubs_from_queue(Q) -> queue:to_list(Q).
+
+queue_from_pubs(Pubs) -> queue:from_list(Pubs).
+
+apply_acks(  [], Pubs) -> Pubs;
+apply_acks(List, Pubs) -> {_, Pubs1} = queue:split(length(List), Pubs),
+                          Pubs1.
+
+join_pubs(Q, [])   -> Q;
+join_pubs(Q, Pubs) -> queue:join(Q, queue_from_pubs(Pubs)).
+
+last_ack(  [], LA) -> LA;
+last_ack(List, LA) -> LA1 = lists:last(List),
+                      true = LA1 > LA, %% ASSERTION
+                      LA1.
+
+last_pub(  [], LP) -> LP;
+last_pub(List, LP) -> {PubNum, _Msg} = lists:last(List),
+                      true = PubNum > LP, %% ASSERTION
+                      PubNum.
+
+%% ---------------------------------------------------------------------------
+
+%% Uninstrumented versions
+
+call(Pid, Msg, Timeout) -> gen_server2:call(Pid, Msg, Timeout).
+cast(Pid, Msg)          -> gen_server2:cast(Pid, Msg).
+monitor(Pid)            -> erlang:monitor(process, Pid).
+demonitor(MRef)         -> erlang:demonitor(MRef).
+
+check_membership(Self, #gm_group{members = M} = Group) ->
+    case lists:member(Self, M) of
+        true ->
+            Group;
+        false ->
+            throw(lost_membership)
+    end;
+check_membership(_Self, {error, not_found}) ->
+    throw(lost_membership).
+
+check_membership(GroupName) ->
+    case dirty_read_group(GroupName) of
+        #gm_group{members = M} ->
+            case lists:keymember(self(), 2, M) of
+                true ->
+                    ok;
+                false ->
+                    throw(lost_membership)
+            end;
+        {error, not_found} ->
+            throw(lost_membership)
+    end.
+
+check_group({error, not_found}) ->
+    throw(lost_membership);
+check_group(Any) ->
+    Any.