diff options
author | Hans Bolinder <hasse@erlang.org> | 2021-05-05 07:41:59 +0200 |
---|---|---|
committer | Hans Bolinder <hasse@erlang.org> | 2021-05-28 14:20:59 +0200 |
commit | 834d68869c41d568afe2526a3afda5feee00b081 (patch) | |
tree | 4e6bf3b691188922c9f5e40dcf2e415ed16a8159 /lib/kernel/src/global.erl | |
parent | ae4f8649c0ed90abf177124a5c974abf89dd70e3 (diff) | |
download | erlang-834d68869c41d568afe2526a3afda5feee00b081.tar.gz |
kernel: Fix a race condition in Global
See also GH-4448 and GH-3923.
A race between locker processes on different nodes has been resolved
by using global_name_server as a proxy.
Diffstat (limited to 'lib/kernel/src/global.erl')
-rw-r--r-- | lib/kernel/src/global.erl | 60 |
1 files changed, 46 insertions, 14 deletions
diff --git a/lib/kernel/src/global.erl b/lib/kernel/src/global.erl index ff6674cd08..48359154d1 100644 --- a/lib/kernel/src/global.erl +++ b/lib/kernel/src/global.erl @@ -80,10 +80,12 @@ %% when communicating with vsn 3 nodes. (-R10B) %% Vsn 5 uses an ordered list of self() and HisTheLocker when locking %% nodes in the own partition. (R11B-) +%% Vsn 6 does not send any message between locker processes on different +%% nodes, but uses the server as a proxy. %% Current version of global does not support vsn 4 or earlier. --define(vsn, 5). +-define(vsn, 6). %%----------------------------------------------------------------- %% connect_all = boolean() - true if we are supposed to set up a @@ -156,6 +158,8 @@ %%% The new_nodes messages has been augmented with the global lock id. %%% %%% R14A (OTP-8527): The deleter process has been removed. +%%% +%%% Erlang/OTP 22.0: The extra process calling erlang:monitor() is removed. start() -> gen_server:start({local, global_name_server}, ?MODULE, [], []). @@ -567,17 +571,24 @@ init([]) -> %% {exchange, Node, ListOfNames, _ListOfNamesExt, Tag} %% {resolved, Node, HisOps, HisKnown, _Unused, ListOfNamesExt, Tag} %% HisKnown = list of known nodes in Node's partition -%% 2. Between lockers on connecting nodes -%% {his_locker, Pid} (from our global) -%% {lock, Bool} loop until both lockers have lock = true, -%% then send to global_name_server {lock_is_set, Node, Tag} -%% 3. Connecting node's global_name_server informs other nodes in the same +%% 2. Between global_name_server and my locker (the local locker) +%% {nodeup, Node, Tag} +%% {his_the_locker, Pid, {Vsn, HisKnown}, MyKnown} +%% {cancel, Node, Tag, Fun | no_fun} +%% {add_to_known, Nodes} +%% {remove_from_known, Nodes} +%% {lock_set, Pid, LockIsSet, HisKnown} (acting as proxy) +%% 3. Between lockers on connecting nodes (via proxy global_name_server) +%% {lock_set, Pid, LockIsSet, HisKnown} +%% loop until both lockers have LockIsSet =:= true, +%% then send to global_name_server {lock_is_set, Node, Tag, LockId} +%% 4. Connecting node's global_name_server informs other nodes in the same %% partition about hitherto unknown nodes in the other partition %% {new_nodes, Node, Ops, ListOfNamesExt, NewNodes, ExtraInfo} -%% 4. Between global_name_server and resolver +%% 5. Between global_name_server and resolver %% {resolve, NameList, Node} to resolver %% {exchange_ops, Node, Tag, Ops, Resolved} from resolver -%% 5. sync protocol, between global_name_servers in different partitions +%% 6. sync protocol, between global_name_servers in different partitions %% {in_sync, Node, IsKnown} %% sent by each node to all new nodes (Node becomes known to them) %%----------------------------------------------------------------- @@ -820,6 +831,11 @@ handle_cast({async_del_lock, _ResourceId, _Pid}, S) -> %% R14A nodes and later do not send async_del_lock messages. {noreply, S}; +handle_cast({lock_set, _Pid, _Set, _HisKnown} = Message, S) -> + #state{the_locker = Locker} = S, + Locker ! Message, + {noreply, S}; + handle_cast(Request, S) -> error_logger:warning_msg("The global_name_server " "received an unexpected message:\n" @@ -1488,7 +1504,13 @@ delete_global_name(_Name, _Pid) -> %% Version 1: used by unpatched R7. %% Version 2: the messages exchanged between the lockers include the known %% nodes (see OTP-3576). -%%----------------------------------------------------------------- + +%% As of version 6 of global, lockers (on different nodes) do not +%% communicate directly with each other. Instead messages are sent via +%% the server. This is in order to avoid race conditions where a +%% {lock_set, ...} message is received after (before) a nodedown +%% (nodeup). +%% ----------------------------------------------------------------- -define(locker_vsn, 2). @@ -1603,7 +1625,7 @@ the_locker_message({lock_set, Pid, true, _HisKnown}, S) -> {true, MyTag, HisVsn} -> LockId = locker_lock_id(Pid, HisVsn), {IsLockSet, S1} = lock_nodes_safely(LockId, [], S), - Pid ! {lock_set, self(), IsLockSet, S1#multi.known}, + send_lock_set(S1, IsLockSet, Pid, HisVsn), Known2 = [node() | S1#multi.known], ?trace({the_locker, spontaneous, {known2, Known2}, {node,Node}, {is_lock_set,IsLockSet}}), @@ -1630,7 +1652,7 @@ the_locker_message({lock_set, Pid, true, _HisKnown}, S) -> end; false -> ?trace({the_locker, not_there, {node,Node}}), - Pid ! {lock_set, self(), false, S#multi.known}, + send_lock_set(S, false, Pid, _HisVsn=5), loop_the_locker(S) end; the_locker_message({add_to_known, Nodes}, S) -> @@ -1672,8 +1694,7 @@ select_node(S) -> case IsLockSet of true -> Known1 = Us ++ S2#multi.known, - ?trace({sending_lock_set, self(), {his,HisTheLocker}}), - HisTheLocker ! {lock_set, self(), true, S2#multi.known}, + send_lock_set(S2, true, HisTheLocker, HisVsn), S3 = lock_is_set(S2, Him, MyTag, Known1, LockId), loop_the_locker(S3); false -> @@ -1681,7 +1702,18 @@ select_node(S) -> end end. -%% Version 5: Both sides use the same requester id. Thereby the nodes +send_lock_set(S, IsLockSet, HisTheLocker, Vsn) -> + ?trace({sending_lock_set, self(), {his,HisTheLocker}}), + Message = {lock_set, self(), IsLockSet, S#multi.known}, + if + Vsn < 6 -> + HisTheLocker ! Message, + ok; + true -> + gen_server:cast({global_name_server, node(HisTheLocker)}, Message) + end. + +%% Version 5-6: Both sides use the same requester id. Thereby the nodes %% common to both sides are locked by both locker processes. This %% means that the lock is still there when the 'new_nodes' message is %% received even if the other side has deleted the lock. |