File 1217-kernel-Fix-a-race-condition-in-Global.patch of Package erlang

From f2bd4a073961b4788a3b0a4e265b840a20f453eb Mon Sep 17 00:00:00 2001
From: Hans Bolinder <hasse@erlang.org>
Date: Wed, 5 May 2021 07:41:59 +0200
Subject: [PATCH] kernel: Fix a race condition in Global

See also GH-4448 and GH-3923.

A race between locker processes on different nodes has been resolved
by using global_name_server as a proxy.
---
 lib/kernel/src/global.erl        | 60 ++++++++++++++++++++++++--------
 lib/kernel/test/global_SUITE.erl | 26 ++++++++++----
 2 files changed, 66 insertions(+), 20 deletions(-)

diff --git a/lib/kernel/src/global.erl b/lib/kernel/src/global.erl
index 3875074d74..491f60688a 100644
--- a/lib/kernel/src/global.erl
+++ b/lib/kernel/src/global.erl
@@ -80,10 +80,12 @@
 %%       when communicating with vsn 3 nodes. (-R10B)
 %% Vsn 5 uses an ordered list of self() and HisTheLocker when locking
 %%       nodes in the own partition. (R11B-)
+%% Vsn 6 does not send any message between locker processes on different
+%%       nodes, but uses the server as a proxy.
 
 %% Current version of global does not support vsn 4 or earlier.
 
--define(vsn, 5).
+-define(vsn, 6).
 
 %%-----------------------------------------------------------------
 %% connect_all = boolean() - true if we are supposed to set up a
@@ -156,6 +158,8 @@
 %%% The new_nodes messages has been augmented with the global lock id.
 %%%
 %%% R14A (OTP-8527): The deleter process has been removed.
+%%%
+%%% Erlang/OTP 22.0: The extra process calling erlang:monitor() is removed.
 
 start() -> 
     gen_server:start({local, global_name_server}, ?MODULE, [], []).
@@ -567,17 +571,24 @@ init([]) ->
 %%    {exchange, Node, ListOfNames, _ListOfNamesExt, Tag}
 %%    {resolved, Node, HisOps, HisKnown, _Unused, ListOfNamesExt, Tag}
 %%         HisKnown = list of known nodes in Node's partition
-%% 2. Between lockers on connecting nodes
-%%    {his_locker, Pid} (from our global)
-%%    {lock, Bool} loop until both lockers have lock = true,
-%%          then send to global_name_server {lock_is_set, Node, Tag}
-%% 3. Connecting node's global_name_server informs other nodes in the same 
+%% 2. Between global_name_server and my locker (the local locker)
+%%    {nodeup, Node, Tag}
+%%    {his_the_locker, Pid, {Vsn, HisKnown}, MyKnown}
+%%    {cancel, Node, Tag, Fun | no_fun}
+%%    {add_to_known, Nodes}
+%%    {remove_from_known, Nodes}
+%%    {lock_set, Pid, LockIsSet, HisKnown} (acting as proxy)
+%% 3. Between lockers on connecting nodes (via proxy global_name_server)
+%%    {lock_set, Pid, LockIsSet, HisKnown}
+%%      loop until both lockers have LockIsSet =:= true,
+%%      then send to global_name_server {lock_is_set, Node, Tag, LockId}
+%% 4. Connecting node's global_name_server informs other nodes in the same
 %%    partition about hitherto unknown nodes in the other partition
 %%    {new_nodes, Node, Ops, ListOfNamesExt, NewNodes, ExtraInfo}
-%% 4. Between global_name_server and resolver
+%% 5. Between global_name_server and resolver
 %%    {resolve, NameList, Node} to resolver
 %%    {exchange_ops, Node, Tag, Ops, Resolved} from resolver
-%% 5. sync protocol, between global_name_servers in different partitions
+%% 6. sync protocol, between global_name_servers in different partitions
 %%    {in_sync, Node, IsKnown}
 %%          sent by each node to all new nodes (Node becomes known to them)
 %%-----------------------------------------------------------------
@@ -820,6 +831,11 @@ handle_cast({async_del_lock, _ResourceId, _Pid}, S) ->
     %% R14A nodes and later do not send async_del_lock messages.
     {noreply, S};
 
+handle_cast({lock_set, _Pid, _Set, _HisKnown} = Message, S) ->
+    #state{the_locker = Locker} = S,
+    Locker ! Message,
+    {noreply, S};
+
 handle_cast(Request, S) ->
     error_logger:warning_msg("The global_name_server "
                              "received an unexpected message:\n"
@@ -1488,7 +1504,13 @@ delete_global_name(_Name, _Pid) ->
 %% Version 1: used by unpatched R7.
 %% Version 2: the messages exchanged between the lockers include the known 
 %%            nodes (see OTP-3576).
-%%-----------------------------------------------------------------
+
+%% As of version 6 of global, lockers (on different nodes) do not
+%% communicate directly with each other. Instead messages are sent via
+%% the server. This is in order to avoid race conditions where a
+%% {lock_set, ...} message is received after (before) a nodedown
+%% (nodeup).
+%% -----------------------------------------------------------------
 
 -define(locker_vsn, 2).
 
@@ -1603,7 +1625,7 @@ the_locker_message({lock_set, Pid, true, _HisKnown}, S) ->
         {true, MyTag, HisVsn} ->
             LockId = locker_lock_id(Pid, HisVsn),
             {IsLockSet, S1} = lock_nodes_safely(LockId, [], S),
-            Pid ! {lock_set, self(), IsLockSet, S1#multi.known},
+            send_lock_set(S1, IsLockSet, Pid, HisVsn),
             Known2 = [node() | S1#multi.known],
             ?trace({the_locker, spontaneous, {known2, Known2},
                     {node,Node}, {is_lock_set,IsLockSet}}),
@@ -1630,7 +1652,7 @@ the_locker_message({lock_set, Pid, true, _HisKnown}, S) ->
             end;
         false ->
             ?trace({the_locker, not_there, {node,Node}}),
-            Pid ! {lock_set, self(), false, S#multi.known},
+            send_lock_set(S, false, Pid, _HisVsn=5),
             loop_the_locker(S)
     end;
 the_locker_message({add_to_known, Nodes}, S) ->
@@ -1672,8 +1694,7 @@ select_node(S) ->
             case IsLockSet of
                 true -> 
                     Known1 = Us ++ S2#multi.known,
-                    ?trace({sending_lock_set, self(), {his,HisTheLocker}}),
-                    HisTheLocker ! {lock_set, self(), true, S2#multi.known},
+                    send_lock_set(S2, true, HisTheLocker, HisVsn),
                     S3 = lock_is_set(S2, Him, MyTag, Known1, LockId),
                     loop_the_locker(S3);
                 false ->
@@ -1681,7 +1702,18 @@ select_node(S) ->
             end
     end.
 
-%% Version 5: Both sides use the same requester id. Thereby the nodes
+send_lock_set(S, IsLockSet, HisTheLocker, Vsn) ->
+    ?trace({sending_lock_set, self(), {his,HisTheLocker}}),
+    Message = {lock_set, self(), IsLockSet, S#multi.known},
+    if
+        Vsn < 6 ->
+            HisTheLocker ! Message,
+            ok;
+        true ->
+            gen_server:cast({global_name_server, node(HisTheLocker)}, Message)
+    end.
+
+%% Version 5-6: Both sides use the same requester id. Thereby the nodes
 %% common to both sides are locked by both locker processes. This
 %% means that the lock is still there when the 'new_nodes' message is
 %% received even if the other side has deleted the lock.
diff --git a/lib/kernel/test/global_SUITE.erl b/lib/kernel/test/global_SUITE.erl
index 5bff9cc292..3837a44c64 100644
--- a/lib/kernel/test/global_SUITE.erl
+++ b/lib/kernel/test/global_SUITE.erl
@@ -391,6 +393,7 @@ write_high_level_trace(Nodes, Config) ->
                Node <- Nodes],
     Dir = proplists:get_value(priv_dir, Config),
     DataFile = filename:join([Dir, lists:concat(["global_", ?testcase])]),
+    io:format("High-level trace on ~p\n", [DataFile]),
     file:write_file(DataFile, term_to_binary({high_level_trace, When, Data})).
 
 lock_global2(Id, Parent) ->
@@ -2762,11 +2765,11 @@ many_nodes(Config) when is_list(Config) ->
                                 Osname =:= openbsd; 
                                 Osname =:= darwin ->
                 N_nodes = quite_a_few_nodes(32),
-                {node_rel(1, N_nodes, this), N_nodes};
+                {node_rel(1, N_nodes), N_nodes};
             {unix, _} ->
-                {node_rel(1, 32, this), 32};
+                {node_rel(1, 32), 32};
             _ -> 
-                {node_rel(1, 32, this), 32}
+                {node_rel(1, 32), 32}
         end,
     Cps = [begin {ok, Cp} = start_node_rel(Name, Rel, Config), Cp end ||
 	      {Name,Rel} <- Rels],
@@ -2814,8 +2817,19 @@ many_nodes(Config) when is_list(Config) ->
     io:format("~s~n", [Return]),
     {comment, Return}.
 
-node_rel(From, To, Rel) ->
-    [{lists:concat([cp, N]), Rel} || N <- lists:seq(From, To)].
+node_rel(From, To) ->
+    NodeNumbers = lists:seq(From, To),
+    Release = erlang:system_info(otp_release),
+    LastRelease = integer_to_list(list_to_integer(Release) - 1) ++ "_latest",
+    Last = case test_server:is_release_available(LastRelease) of
+               true -> list_to_atom(LastRelease);
+               false -> this
+           end,
+    [{lists:concat([cp, N]),
+      case N rem 2 of
+          0 -> this;
+          1 -> Last
+      end} || N <- NodeNumbers].
 
 isolated_node(File, GoFile, Nodes, Config) ->
     Ns = lists:sort(Nodes),
@@ -4061,7 +4075,7 @@ init_condition(Config) ->
     io:format("globally registered names: ~p~n", [global:registered_names()]),
     io:format("nodes: ~p~n", [nodes()]),
     io:format("known: ~p~n", [get_known(node()) -- [node()]]),
-    io:format("Info ~p~n", [setelement(11, global:info(), trace)]),
+    io:format("Info ~p~n", [setelement(10, global:info(), trace)]),
     _ = [io:format("~s: ~p~n", [TN, ets:tab2list(T)]) ||
             {TN, T} <- [{"Global Names     (ETS)", global_names},
                         {"Global Names Ext (ETS)", global_names_ext},
-- 
2.26.2

openSUSE Build Service is sponsored by