File 2351-kernel-Add-a-Global-testcase-showing-weakness-of-alg.patch of Package erlang

From a769d10343ba676e550aa218a896fb88d3456ed5 Mon Sep 17 00:00:00 2001
From: Hans Bolinder <hasse@erlang.org>
Date: Mon, 7 Jun 2021 07:18:12 +0200
Subject: [PATCH 1/2] kernel: Add a Global testcase showing weakness of algo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The scenario of the testcase is described in
https://erlang.org/pipermail/erlang-questions/2020-October/100034.html:

Dániel Szoboszlay dszoboszlay@REDACTED
Mon Oct 12 11:05:19 CEST 2020

Hi,

Global can indeed end up in inconsistent states if some nodes get
disconnected from each other (so you're no longer running on a fully
connected mesh). Since when registering a global name on node X the change
is only propagated to nodes that X are directly connected to, you can end
up in a situation that X and Y are connected together, so they will both
know about the name, and Y and Z are connected together but X and Z are
not, so Z never gets the update.

When two nodes (re)connect, they only compare the names they locally know
about. So it is a bit tricky, but you can actually end up in a situation
when all nodes are connected, yet the global name databases are
inconsistent. You will need at least 4 nodes for this scenario to happen
(e.g. A, B, C & D):

   1. All nodes are connected initially.
   2. A gets disconnected from C.
   3. A registers process X under some name: this gets propagated to B & D,
   but not C.
   4. B gets disconnected from D.
   5. B re-registers process Y under some name: this gets propagated to A &
   C, but not D, so on D the name still belongs to X.
   6. A reconnects to C, since they both know the name belongs to Y they
   will inform their half of the network about the new node, but won't issue
   any global name updates.
   7. You have all 4 nodes connected again, but A, B & C believe the name
   belongs to Y, while D believes it belongs to X.

So this can happen, if you know how global works you can understand how it
can happen, but I don't think it would be expected by many people to
actually happen. :)

global:sync() is not really meant to resolve this error. The only solution
I know about is to manually compare global name registrations shortly after
you see a new node connecting.

Cheers,
Daniel
---
 lib/kernel/test/global_SUITE.erl | 87 ++++++++++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 3 deletions(-)

diff --git a/lib/kernel/test/global_SUITE.erl b/lib/kernel/test/global_SUITE.erl
index bdc4d9ce16..9c4e272684 100644
--- a/lib/kernel/test/global_SUITE.erl
+++ b/lib/kernel/test/global_SUITE.erl
@@ -44,6 +44,7 @@
 	 mass_death/1,
 	 garbage_messages/1,
          ring_line/1,
+         flaw1/1,
          lost_connection/1,
          lost_connection2/1,
          global_disconnect/1
@@ -141,7 +142,7 @@ all() ->
 	     simple_resolve2, simple_resolve3, leftover_name,
 	     re_register_name, name_exit, external_nodes, many_nodes,
 	     sync_0, global_groups_change, register_1, both_known_1,
-	     lost_unregister, mass_death, garbage_messages,
+	     lost_unregister, mass_death, garbage_messages, flaw1,
              lost_connection, lost_connection2, global_disconnect
             ]
     end.
@@ -490,8 +491,9 @@ write_high_level_trace(Nodes, Config) ->
                Node <- Nodes],
     Dir = proplists:get_value(priv_dir, Config),
     DataFile = filename:join([Dir, lists:concat(["global_", ?testcase])]),
-    ?P("High-level trace on:"
-       "~n      ~p", [DataFile]),
+    io:format("\n\nAnalyze high level trace like this:\n"),
+    io:format("global_trace:dd(~p, [{show_state, 0, 10}]). % 10 seconds\n",
+             [DataFile]),
     file:write_file(DataFile, term_to_binary({high_level_trace, When, Data})).
 
 lock_global2(Id, Parent) ->
@@ -4281,6 +4283,84 @@ garbage_messages(Config) when is_list(Config) ->
     init_condition(Config),
     ok.
 
+%% This is scenario outlined in
+%% https://erlang.org/pipermail/erlang-questions/2020-October/100034.html.
+%% It illustrates that the algorithm of Global is flawed.
+flaw1(Config) ->
+    Timeout = 360,
+    ct:timetrap({seconds,Timeout}),
+    init_high_level_trace(Timeout),
+    init_condition(Config),
+    OrigNames = global:registered_names(),
+
+    [A, B, C, D] = OtherNodes = start_nodes([a, b, c, d], peer, Config),
+    Nodes = lists:sort([node() | OtherNodes]),
+    wait_for_ready_net(Config),
+
+    F1 =
+        fun(S0) ->
+                ct:sleep(100),
+                Str = "************",
+                S = Str ++ "  " ++ lists:flatten(S0) ++ "  " ++ Str,
+                io:format("~s\n", [S]),
+                [begin
+                     RNs = rpc:call(N, global, registered_names, []),
+                     W = rpc:call(N, global, whereis_name, [x]),
+                     io:format("   === ~w ===\n", [N]),
+                     io:format("       registered names: ~p", [RNs]),
+                     io:format("       where is x:       ~p", [W])
+                 end || N <- OtherNodes]
+        end,
+    F1("start"),
+
+    true = rpc:call(A, erlang, disconnect_node, [C]),
+    F1("after disconnecting c from a"),
+
+    Pid = self(),
+    yes = rpc:call(A, global, register_name, [x, Pid]),
+    F1(io_lib:format("after registering x as ~p on a", [Pid])),
+
+    true = rpc:call(B, erlang, disconnect_node, [D]),
+    F1("after disconnecting d from b"),
+
+    Pid2 = whereis(global_name_server),
+    yes = rpc:call(B, global, re_register_name, [x, Pid2]),
+    F1(io_lib:format("after re_register_name x as ~p on b", [Pid2])),
+
+    pong = rpc:call(A, net_adm, ping, [C]),
+    F1("finished after ping c from a"),
+
+    pong = rpc:call(B, net_adm, ping, [D]),
+
+    timer:sleep(1000),
+
+    %% "You have all 4 nodes connected again, but A, B & C believe the
+    %% name belongs to Y, while D believes it belongs to X."
+    Pid2 = rpc:call(A, global, whereis_name, [x]),
+    Pid2 = rpc:call(B, global, whereis_name, [x]),
+    Pid2 = rpc:call(C, global, whereis_name, [x]),
+    Pid  = rpc:call(D, global, whereis_name, [x]),
+
+    lists:foreach(fun(N) ->
+			  rpc:call(N, ?MODULE, stop_tracer, [])
+		  end, Nodes),
+    _ = rpc:call(A, global, unregister_name, [x]),
+
+    F1("after unregistering x on node a"),
+
+    %% _ = rpc:call(B, global, unregister_name, [y]),
+    %% F1("after unregistering y on node b"),
+
+    _ = rpc:call(C, global, unregister_name, [x]),
+    F1("after unregistering x on node c"),
+
+    ct:sleep(100),
+    OrigNames = global:registered_names(),
+    write_high_level_trace(Config),
+    stop_nodes(OtherNodes),
+    init_condition(Config),
+    ok.
+
 global_disconnect(Config) when is_list(Config) ->
     Timeout = 30,
     ct:timetrap({seconds,Timeout}),
-- 
2.26.2

openSUSE Build Service is sponsored by