File 2351-kernel-Add-a-Global-testcase-showing-weakness-of-alg.patch of Package erlang
From a769d10343ba676e550aa218a896fb88d3456ed5 Mon Sep 17 00:00:00 2001
From: Hans Bolinder <hasse@erlang.org>
Date: Mon, 7 Jun 2021 07:18:12 +0200
Subject: [PATCH 1/2] kernel: Add a Global testcase showing weakness of algo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The scenario of the testcase is described in
https://erlang.org/pipermail/erlang-questions/2020-October/100034.html:
Dániel Szoboszlay dszoboszlay@REDACTED
Mon Oct 12 11:05:19 CEST 2020
Hi,
Global can indeed end up in inconsistent states if some nodes get
disconnected from each other (so you're no longer running on a fully
connected mesh). Since when registering a global name on node X the change
is only propagated to nodes that X are directly connected to, you can end
up in a situation that X and Y are connected together, so they will both
know about the name, and Y and Z are connected together but X and Z are
not, so Z never gets the update.
When two nodes (re)connect, they only compare the names they locally know
about. So it is a bit tricky, but you can actually end up in a situation
when all nodes are connected, yet the global name databases are
inconsistent. You will need at least 4 nodes for this scenario to happen
(e.g. A, B, C & D):
1. All nodes are connected initially.
2. A gets disconnected from C.
3. A registers process X under some name: this gets propagated to B & D,
but not C.
4. B gets disconnected from D.
5. B re-registers process Y under some name: this gets propagated to A &
C, but not D, so on D the name still belongs to X.
6. A reconnects to C, since they both know the name belongs to Y they
will inform their half of the network about the new node, but won't issue
any global name updates.
7. You have all 4 nodes connected again, but A, B & C believe the name
belongs to Y, while D believes it belongs to X.
So this can happen, if you know how global works you can understand how it
can happen, but I don't think it would be expected by many people to
actually happen. :)
global:sync() is not really meant to resolve this error. The only solution
I know about is to manually compare global name registrations shortly after
you see a new node connecting.
Cheers,
Daniel
---
lib/kernel/test/global_SUITE.erl | 87 ++++++++++++++++++++++++++++++--
1 file changed, 84 insertions(+), 3 deletions(-)
diff --git a/lib/kernel/test/global_SUITE.erl b/lib/kernel/test/global_SUITE.erl
index bdc4d9ce16..9c4e272684 100644
--- a/lib/kernel/test/global_SUITE.erl
+++ b/lib/kernel/test/global_SUITE.erl
@@ -44,6 +44,7 @@
mass_death/1,
garbage_messages/1,
ring_line/1,
+ flaw1/1,
lost_connection/1,
lost_connection2/1,
global_disconnect/1
@@ -141,7 +142,7 @@ all() ->
simple_resolve2, simple_resolve3, leftover_name,
re_register_name, name_exit, external_nodes, many_nodes,
sync_0, global_groups_change, register_1, both_known_1,
- lost_unregister, mass_death, garbage_messages,
+ lost_unregister, mass_death, garbage_messages, flaw1,
lost_connection, lost_connection2, global_disconnect
]
end.
@@ -490,8 +491,9 @@ write_high_level_trace(Nodes, Config) ->
Node <- Nodes],
Dir = proplists:get_value(priv_dir, Config),
DataFile = filename:join([Dir, lists:concat(["global_", ?testcase])]),
- ?P("High-level trace on:"
- "~n ~p", [DataFile]),
+ io:format("\n\nAnalyze high level trace like this:\n"),
+ io:format("global_trace:dd(~p, [{show_state, 0, 10}]). % 10 seconds\n",
+ [DataFile]),
file:write_file(DataFile, term_to_binary({high_level_trace, When, Data})).
lock_global2(Id, Parent) ->
@@ -4281,6 +4283,84 @@ garbage_messages(Config) when is_list(Config) ->
init_condition(Config),
ok.
+%% This is scenario outlined in
+%% https://erlang.org/pipermail/erlang-questions/2020-October/100034.html.
+%% It illustrates that the algorithm of Global is flawed.
+flaw1(Config) ->
+ Timeout = 360,
+ ct:timetrap({seconds,Timeout}),
+ init_high_level_trace(Timeout),
+ init_condition(Config),
+ OrigNames = global:registered_names(),
+
+ [A, B, C, D] = OtherNodes = start_nodes([a, b, c, d], peer, Config),
+ Nodes = lists:sort([node() | OtherNodes]),
+ wait_for_ready_net(Config),
+
+ F1 =
+ fun(S0) ->
+ ct:sleep(100),
+ Str = "************",
+ S = Str ++ " " ++ lists:flatten(S0) ++ " " ++ Str,
+ io:format("~s\n", [S]),
+ [begin
+ RNs = rpc:call(N, global, registered_names, []),
+ W = rpc:call(N, global, whereis_name, [x]),
+ io:format(" === ~w ===\n", [N]),
+ io:format(" registered names: ~p", [RNs]),
+ io:format(" where is x: ~p", [W])
+ end || N <- OtherNodes]
+ end,
+ F1("start"),
+
+ true = rpc:call(A, erlang, disconnect_node, [C]),
+ F1("after disconnecting c from a"),
+
+ Pid = self(),
+ yes = rpc:call(A, global, register_name, [x, Pid]),
+ F1(io_lib:format("after registering x as ~p on a", [Pid])),
+
+ true = rpc:call(B, erlang, disconnect_node, [D]),
+ F1("after disconnecting d from b"),
+
+ Pid2 = whereis(global_name_server),
+ yes = rpc:call(B, global, re_register_name, [x, Pid2]),
+ F1(io_lib:format("after re_register_name x as ~p on b", [Pid2])),
+
+ pong = rpc:call(A, net_adm, ping, [C]),
+ F1("finished after ping c from a"),
+
+ pong = rpc:call(B, net_adm, ping, [D]),
+
+ timer:sleep(1000),
+
+ %% "You have all 4 nodes connected again, but A, B & C believe the
+ %% name belongs to Y, while D believes it belongs to X."
+ Pid2 = rpc:call(A, global, whereis_name, [x]),
+ Pid2 = rpc:call(B, global, whereis_name, [x]),
+ Pid2 = rpc:call(C, global, whereis_name, [x]),
+ Pid = rpc:call(D, global, whereis_name, [x]),
+
+ lists:foreach(fun(N) ->
+ rpc:call(N, ?MODULE, stop_tracer, [])
+ end, Nodes),
+ _ = rpc:call(A, global, unregister_name, [x]),
+
+ F1("after unregistering x on node a"),
+
+ %% _ = rpc:call(B, global, unregister_name, [y]),
+ %% F1("after unregistering y on node b"),
+
+ _ = rpc:call(C, global, unregister_name, [x]),
+ F1("after unregistering x on node c"),
+
+ ct:sleep(100),
+ OrigNames = global:registered_names(),
+ write_high_level_trace(Config),
+ stop_nodes(OtherNodes),
+ init_condition(Config),
+ ok.
+
global_disconnect(Config) when is_list(Config) ->
Timeout = 30,
ct:timetrap({seconds,Timeout}),
--
2.26.2