File 2614-pg-handle-race-condition-between-scope-DOWN-and-proc.patch of Package erlang

From 358c3b8ce5884bd888956e99dc79ecbd4e7550fe Mon Sep 17 00:00:00 2001
From: Maxim Fedorov <maximfca@gmail.com>
Date: Sat, 29 Feb 2020 21:08:15 -0500
Subject: [PATCH] pg: handle race condition between scope 'DOWN' and processes
 leaving

Handles following sequence:
 - nodes A and B share scope S, and are disconnected
 - some process on node B leaves scope S at the same time
 - node B processes leave request and (while 'nodedown' from A is in the
   mailbox) attempts to send 'leave' broadcast to A
 - node A already received and processes 'nodedown' from B, and removed
   all processes from scope C
 - node B establishes connection to A, and 'leave' message gets sent
 - scope S on node A crashes, because it does not expect messages from
   B (it is assumed to be down, and not yet participating in the overlay
   network)

It also guards against other cases when nodes outside of overlay network
are trying to make changes.
---
 lib/kernel/src/pg.erl        | 60 ++++++++++++++++++++++++++++----------------
 lib/kernel/test/pg_SUITE.erl | 36 +++++++++++++++++++++++++-
 2 files changed, 74 insertions(+), 22 deletions(-)

diff --git a/lib/kernel/src/pg.erl b/lib/kernel/src/pg.erl
index 0668dd1f79..2c91e80930 100644
--- a/lib/kernel/src/pg.erl
+++ b/lib/kernel/src/pg.erl
@@ -264,30 +264,48 @@ handle_cast(_, _State) ->
 
 %% remote pid or several pids joining the group
 handle_info({join, Peer, Group, PidOrPids}, #state{scope = Scope, nodes = Nodes} = State) ->
-    join_remote(Scope, Group, PidOrPids),
-    % store remote group => pids map for fast sync operation
-    {MRef, RemoteGroups} = maps:get(Peer, Nodes),
-    NewRemoteGroups = join_remote_map(Group, PidOrPids, RemoteGroups),
-    {noreply, State#state{nodes = Nodes#{Peer => {MRef, NewRemoteGroups}}}};
+    case maps:get(Peer, Nodes, []) of
+        {MRef, RemoteGroups} ->
+            join_remote(Scope, Group, PidOrPids),
+            %% store remote group => pids map for fast sync operation
+            NewRemoteGroups = join_remote_map(Group, PidOrPids, RemoteGroups),
+            {noreply, State#state{nodes = Nodes#{Peer => {MRef, NewRemoteGroups}}}};
+        [] ->
+            %% handle possible race condition, when remote node is flickering up/down,
+            %%  and remote join can happen after the node left overlay network
+            %% It also handles the case when node outside of overlay network sends
+            %%  unexpected join request.
+            {noreply, State}
+    end;
 
 %% remote pid leaving (multiple groups at once)
 handle_info({leave, Peer, PidOrPids, Groups}, #state{scope = Scope, nodes = Nodes} = State) ->
-    _ = leave_remote(Scope, PidOrPids, Groups),
-    {MRef, RemoteMap} = maps:get(Peer, Nodes),
-    NewRemoteMap = lists:foldl(
-        fun (Group, Acc) ->
-            case maps:get(Group, Acc) of
-                PidOrPids ->
-                    Acc;
-                [PidOrPids] ->
-                    Acc;
-                Existing when is_pid(PidOrPids) ->
-                    Acc#{Group => lists:delete(PidOrPids, Existing)};
-                Existing ->
-                    Acc#{Group => Existing-- PidOrPids}
-            end
-        end, RemoteMap, Groups),
-    {noreply, State#state{nodes = Nodes#{Peer => {MRef, NewRemoteMap}}}};
+    case maps:get(Peer, Nodes, []) of
+        {MRef, RemoteMap} ->
+            _ = leave_remote(Scope, PidOrPids, Groups),
+            NewRemoteMap = lists:foldl(
+                fun (Group, Acc) ->
+                    case maps:get(Group, Acc) of
+                        PidOrPids ->
+                            Acc;
+                        [PidOrPids] ->
+                            Acc;
+                        Existing when is_pid(PidOrPids) ->
+                            Acc#{Group => lists:delete(PidOrPids, Existing)};
+                        Existing ->
+                            Acc#{Group => Existing-- PidOrPids}
+                    end
+                end, RemoteMap, Groups),
+            {noreply, State#state{nodes = Nodes#{Peer => {MRef, NewRemoteMap}}}};
+        [] ->
+            %% Handle race condition: remote node disconnected, but scope process
+            %%  of the remote node was just about to send 'leave' message. In this
+            %%  case, local node handles 'DOWN' first, but then connection is
+            %%  restored, and 'leave' message gets delivered when it's not expected.
+            %% It also handles the case when node outside of overlay network sends
+            %%  unexpected leave request.
+            {noreply, State}
+    end;
 
 %% we're being discovered, let's exchange!
 handle_info({discover, Peer}, #state{scope = Scope, nodes = Nodes} = State) ->
diff --git a/lib/kernel/test/pg_SUITE.erl b/lib/kernel/test/pg_SUITE.erl
index bdb7abe99d..b338187e67 100644
--- a/lib/kernel/test/pg_SUITE.erl
+++ b/lib/kernel/test/pg_SUITE.erl
@@ -37,6 +37,7 @@
     pg/0, pg/1,
     errors/0, errors/1,
     leave_exit_race/0, leave_exit_race/1,
+    overlay_missing/0, overlay_missing/1,
     single/0, single/1,
     two/1,
     thundering_herd/0, thundering_herd/1,
@@ -94,7 +95,7 @@ all() ->
 
 groups() ->
     [
-        {basic, [parallel], [errors, pg, leave_exit_race, single]},
+        {basic, [parallel], [errors, pg, leave_exit_race, single, overlay_missing]},
         {performance, [sequential], [thundering_herd]},
         {cluster, [parallel], [two, initial, netsplit, trisplit, foursplit,
             exchange, nolocal, double, scope_restart, missing_scope_join,
@@ -170,6 +171,39 @@ single(Config) when is_list(Config) ->
     ?assertEqual(ok, pg:leave(?FUNCTION_NAME, ?FUNCTION_NAME, self())),
     ok.
 
+overlay_missing() ->
+    [{doc, "Tests that scope process that is not a part of overlay network does not change state"}].
+
+overlay_missing(_Config) ->
+    {TwoPeer, Socket} = spawn_node(?FUNCTION_NAME, ?FUNCTION_NAME),
+    %% join self (sanity check)
+    ?assertEqual(ok, pg:join(?FUNCTION_NAME, group, self())),
+    %% remember pid from remote
+    PgPid = rpc:call(TwoPeer, erlang, whereis, [?FUNCTION_NAME]),
+    RemotePid = erlang:spawn(TwoPeer, forever()),
+    %% stop remote scope
+    gen_server:stop(PgPid),
+    %% craft white-box request: ensure it's rejected
+    ?FUNCTION_NAME ! {join, PgPid, group, RemotePid},
+    %% rejected!
+    ?assertEqual([self()], pg:get_members(?FUNCTION_NAME, group)),
+    %% ... reject leave too
+    ?FUNCTION_NAME ! {leave, PgPid, RemotePid, [group]},
+    ?assertEqual([self()], pg:get_members(?FUNCTION_NAME, group)),
+    %% join many times on remote
+    %RemotePids = [erlang:spawn(TwoPeer, forever()) || _ <- lists:seq(1, 1024)],
+    %?assertEqual(ok, rpc:call(TwoPeer, pg, join, [?FUNCTION_NAME, ?FUNCTION_NAME, Pid2])),
+    %% check they can't be joined locally
+    %?assertException(error, {nolocal, _}, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, RemotePid)),
+    %?assertException(error, {nolocal, _}, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, [RemotePid, RemotePid])),
+    %?assertException(error, {nolocal, _}, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, [LocalPid, RemotePid])),
+    %% check that non-pid also triggers error
+    %?assertException(error, function_clause, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, undefined)),
+    %?assertException(error, {nolocal, _}, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, [undefined])),
+    %% stop the peer
+    stop_node(TwoPeer, Socket).
+
+
 two(Config) when is_list(Config) ->
     {TwoPeer, Socket} = spawn_node(?FUNCTION_NAME, ?FUNCTION_NAME),
     Pid = erlang:spawn(forever()),
-- 
2.16.4

openSUSE Build Service is sponsored by