File 2614-pg-handle-race-condition-between-scope-DOWN-and-proc.patch of Package erlang
From 358c3b8ce5884bd888956e99dc79ecbd4e7550fe Mon Sep 17 00:00:00 2001
From: Maxim Fedorov <maximfca@gmail.com>
Date: Sat, 29 Feb 2020 21:08:15 -0500
Subject: [PATCH] pg: handle race condition between scope 'DOWN' and processes
leaving
Handles following sequence:
- nodes A and B share scope S, and are disconnected
- some process on node B leaves scope S at the same time
- node B processes leave request and (while 'nodedown' from A is in the
mailbox) attempts to send 'leave' broadcast to A
- node A already received and processes 'nodedown' from B, and removed
all processes from scope C
- node B establishes connection to A, and 'leave' message gets sent
- scope S on node A crashes, because it does not expect messages from
B (it is assumed to be down, and not yet participating in the overlay
network)
It also guards against other cases when nodes outside of overlay network
are trying to make changes.
---
lib/kernel/src/pg.erl | 60 ++++++++++++++++++++++++++++----------------
lib/kernel/test/pg_SUITE.erl | 36 +++++++++++++++++++++++++-
2 files changed, 74 insertions(+), 22 deletions(-)
diff --git a/lib/kernel/src/pg.erl b/lib/kernel/src/pg.erl
index 0668dd1f79..2c91e80930 100644
--- a/lib/kernel/src/pg.erl
+++ b/lib/kernel/src/pg.erl
@@ -264,30 +264,48 @@ handle_cast(_, _State) ->
%% remote pid or several pids joining the group
handle_info({join, Peer, Group, PidOrPids}, #state{scope = Scope, nodes = Nodes} = State) ->
- join_remote(Scope, Group, PidOrPids),
- % store remote group => pids map for fast sync operation
- {MRef, RemoteGroups} = maps:get(Peer, Nodes),
- NewRemoteGroups = join_remote_map(Group, PidOrPids, RemoteGroups),
- {noreply, State#state{nodes = Nodes#{Peer => {MRef, NewRemoteGroups}}}};
+ case maps:get(Peer, Nodes, []) of
+ {MRef, RemoteGroups} ->
+ join_remote(Scope, Group, PidOrPids),
+ %% store remote group => pids map for fast sync operation
+ NewRemoteGroups = join_remote_map(Group, PidOrPids, RemoteGroups),
+ {noreply, State#state{nodes = Nodes#{Peer => {MRef, NewRemoteGroups}}}};
+ [] ->
+ %% handle possible race condition, when remote node is flickering up/down,
+ %% and remote join can happen after the node left overlay network
+ %% It also handles the case when node outside of overlay network sends
+ %% unexpected join request.
+ {noreply, State}
+ end;
%% remote pid leaving (multiple groups at once)
handle_info({leave, Peer, PidOrPids, Groups}, #state{scope = Scope, nodes = Nodes} = State) ->
- _ = leave_remote(Scope, PidOrPids, Groups),
- {MRef, RemoteMap} = maps:get(Peer, Nodes),
- NewRemoteMap = lists:foldl(
- fun (Group, Acc) ->
- case maps:get(Group, Acc) of
- PidOrPids ->
- Acc;
- [PidOrPids] ->
- Acc;
- Existing when is_pid(PidOrPids) ->
- Acc#{Group => lists:delete(PidOrPids, Existing)};
- Existing ->
- Acc#{Group => Existing-- PidOrPids}
- end
- end, RemoteMap, Groups),
- {noreply, State#state{nodes = Nodes#{Peer => {MRef, NewRemoteMap}}}};
+ case maps:get(Peer, Nodes, []) of
+ {MRef, RemoteMap} ->
+ _ = leave_remote(Scope, PidOrPids, Groups),
+ NewRemoteMap = lists:foldl(
+ fun (Group, Acc) ->
+ case maps:get(Group, Acc) of
+ PidOrPids ->
+ Acc;
+ [PidOrPids] ->
+ Acc;
+ Existing when is_pid(PidOrPids) ->
+ Acc#{Group => lists:delete(PidOrPids, Existing)};
+ Existing ->
+ Acc#{Group => Existing-- PidOrPids}
+ end
+ end, RemoteMap, Groups),
+ {noreply, State#state{nodes = Nodes#{Peer => {MRef, NewRemoteMap}}}};
+ [] ->
+ %% Handle race condition: remote node disconnected, but scope process
+ %% of the remote node was just about to send 'leave' message. In this
+ %% case, local node handles 'DOWN' first, but then connection is
+ %% restored, and 'leave' message gets delivered when it's not expected.
+ %% It also handles the case when node outside of overlay network sends
+ %% unexpected leave request.
+ {noreply, State}
+ end;
%% we're being discovered, let's exchange!
handle_info({discover, Peer}, #state{scope = Scope, nodes = Nodes} = State) ->
diff --git a/lib/kernel/test/pg_SUITE.erl b/lib/kernel/test/pg_SUITE.erl
index bdb7abe99d..b338187e67 100644
--- a/lib/kernel/test/pg_SUITE.erl
+++ b/lib/kernel/test/pg_SUITE.erl
@@ -37,6 +37,7 @@
pg/0, pg/1,
errors/0, errors/1,
leave_exit_race/0, leave_exit_race/1,
+ overlay_missing/0, overlay_missing/1,
single/0, single/1,
two/1,
thundering_herd/0, thundering_herd/1,
@@ -94,7 +95,7 @@ all() ->
groups() ->
[
- {basic, [parallel], [errors, pg, leave_exit_race, single]},
+ {basic, [parallel], [errors, pg, leave_exit_race, single, overlay_missing]},
{performance, [sequential], [thundering_herd]},
{cluster, [parallel], [two, initial, netsplit, trisplit, foursplit,
exchange, nolocal, double, scope_restart, missing_scope_join,
@@ -170,6 +171,39 @@ single(Config) when is_list(Config) ->
?assertEqual(ok, pg:leave(?FUNCTION_NAME, ?FUNCTION_NAME, self())),
ok.
+overlay_missing() ->
+ [{doc, "Tests that scope process that is not a part of overlay network does not change state"}].
+
+overlay_missing(_Config) ->
+ {TwoPeer, Socket} = spawn_node(?FUNCTION_NAME, ?FUNCTION_NAME),
+ %% join self (sanity check)
+ ?assertEqual(ok, pg:join(?FUNCTION_NAME, group, self())),
+ %% remember pid from remote
+ PgPid = rpc:call(TwoPeer, erlang, whereis, [?FUNCTION_NAME]),
+ RemotePid = erlang:spawn(TwoPeer, forever()),
+ %% stop remote scope
+ gen_server:stop(PgPid),
+ %% craft white-box request: ensure it's rejected
+ ?FUNCTION_NAME ! {join, PgPid, group, RemotePid},
+ %% rejected!
+ ?assertEqual([self()], pg:get_members(?FUNCTION_NAME, group)),
+ %% ... reject leave too
+ ?FUNCTION_NAME ! {leave, PgPid, RemotePid, [group]},
+ ?assertEqual([self()], pg:get_members(?FUNCTION_NAME, group)),
+ %% join many times on remote
+ %RemotePids = [erlang:spawn(TwoPeer, forever()) || _ <- lists:seq(1, 1024)],
+ %?assertEqual(ok, rpc:call(TwoPeer, pg, join, [?FUNCTION_NAME, ?FUNCTION_NAME, Pid2])),
+ %% check they can't be joined locally
+ %?assertException(error, {nolocal, _}, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, RemotePid)),
+ %?assertException(error, {nolocal, _}, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, [RemotePid, RemotePid])),
+ %?assertException(error, {nolocal, _}, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, [LocalPid, RemotePid])),
+ %% check that non-pid also triggers error
+ %?assertException(error, function_clause, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, undefined)),
+ %?assertException(error, {nolocal, _}, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, [undefined])),
+ %% stop the peer
+ stop_node(TwoPeer, Socket).
+
+
two(Config) when is_list(Config) ->
{TwoPeer, Socket} = spawn_node(?FUNCTION_NAME, ?FUNCTION_NAME),
Pid = erlang:spawn(forever()),
--
2.16.4