Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
home:Ledest:erlang:19
erlang
2614-pg-handle-race-condition-between-scope-DOW...
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File 2614-pg-handle-race-condition-between-scope-DOWN-and-proc.patch of Package erlang
From 358c3b8ce5884bd888956e99dc79ecbd4e7550fe Mon Sep 17 00:00:00 2001 From: Maxim Fedorov <maximfca@gmail.com> Date: Sat, 29 Feb 2020 21:08:15 -0500 Subject: [PATCH] pg: handle race condition between scope 'DOWN' and processes leaving Handles following sequence: - nodes A and B share scope S, and are disconnected - some process on node B leaves scope S at the same time - node B processes leave request and (while 'nodedown' from A is in the mailbox) attempts to send 'leave' broadcast to A - node A already received and processes 'nodedown' from B, and removed all processes from scope C - node B establishes connection to A, and 'leave' message gets sent - scope S on node A crashes, because it does not expect messages from B (it is assumed to be down, and not yet participating in the overlay network) It also guards against other cases when nodes outside of overlay network are trying to make changes. --- lib/kernel/src/pg.erl | 60 ++++++++++++++++++++++++++++---------------- lib/kernel/test/pg_SUITE.erl | 36 +++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 22 deletions(-) diff --git a/lib/kernel/src/pg.erl b/lib/kernel/src/pg.erl index 0668dd1f79..2c91e80930 100644 --- a/lib/kernel/src/pg.erl +++ b/lib/kernel/src/pg.erl @@ -264,30 +264,48 @@ handle_cast(_, _State) -> %% remote pid or several pids joining the group handle_info({join, Peer, Group, PidOrPids}, #state{scope = Scope, nodes = Nodes} = State) -> - join_remote(Scope, Group, PidOrPids), - % store remote group => pids map for fast sync operation - {MRef, RemoteGroups} = maps:get(Peer, Nodes), - NewRemoteGroups = join_remote_map(Group, PidOrPids, RemoteGroups), - {noreply, State#state{nodes = Nodes#{Peer => {MRef, NewRemoteGroups}}}}; + case maps:get(Peer, Nodes, []) of + {MRef, RemoteGroups} -> + join_remote(Scope, Group, PidOrPids), + %% store remote group => pids map for fast sync operation + NewRemoteGroups = join_remote_map(Group, PidOrPids, RemoteGroups), + {noreply, State#state{nodes = Nodes#{Peer => {MRef, NewRemoteGroups}}}}; + [] -> + %% handle possible race condition, when remote node is flickering up/down, + %% and remote join can happen after the node left overlay network + %% It also handles the case when node outside of overlay network sends + %% unexpected join request. + {noreply, State} + end; %% remote pid leaving (multiple groups at once) handle_info({leave, Peer, PidOrPids, Groups}, #state{scope = Scope, nodes = Nodes} = State) -> - _ = leave_remote(Scope, PidOrPids, Groups), - {MRef, RemoteMap} = maps:get(Peer, Nodes), - NewRemoteMap = lists:foldl( - fun (Group, Acc) -> - case maps:get(Group, Acc) of - PidOrPids -> - Acc; - [PidOrPids] -> - Acc; - Existing when is_pid(PidOrPids) -> - Acc#{Group => lists:delete(PidOrPids, Existing)}; - Existing -> - Acc#{Group => Existing-- PidOrPids} - end - end, RemoteMap, Groups), - {noreply, State#state{nodes = Nodes#{Peer => {MRef, NewRemoteMap}}}}; + case maps:get(Peer, Nodes, []) of + {MRef, RemoteMap} -> + _ = leave_remote(Scope, PidOrPids, Groups), + NewRemoteMap = lists:foldl( + fun (Group, Acc) -> + case maps:get(Group, Acc) of + PidOrPids -> + Acc; + [PidOrPids] -> + Acc; + Existing when is_pid(PidOrPids) -> + Acc#{Group => lists:delete(PidOrPids, Existing)}; + Existing -> + Acc#{Group => Existing-- PidOrPids} + end + end, RemoteMap, Groups), + {noreply, State#state{nodes = Nodes#{Peer => {MRef, NewRemoteMap}}}}; + [] -> + %% Handle race condition: remote node disconnected, but scope process + %% of the remote node was just about to send 'leave' message. In this + %% case, local node handles 'DOWN' first, but then connection is + %% restored, and 'leave' message gets delivered when it's not expected. + %% It also handles the case when node outside of overlay network sends + %% unexpected leave request. + {noreply, State} + end; %% we're being discovered, let's exchange! handle_info({discover, Peer}, #state{scope = Scope, nodes = Nodes} = State) -> diff --git a/lib/kernel/test/pg_SUITE.erl b/lib/kernel/test/pg_SUITE.erl index bdb7abe99d..b338187e67 100644 --- a/lib/kernel/test/pg_SUITE.erl +++ b/lib/kernel/test/pg_SUITE.erl @@ -37,6 +37,7 @@ pg/0, pg/1, errors/0, errors/1, leave_exit_race/0, leave_exit_race/1, + overlay_missing/0, overlay_missing/1, single/0, single/1, two/1, thundering_herd/0, thundering_herd/1, @@ -94,7 +95,7 @@ all() -> groups() -> [ - {basic, [parallel], [errors, pg, leave_exit_race, single]}, + {basic, [parallel], [errors, pg, leave_exit_race, single, overlay_missing]}, {performance, [sequential], [thundering_herd]}, {cluster, [parallel], [two, initial, netsplit, trisplit, foursplit, exchange, nolocal, double, scope_restart, missing_scope_join, @@ -170,6 +171,39 @@ single(Config) when is_list(Config) -> ?assertEqual(ok, pg:leave(?FUNCTION_NAME, ?FUNCTION_NAME, self())), ok. +overlay_missing() -> + [{doc, "Tests that scope process that is not a part of overlay network does not change state"}]. + +overlay_missing(_Config) -> + {TwoPeer, Socket} = spawn_node(?FUNCTION_NAME, ?FUNCTION_NAME), + %% join self (sanity check) + ?assertEqual(ok, pg:join(?FUNCTION_NAME, group, self())), + %% remember pid from remote + PgPid = rpc:call(TwoPeer, erlang, whereis, [?FUNCTION_NAME]), + RemotePid = erlang:spawn(TwoPeer, forever()), + %% stop remote scope + gen_server:stop(PgPid), + %% craft white-box request: ensure it's rejected + ?FUNCTION_NAME ! {join, PgPid, group, RemotePid}, + %% rejected! + ?assertEqual([self()], pg:get_members(?FUNCTION_NAME, group)), + %% ... reject leave too + ?FUNCTION_NAME ! {leave, PgPid, RemotePid, [group]}, + ?assertEqual([self()], pg:get_members(?FUNCTION_NAME, group)), + %% join many times on remote + %RemotePids = [erlang:spawn(TwoPeer, forever()) || _ <- lists:seq(1, 1024)], + %?assertEqual(ok, rpc:call(TwoPeer, pg, join, [?FUNCTION_NAME, ?FUNCTION_NAME, Pid2])), + %% check they can't be joined locally + %?assertException(error, {nolocal, _}, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, RemotePid)), + %?assertException(error, {nolocal, _}, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, [RemotePid, RemotePid])), + %?assertException(error, {nolocal, _}, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, [LocalPid, RemotePid])), + %% check that non-pid also triggers error + %?assertException(error, function_clause, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, undefined)), + %?assertException(error, {nolocal, _}, pg:join(?FUNCTION_NAME, ?FUNCTION_NAME, [undefined])), + %% stop the peer + stop_node(TwoPeer, Socket). + + two(Config) when is_list(Config) -> {TwoPeer, Socket} = spawn_node(?FUNCTION_NAME, ?FUNCTION_NAME), Pid = erlang:spawn(forever()), -- 2.16.4
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor