File 4501-inet_tls_dist-make-TLS-handshake-concurrent.patch of Package erlang
From 14bc45e39443eeb10adad352c318f6d3807b3745 Mon Sep 17 00:00:00 2001
From: Maxim Fedorov <dane@whatsapp.com>
Date: Tue, 9 Jun 2020 19:21:26 -0700
Subject: [PATCH] inet_tls_dist: make TLS handshake concurrent
TLS handshake may take considerable amount of time, so it should
be done in a different process, not accepting process itself.
Also, there is no need to kill the accepting process if TLS
handshake fails.
Spawned accepting process runs with normal priority. There is
at most one accepting process at a time. When TCP connection is
accepted, the process is added to map of processes pending TLS
handshake. Amount of pending TLS attempts is limited to the
same amount as listening socket backlog, option {backlog, B}
of inet_dist_listen_options.
If accepting process crashes or returns error during TCP
accept phase, listener process exits as well, to close listening
socket, which is most likely unusable - there is no other
reason for accept to fail.
---
lib/ssl/src/inet_tls_dist.erl | 98 ++++++++++++++++++++++++---------
lib/ssl/test/ssl_dist_SUITE.erl | 58 +++++++++++++++++++
2 files changed, 129 insertions(+), 27 deletions(-)
diff --git a/lib/ssl/src/inet_tls_dist.erl b/lib/ssl/src/inet_tls_dist.erl
index eaa481f119..5cfbeff387 100644
--- a/lib/ssl/src/inet_tls_dist.erl
+++ b/lib/ssl/src/inet_tls_dist.erl
@@ -223,27 +223,62 @@ gen_accept(Driver, Listen) ->
monitor_pid(
spawn_opt(
fun () ->
- accept_loop(Driver, Listen, Kernel)
+ process_flag(trap_exit, true),
+ LOpts = application:get_env(kernel, inet_dist_listen_options, []),
+ MaxPending =
+ case lists:keyfind(backlog, 1, LOpts) of
+ {backlog, Backlog} -> Backlog;
+ false -> 128
+ end,
+ DLK = {Driver, Listen, Kernel},
+ accept_loop(DLK, spawn_accept(DLK), MaxPending, #{})
end,
[link, {priority, max}])).
-accept_loop(Driver, Listen, Kernel) ->
- case Driver:accept(Listen) of
- {ok, Socket} ->
- case check_ip(Driver, Socket) of
- true ->
- accept_loop(Driver, Listen, Kernel, Socket);
- {false,IP} ->
- ?LOG_ERROR(
- "** Connection attempt from "
- "disallowed IP ~w ** ~n", [IP]),
- ?shutdown2(no_node, trace({disallowed, IP}))
- end;
- Error ->
- exit(trace(Error))
- end.
+%% Concurrent accept loop will spawn a new HandshakePid when
+%% there is no HandshakePid already running, and Pending map is
+%% smaller than MaxPending
+accept_loop(DLK, undefined, MaxPending, Pending) when map_size(Pending) < MaxPending ->
+ accept_loop(DLK, spawn_accept(DLK), MaxPending, Pending);
+accept_loop(DLK, HandshakePid, MaxPending, Pending) ->
+ receive
+ {continue, HandshakePid} when is_pid(HandshakePid) ->
+ accept_loop(DLK, undefined, MaxPending, Pending#{HandshakePid => true});
+ {'EXIT', Pid, Reason} when is_map_key(Pid, Pending) ->
+ Reason =/= normal andalso
+ ?LOG_ERROR("TLS distribution handshake failed: ~p~n", [Reason]),
+ accept_loop(DLK, HandshakePid, MaxPending, maps:remove(Pid, Pending));
+ {'EXIT', HandshakePid, Reason} when is_pid(HandshakePid) ->
+ %% HandshakePid crashed before turning into Pending, which means
+ %% error happened in accept. Need to restart the listener.
+ exit(Reason);
+ Unexpected ->
+ ?LOG_WARNING("TLS distribution: unexpected message: ~p~n" ,[Unexpected]),
+ accept_loop(DLK, HandshakePid, MaxPending, Pending)
+ end.
+
+spawn_accept({Driver, Listen, Kernel}) ->
+ AcceptLoop = self(),
+ spawn_link(
+ fun () ->
+ case Driver:accept(Listen) of
+ {ok, Socket} ->
+ AcceptLoop ! {continue, self()},
+ case check_ip(Driver, Socket) of
+ true ->
+ accept_one(Driver, Kernel, Socket);
+ {false,IP} ->
+ ?LOG_ERROR(
+ "** Connection attempt from "
+ "disallowed IP ~w ** ~n", [IP]),
+ trace({disallowed, IP})
+ end;
+ Error ->
+ exit(Error)
+ end
+ end).
-accept_loop(Driver, Listen, Kernel, Socket) ->
+accept_one(Driver, Kernel, Socket) ->
Opts = setup_verify_client(Socket, get_ssl_options(server)),
wait_for_code_server(),
case
@@ -259,13 +294,18 @@ accept_loop(Driver, Listen, Kernel, Socket) ->
Driver:family(), tls}),
receive
{Kernel, controller, Pid} ->
- ok = ssl:controlling_process(SslSocket, Pid),
- trace(
- Pid ! {self(), controller});
+ case ssl:controlling_process(SslSocket, Pid) of
+ ok ->
+ trace(Pid ! {self(), controller});
+ Error ->
+ trace(Pid ! {self(), exit}),
+ ?LOG_ERROR(
+ "Cannot control TLS distribution connection: ~p~n",
+ [Error])
+ end;
{Kernel, unsupported_protocol} ->
- exit(trace(unsupported_protocol))
- end,
- accept_loop(Driver, Listen, Kernel);
+ trace(unsupported_protocol)
+ end;
{error, {options, _}} = Error ->
%% Bad options: that's probably our fault.
%% Let's log that.
@@ -273,10 +313,10 @@ accept_loop(Driver, Listen, Kernel, Socket) ->
"Cannot accept TLS distribution connection: ~s~n",
[ssl:format_error(Error)]),
gen_tcp:close(Socket),
- exit(trace(Error));
+ trace(Error);
Other ->
gen_tcp:close(Socket),
- exit(trace(Other))
+ trace(Other)
end.
@@ -412,9 +452,9 @@ gen_accept_connection(
do_accept(
_Driver, AcceptPid, DistCtrl, MyNode, Allowed, SetupTime, Kernel) ->
- {ok, SslSocket} = tls_sender:dist_tls_socket(DistCtrl),
receive
{AcceptPid, controller} ->
+ {ok, SslSocket} = tls_sender:dist_tls_socket(DistCtrl),
Timer = dist_util:start_timer(SetupTime),
NewAllowed = allowed_nodes(SslSocket, Allowed),
HSData0 = hs_data_common(SslSocket),
@@ -427,7 +467,11 @@ do_accept(
this_flags = 0,
allowed = NewAllowed},
link(DistCtrl),
- dist_util:handshake_other_started(trace(HSData))
+ dist_util:handshake_other_started(trace(HSData));
+ {AcceptPid, exit} ->
+ %% this can happen when connection was initiated, but dropped
+ %% between TLS handshake completion and dist handshake start
+ ?shutdown2(MyNode, connection_setup_failed)
end.
allowed_nodes(_SslSocket, []) ->
diff --git a/lib/ssl/test/ssl_dist_SUITE.erl b/lib/ssl/test/ssl_dist_SUITE.erl
index 3b0c4d8c09..2123ac5def 100644
--- a/lib/ssl/test/ssl_dist_SUITE.erl
+++ b/lib/ssl/test/ssl_dist_SUITE.erl
@@ -39,6 +39,8 @@
basic/1,
payload/0,
payload/1,
+ dist_port_overload/0,
+ dist_port_overload/1,
plain_options/0,
plain_options/1,
plain_verify_options/0,
@@ -91,6 +93,7 @@ start_ssl_node_name(Name, Args) ->
all() ->
[basic,
payload,
+ dist_port_overload,
plain_options,
plain_verify_options,
nodelay_option,
@@ -164,6 +167,61 @@ payload() ->
payload(Config) when is_list(Config) ->
gen_dist_test(payload_test, Config).
+%%--------------------------------------------------------------------
+dist_port_overload() ->
+ [{doc, "Test that TLS distribution connections can be accepted concurrently"}].
+dist_port_overload(Config) when is_list(Config) ->
+ %% Start a node, and get the port number it's listening on.
+ #node_handle{nodename = NodeName} = NH1 = start_ssl_node(Config),
+ [Name, Host] = string:lexemes(atom_to_list(NodeName), "@"),
+ {ok, NodesPorts} = apply_on_ssl_node(NH1, fun net_adm:names/0),
+ {Name, Port} = lists:keyfind(Name, 1, NodesPorts),
+ %% Run 4 connections concurrently. When TLS handshake is not concurrent,
+ %% and with default net_setuptime of 7 seconds, only one connection per 7
+ %% seconds is closed from server side. With concurrent accept, all 7 will
+ %% be dropped in 7 seconds
+ RequiredConcurrency = 4,
+ Started = [connect(self(), Host, Port) || _ <- lists:seq(1, RequiredConcurrency)],
+ %% give 10 seconds (more than 7, less than 2x7 seconds)
+ Responded = barrier(RequiredConcurrency, [], erlang:system_time(millisecond) + 10000),
+ %% clean up
+ stop_ssl_node(NH1),
+ [R ! exit || R <- Responded],
+ [exit(P, kill) || P <- Started -- Responded],
+ %% Ensure some amount of concurrency was reached.
+ (length(Responded) >= RequiredConcurrency) orelse
+ ct:fail({actual, length(Responded), expected, RequiredConcurrency}),
+ success(Config).
+
+barrier(0, Responded, _Until) ->
+ Responded;
+barrier(RequiredConcurrency, Responded, Until) ->
+ Timeout = Until - erlang:system_time(millisecond),
+ receive
+ {waiting, Pid} ->
+ barrier(RequiredConcurrency - 1, [Pid | Responded], Until);
+ {error, Error} ->
+ ct:fail(Error)
+ after
+ Timeout -> Responded
+ end.
+
+connect(Control, Host, Port) ->
+ spawn(
+ fun () ->
+ case gen_tcp:connect(Host, Port, [{active, true}]) of
+ {ok, Sock} ->
+ receive
+ {tcp_closed, Sock} ->
+ Control ! {waiting, self()};
+ exit ->
+ gen_tcp:close(Sock)
+ end;
+ Error ->
+ Control ! {error, Error}
+ end
+ end).
+
%%--------------------------------------------------------------------
plain_options() ->
[{doc,"Test specifying tls options not related to certificate verification"}].
--
2.26.2