File 5903-Implement-option-for-inet_drv-read_ahead.patch of Package erlang
From 7cb3eb7b74738071a86b1ab7f96ad29a932c7344 Mon Sep 17 00:00:00 2001
From: Raimo Niskanen <raimo@erlang.org>
Date: Thu, 25 Jul 2024 13:45:28 +0200
Subject: [PATCH 3/7] Implement option for inet_drv read_ahead
---
erts/emulator/drivers/common/inet_drv.c | 150 +++++++++++++++++-------
erts/preloaded/ebin/prim_inet.beam | Bin 102496 -> 102576 bytes
erts/preloaded/src/prim_inet.erl | 3 +
lib/kernel/src/inet.erl | 4 +-
lib/kernel/src/inet_int.hrl | 3 +-
5 files changed, 116 insertions(+), 44 deletions(-)
diff --git a/erts/emulator/drivers/common/inet_drv.c b/erts/emulator/drivers/common/inet_drv.c
index 9d772dc193..6776ab2f1e 100644
--- a/erts/emulator/drivers/common/inet_drv.c
+++ b/erts/emulator/drivers/common/inet_drv.c
@@ -839,6 +839,7 @@ static size_t my_strnlen(const char *s, size_t maxlen)
#define TCP_ADDF_SHUTDOWN_WR_DONE 256 /* A shutdown(sock, SHUT_WR) or SHUT_RDWR was made */
#define TCP_ADDF_LINGER_ZERO 512 /* Discard driver queue on port close */
#define TCP_ADDF_SENDFILE 1024 /* Send from an fd instead of the driver queue */
+#define TCP_ADDF_NO_READ_AHEAD 2048 /* Don't read ahead in packet modes */
/* *_REQ_* replies */
#define INET_REP_ERROR 0
@@ -896,6 +897,7 @@ static size_t my_strnlen(const char *s, size_t maxlen)
#define INET_OPT_TTL 46 /* IP_TTL */
#define INET_OPT_RECVTTL 47 /* IP_RECVTTL ancillary data */
#define TCP_OPT_NOPUSH 48 /* super-Nagle, aka TCP_CORK */
+#define INET_LOPT_TCP_READ_AHEAD 49 /* Read ahead of packet data */
#define INET_LOPT_DEBUG 99 /* Enable/disable DEBUG for a socket */
/* SCTP options: a separate range, from 100: */
@@ -7082,6 +7084,21 @@ static int inet_set_opts(inet_descriptor* desc, char* ptr, int len)
}
continue;
+ case INET_LOPT_TCP_READ_AHEAD:
+ DDBG(desc,
+ ("INET-DRV-DBG[%d][" SOCKET_FSTR ",%T] "
+ "inet_set_opts(read_ahead) -> %d (%s)\r\n",
+ __LINE__,
+ desc->s, driver_caller(desc->port), ival, B2S(ival)) );
+ if (desc->sprotocol == IPPROTO_TCP) {
+ tcp_descriptor* tdesc = (tcp_descriptor*) desc;
+ if (! ival)
+ tdesc->tcp_add_flags |= TCP_ADDF_NO_READ_AHEAD;
+ else
+ tdesc->tcp_add_flags &= ~TCP_ADDF_NO_READ_AHEAD;
+ }
+ continue;
+
case INET_LOPT_LINE_DELIM:
DDBG(desc,
("INET-DRV-DBG[%d][" SOCKET_FSTR ",%T] "
@@ -9043,6 +9060,17 @@ static ErlDrvSSizeT inet_fill_opts(inet_descriptor* desc,
}
continue;
+ case INET_LOPT_TCP_READ_AHEAD:
+ if (desc->sprotocol == IPPROTO_TCP) {
+ tcp_descriptor* tdesc = (tcp_descriptor*) desc;
+ *ptr++ = opt;
+ ival = !(tdesc->tcp_add_flags & TCP_ADDF_NO_READ_AHEAD);
+ put_int32(ival, ptr);
+ } else {
+ TRUNCATE_TO(0,ptr);
+ }
+ continue;
+
case INET_OPT_PRIORITY:
#ifdef SO_PRIORITY
type = SO_PRIORITY;
@@ -11275,8 +11303,9 @@ static void inet_input_count(inet_descriptor* desc, ErlDrvSizeT len)
/*
** Set new size on buffer, used when packet size is determined
-** and the buffer is to small.
-** buffer must have a size of at least len bytes (counting from ptr_start!)
+** and the buffer is to small. Expand the buffer to len bytes
+** from ptr_start, don't move ptr_start's position and
+** keep the content before ptr_start.
*/
static int tcp_expand_buffer(tcp_descriptor* desc, int len)
{
@@ -11465,7 +11494,8 @@ static tcp_descriptor* tcp_inet_copy(tcp_descriptor* desc,SOCKET s,
copy_desc->send_timeout_close = desc->send_timeout_close;
copy_desc->tcp_add_flags = desc->tcp_add_flags
- & (TCP_ADDF_SHOW_ECONNRESET | TCP_ADDF_LINGER_ZERO);
+ & (TCP_ADDF_SHOW_ECONNRESET | TCP_ADDF_LINGER_ZERO |
+ TCP_ADDF_NO_READ_AHEAD);
/* The new port will be linked and connected to the original caller */
port = driver_create_port(port, owner, "tcp_inet", (ErlDrvData) copy_desc);
@@ -12435,22 +12465,61 @@ static int tcp_recv_error(tcp_descriptor* desc, int err)
}
-
+/* To be called after packet_get_length() has returned that
+ * more bytes are needed (or when we know that it would).
+ */
static int packet_header_length(tcp_descriptor *desc) {
- /* XXX RaNi. Here we shall decide whether to read as short as possible,
- * or to fill the buffer as much as possible.
- *
- * Return 0 when we shall read ahead, and > 0 for the amount of byte
- * we need to deduce the packet length.
- */
+ int n, hlen;
+
+ if (! (desc->tcp_add_flags & TCP_ADDF_NO_READ_AHEAD))
+ return 0; /* Read ahead */
switch (desc->inet.htype) {
- case TCP_PB_SSL_TLS:
- /* Return the minimal length to not read ahead */
- return 5;
+ case TCP_PB_RAW:
+ return 0;
+
+ /* Return how many more bytes we should read to make
+ * packet_get_length() return the packet length.
+ *
+ * Set hlen to the minimal header bytes, for starters.
+ */
+ case TCP_PB_1: hlen = 1; break;
+ case TCP_PB_2: hlen = 2; break;
+ case TCP_PB_4: hlen = 4; break;
+ case TCP_PB_RM: hlen = 4; break;
+ case TCP_PB_ASN1: hlen = 2; break;
+ case TCP_PB_SSL_TLS: hlen = 5; break;
+ case TCP_PB_CDR: hlen = 12; break;
+ case TCP_PB_FCGI: hlen = sizeof(struct fcgi_head); break;
+ case TCP_PB_TPKT: hlen = 4; break;
default:
- return 0; /* Read ahead as much as is comfortable */
+ /* We should always be able to read another byte
+ * to see if we then can deduce the packet length.
+ * Note that for line mode packet formats,
+ * not a length in a header, this is very inefficient,
+ * but there is no other way to not read ahead.
+ * For TCP_PB_ASN1 it is also inefficient, but we
+ * would have to re-implement quite some decoding rules
+ * here to figure out a better value that probably isn't
+ * that much better since some field have to be read one byte
+ * at the time to find the end of the field.
+ *
+ * Just don't combine TCP_ADDF_NO_READ_AHEAD
+ * with non-suitable packet types.
+ */
+ return 1;
}
+ n = desc->i_ptr - desc->i_ptr_start;
+ ASSERT(n >= 0);
+
+ if (hlen > n)
+ return hlen - n;
+ else
+ /* Since packet_get_length() couldn't return a length
+ * and since the minimal header size above apprently isn't enough
+ * we need at least another byte
+ */
+ return 1;
}
@@ -12458,15 +12527,17 @@ static int packet_header_length(tcp_descriptor *desc) {
** Calculate number of bytes that remain to read before deliver
** Assume buf, ptr_start, ptr has been setup
**
-** return > 0 if more to read
-** = 0 if holding complete packet
-** < 0 on error
-**
-** if return value == 0 then *len will hold the length of the first packet
-** return value > 0 then
-** if *len == 0 then value means what to read next: upperbound or minimum
-** *len > 0 then value means exactly what is missing for a packet
-**
+** return == 0 if we have a complete packet. *len holds the packet length.
+** > 0 is how many bytes to read.
+** *len is the packet's length.
+** == 0 if we don't know, then the return value is either
+** the minimum bytes to read to figure out the packet's
+** length or the maximum bytes to read which avoids
+** numerous calls to sock_recv(), depending on
+** TCP_ADDF_NO_READ_AHEAD.
+** > 0 Implies that the return value is
+** exactly what's missing.
+** < 0 if there is a decode error
*/
static int tcp_remain(tcp_descriptor* desc, int* len)
{
@@ -12484,17 +12555,18 @@ static int tcp_remain(tcp_descriptor* desc, int* len)
desc->inet.port, desc->inet.s, n, nfill, nsz, tlen));
if (tlen > 0) {
+ *len = tlen;
if (tlen <= n) { /* got a packet */
- *len = tlen;
DEBUGF((" => nothing remain packet=%d\r\n", tlen));
return 0;
}
else { /* need known more */
+ int nread;
if (tcp_expand_buffer(desc, tlen) < 0)
return -1;
- *len = tlen - n;
- DEBUGF((" => remain=%d\r\n", *len));
- return *len;
+ nread = tlen - n;
+ DEBUGF((" => remain=%d\r\n", nread));
+ return nread;
}
}
else if (tlen == 0) { /* need unknown more */
@@ -12514,17 +12586,14 @@ static int tcp_remain(tcp_descriptor* desc, int* len)
*/
if (tcp_expand_buffer(desc, desc->inet.psize) < 0)
return -1;
- return desc->inet.psize;
+ return desc->inet.psize - n;;
}
else
goto error;
}
DEBUGF((" => restart more=%d\r\n", nfill - n));
- /* Return the unused buffer space before desc->i_ptr_start,
- * XXX RaNi. Is there a missing copy down here?
- * Won't the sock_recv that follows overwrite
- * the buffer end...???
- */
+ tcp_clear_input(desc); /* Move the data to buffer start */
+ /* Return the unused buffer space before desc->i_ptr_start */
return nfill - n;
}
else {
@@ -12556,8 +12625,8 @@ static int tcp_deliver(tcp_descriptor* desc, int len)
if (n < 0) /* packet error */
return n;
/* Packet incomplete */
- if (len > 0) /* This is what is missing */
- desc->i_remain = len;
+ if (len > 0)
+ desc->i_remain = n; /* This is what is missing */
return 0;
}
}
@@ -12621,7 +12690,7 @@ static int tcp_deliver(tcp_descriptor* desc, int len)
return n;
tcp_restart_input(desc);
if (len > 0)
- desc->i_remain = len;
+ desc->i_remain = n;
len = 0;
}
}
@@ -12663,15 +12732,14 @@ static int tcp_recv(tcp_descriptor* desc, int request_len)
else
desc->i_remain = 0;
}
- else if (request_len > 0) { /* we have a data in buffer and a request */
+ else if (request_len > 0) { /* we have data in buffer and a request */
int n = desc->i_ptr - desc->i_ptr_start;
if (n >= request_len)
return tcp_deliver(desc, request_len);
else if (tcp_expand_buffer(desc, request_len) < 0)
return tcp_recv_error(desc, ENOMEM);
- else
- desc->i_remain = nread = request_len - n;
+ desc->i_remain = nread = request_len - n;
}
else if (desc->i_remain == 0) { /* poll remain from buffer data */
int len;
@@ -12681,7 +12749,7 @@ static int tcp_recv(tcp_descriptor* desc, int request_len)
else if (nread == 0)
return tcp_deliver(desc, len);
else if (len > 0)
- desc->i_remain = len; /* set remain */
+ desc->i_remain = nread; /* set remain */
}
else /* remain already set use it */
nread = desc->i_remain;
@@ -12739,7 +12807,7 @@ static int tcp_recv(tcp_descriptor* desc, int request_len)
else if (nread == 0)
return tcp_deliver(desc, len);
else if (len > 0) /* nread > 0 */
- desc->i_remain = len; /* What is missing for this packet */
+ desc->i_remain = nread; /* What is missing for this packet */
}
} /* for (;;) */
}
diff --git a/erts/preloaded/src/prim_inet.erl b/erts/preloaded/src/prim_inet.erl
index b12a29bf83..bdc4e97ad3 100644
--- a/erts/preloaded/src/prim_inet.erl
+++ b/erts/preloaded/src/prim_inet.erl
@@ -1576,6 +1576,7 @@ enc_opt(show_econnreset) -> ?INET_LOPT_TCP_SHOW_ECONNRESET;
enc_opt(line_delimiter) -> ?INET_LOPT_LINE_DELIM;
enc_opt(raw) -> ?INET_OPT_RAW;
enc_opt(bind_to_device) -> ?INET_OPT_BIND_TO_DEVICE;
+enc_opt(read_ahead) -> ?INET_LOPT_TCP_READ_AHEAD;
enc_opt(debug) -> ?INET_OPT_DEBUG;
% Names of SCTP opts:
enc_opt(sctp_rtoinfo) -> ?SCTP_OPT_RTOINFO;
@@ -1647,6 +1648,7 @@ dec_opt(?INET_LOPT_TCP_SHOW_ECONNRESET) -> show_econnreset;
dec_opt(?INET_LOPT_LINE_DELIM) -> line_delimiter;
dec_opt(?INET_OPT_RAW) -> raw;
dec_opt(?INET_OPT_BIND_TO_DEVICE) -> bind_to_device;
+dec_opt(?INET_LOPT_TCP_READ_AHEAD) -> read_ahead;
dec_opt(?INET_OPT_DEBUG) -> debug;
dec_opt(I) when is_integer(I) -> undefined.
@@ -1759,6 +1761,7 @@ type_opt_1(read_packets) -> uint;
type_opt_1(netns) -> binary;
type_opt_1(show_econnreset) -> bool;
type_opt_1(bind_to_device) -> binary;
+type_opt_1(read_ahead) -> bool;
type_opt_1(debug) -> bool;
%%
%% SCTP options (to be set). If the type is a record type, the corresponding
diff --git a/lib/kernel/src/inet.erl b/lib/kernel/src/inet.erl
index f6efbea077..0008a13048 100644
--- a/lib/kernel/src/inet.erl
+++ b/lib/kernel/src/inet.erl
@@ -999,7 +999,7 @@ connect_options() ->
header, active, packet, packet_size, buffer, mode, deliver, line_delimiter,
exit_on_close, high_watermark, low_watermark, high_msgq_watermark,
low_msgq_watermark, send_timeout, send_timeout_close, delay_send, raw,
- show_econnreset, bind_to_device].
+ show_econnreset, bind_to_device, read_ahead].
connect_options(Opts, Mod) ->
BaseOpts =
@@ -1089,7 +1089,7 @@ listen_options() ->
header, active, packet, buffer, mode, deliver, backlog, ipv6_v6only,
exit_on_close, high_watermark, low_watermark, high_msgq_watermark,
low_msgq_watermark, send_timeout, send_timeout_close, delay_send,
- packet_size, raw, show_econnreset, bind_to_device].
+ packet_size, raw, show_econnreset, bind_to_device, read_ahead].
listen_options(Opts, Mod) ->
BaseOpts =
diff --git a/lib/kernel/src/inet_int.hrl b/lib/kernel/src/inet_int.hrl
index 2f50f2c23c..76b3f3ce02 100644
--- a/lib/kernel/src/inet_int.hrl
+++ b/lib/kernel/src/inet_int.hrl
@@ -1,7 +1,7 @@
%%
%% %CopyrightBegin%
%%
-%% Copyright Ericsson AB 1997-2022. All Rights Reserved.
+%% Copyright Ericsson AB 1997-2024. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
@@ -167,6 +167,7 @@
-define(INET_OPT_TTL, 46).
-define(INET_OPT_RECVTTL, 47).
-define(TCP_OPT_NOPUSH, 48).
+-define(INET_LOPT_TCP_READ_AHEAD, 49).
-define(INET_OPT_DEBUG, 99).
% Specific SCTP options: separate range:
-define(SCTP_OPT_RTOINFO, 100).
--
2.43.0