File 2224-erts-Add-option-IOs-to-disable-scheduler-polling.patch of Package erlang

From de2667593b7560ba809fbc90a1905508a93c18ef Mon Sep 17 00:00:00 2001
From: Sverker Eriksson <sverker@erlang.org>
Date: Fri, 4 Mar 2022 22:50:41 +0100
Subject: [PATCH 4/4] erts: Add option +IOs to disable scheduler polling

Enabled (true) by default.
---
 erts/doc/src/erl_cmd.xml                | 12 +++
 erts/emulator/beam/erl_port_task.c      | 30 ++++----
 erts/emulator/beam/erl_process.c        | 32 ++++----
 erts/emulator/sys/common/erl_check_io.c | 99 +++++++++++++++----------
 erts/emulator/sys/common/erl_check_io.h | 18 +++++
 erts/emulator/test/scheduler_SUITE.erl  | 51 +++++++++++++
 erts/etc/common/erlexec.c               | 10 +++
 7 files changed, 182 insertions(+), 70 deletions(-)

diff --git a/erts/doc/src/erl_cmd.xml b/erts/doc/src/erl_cmd.xml
index 6eeaf27eaf..98412fcf25 100644
--- a/erts/doc/src/erl_cmd.xml
+++ b/erts/doc/src/erl_cmd.xml
@@ -969,6 +969,18 @@ $ <input>erl \
           <c>+IOt</c> are used, <c>+IOPt</c> is ignored.
         </p>
       </item>
+      <tag><marker id="+IOs"/><c>+IOs true|false</c></tag>
+      <item>
+        <p>Enable or disable scheduler thread poll optimization. Default is
+          <c>true</c>.
+        </p>
+        <p>If enabled, file descriptors that are frequently read may be moved to
+          a special pollset used by scheduler threads. The objective is to
+          reduce the number of system calls and thereby CPU load, but it can
+          in some cases increase scheduling latency for individual file
+          descriptor input events.
+        </p>
+      </item>
       <tag><marker id="+JPperf"/><c>+JPperf true|false</c></tag>
       <item>
         <p>Enables or disables support for the `perf` profiler when running
diff --git a/erts/emulator/beam/erl_port_task.c b/erts/emulator/beam/erl_port_task.c
index 27ecee8d09..33b6cf1c43 100644
--- a/erts/emulator/beam/erl_port_task.c
+++ b/erts/emulator/beam/erl_port_task.c
@@ -1326,22 +1326,22 @@ erts_port_task_abort(ErtsPortTaskHandle *pthp)
 	    res = - 1; /* Task already aborted, executing, or executed */
 	else {
 	    reset_port_task_handle(pthp);
-
 #if ERTS_POLL_USE_SCHEDULER_POLLING
-            switch (ptp->type) {
-	    case ERTS_PORT_TASK_INPUT:
-	    case ERTS_PORT_TASK_OUTPUT:
-                if (ptp->u.alive.td.io.is_scheduler_event) {
-                    ASSERT(erts_atomic_read_nob(
-                               &erts_port_task_outstanding_io_tasks) > 0);
-                    erts_atomic_dec_relb(&erts_port_task_outstanding_io_tasks);
+            if (erts_sched_poll_enabled()) {
+                switch (ptp->type) {
+                case ERTS_PORT_TASK_INPUT:
+                case ERTS_PORT_TASK_OUTPUT:
+                    if (ptp->u.alive.td.io.is_scheduler_event) {
+                        ASSERT(erts_atomic_read_nob(
+                                   &erts_port_task_outstanding_io_tasks) > 0);
+                        erts_atomic_dec_relb(&erts_port_task_outstanding_io_tasks);
+                    }
+                    break;
+                default:
+                    break;
                 }
-		break;
-	    default:
-		break;
-	    }
+            }
 #endif
-
 	    res = 0;
 	}
     }
@@ -1844,16 +1844,14 @@ erts_port_task_execute(ErtsRunQueue *runq, Port **curr_port_pp)
     }
 
     ERTS_MSACC_POP_STATE_M();
-
 #if ERTS_POLL_USE_SCHEDULER_POLLING
-    if (io_tasks_executed) {
+    if (erts_sched_poll_enabled() && io_tasks_executed) {
         ASSERT(erts_atomic_read_nob(&erts_port_task_outstanding_io_tasks)
 	       >= io_tasks_executed);
         erts_atomic_add_relb(&erts_port_task_outstanding_io_tasks,
 				 -1*io_tasks_executed);
     }
 #endif
-
     ASSERT(runq == erts_get_runq_port(pp));
 
     active = finalize_exec(pp, &execq, processing_busy_q);
diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c
index c75f67c0d7..fac9ab704b 100644
--- a/erts/emulator/beam/erl_process.c
+++ b/erts/emulator/beam/erl_process.c
@@ -3194,11 +3194,11 @@ aux_thread(void *vix)
 
 #if ERTS_POLL_USE_FALLBACK
     if (ix == 0) {
-#if ERTS_POLL_USE_SCHEDULER_POLLING
-	ssi->psi = erts_create_pollset_thread(-2, tpd);
-#else
-	ssi->psi = erts_create_pollset_thread(-1, tpd);
-#endif
+        if (erts_sched_poll_enabled()) {
+            ssi->psi = erts_create_pollset_thread(-2, tpd);
+        } else {
+            ssi->psi = erts_create_pollset_thread(-1, tpd);
+        }
     }
 #endif
 
@@ -3365,11 +3365,13 @@ try_set_sys_scheduling(void)
 static ERTS_INLINE int
 prepare_for_sys_schedule(void)
 {
-    while (!erts_port_task_have_outstanding_io_tasks()
-           && try_set_sys_scheduling()) {
-        if (!erts_port_task_have_outstanding_io_tasks())
-            return 1;
-        clear_sys_scheduling();
+    if (erts_sched_poll_enabled()) {
+        while (!erts_port_task_have_outstanding_io_tasks()
+               && try_set_sys_scheduling()) {
+            if (!erts_port_task_have_outstanding_io_tasks())
+                return 1;
+            clear_sys_scheduling();
+        }
     }
     return 0;
 }
@@ -8620,9 +8622,9 @@ sched_thread_func(void *vesdp)
 
     erts_thr_progress_register_managed_thread(esdp, &callbacks, 0, 0);
 
-#if ERTS_POLL_USE_SCHEDULER_POLLING
-    esdp->ssi->psi = erts_create_pollset_thread(-1, NULL);
-#endif
+    if (erts_sched_poll_enabled()) {
+        esdp->ssi->psi = erts_create_pollset_thread(-1, NULL);
+    }
 
 #ifdef ERTS_ENABLE_LOCK_CHECK
     {
@@ -9522,7 +9524,9 @@ Process *erts_schedule(ErtsSchedulerData *esdp, Process *p, int calls)
 	esdp->virtual_reds = 0;
 
 #if ERTS_POLL_USE_SCHEDULER_POLLING
-        fcalls = (int) erts_atomic32_add_read_acqb(&function_calls, reds);
+        if (erts_sched_poll_enabled()) {
+            fcalls = (int)erts_atomic32_add_read_acqb(&function_calls, reds);
+        }
 #endif
 
 	ASSERT(esdp && esdp == erts_get_scheduler_data());
diff --git a/erts/emulator/sys/common/erl_check_io.c b/erts/emulator/sys/common/erl_check_io.c
index 28716b313e..1b12ed3293 100644
--- a/erts/emulator/sys/common/erl_check_io.c
+++ b/erts/emulator/sys/common/erl_check_io.c
@@ -146,7 +146,7 @@ static ErtsPollThread *psiv;
 static ErtsPollSet *flbk_pollset;
 #endif
 #if ERTS_POLL_USE_SCHEDULER_POLLING
-static ErtsPollSet *sched_pollset;
+ErtsPollSet *sched_pollset;
 #endif
 
 typedef struct {
@@ -418,7 +418,7 @@ static ERTS_INLINE ErtsPollSet *
 get_scheduler_pollset(ErtsSysFdType fd)
 {
 #if ERTS_POLL_USE_SCHEDULER_POLLING
-    return sched_pollset;
+    return sched_pollset ? sched_pollset : get_pollset(fd);
 #else
     return get_pollset(fd);
 #endif
@@ -506,7 +506,7 @@ erts_io_notify_port_task_executed(ErtsPortTaskType type,
                 && (state->events & ERTS_POLL_EV_IN)) {
 
                 active_events |= ERTS_POLL_EV_IN;
-                if (state->count > 10 && ERTS_POLL_USE_SCHEDULER_POLLING) {
+                if (state->count > 10 && erts_sched_poll_enabled()) {
                     if (!(state->flags & ERTS_EV_FLAG_SCHEDULER))
                         op = ERTS_POLL_OP_ADD;
                     state->flags |= ERTS_EV_FLAG_IN_SCHEDULER|ERTS_EV_FLAG_SCHEDULER;
@@ -514,7 +514,7 @@ erts_io_notify_port_task_executed(ErtsPortTaskType type,
                     DEBUG_PRINT_FD("moving to scheduler ps", state);
                 } else
                     new_events = active_events;
-                if (!(state->flags & ERTS_EV_FLAG_FALLBACK) && ERTS_POLL_USE_SCHEDULER_POLLING)
+                if (!(state->flags & ERTS_EV_FLAG_FALLBACK) && erts_sched_poll_enabled())
                     state->count++;
             }
             break;
@@ -1800,7 +1800,7 @@ erts_check_io(ErtsPollThread *psi, ErtsMonotonicTime timeout_time, int poll_only
                select/deselect in rapid succession. */
             revents &= state->active_events | ERTS_POLL_EV_NVAL;
 
-            if (psi->ps != get_scheduler_pollset(fd) || !ERTS_POLL_USE_SCHEDULER_POLLING) {
+            if (psi->ps != get_scheduler_pollset(fd) || !erts_sched_poll_enabled()) {
                 ErtsPollEvents reactive_events;
                 state->active_events &= ~revents;
 
@@ -2118,7 +2118,7 @@ get_arg(char* rest, char** argv, int* ip)
 }
 
 static void
-parse_args(int *argc, char **argv, int concurrent_waiters)
+parse_args(int *argc, char **argv, int concurrent_waiters, int* use_sched_poll)
 {
     int i = 0, j;
     int no_pollsets = 0, no_poll_threads = 0,
@@ -2160,8 +2160,16 @@ parse_args(int *argc, char **argv, int concurrent_waiters)
                         erts_fprintf(stderr,"bad I/O pollset percentage number: %s\n", arg);
                         erts_usage();
                     }
-                } else {
-                    break;
+                } else if (sys_strcmp(argv[i]+2, "Os") == 0) {
+                    const char *arg = get_arg(argv[i]+4, argv, &i);
+                    if (sys_strcmp(arg, "true") == 0) {
+                        *use_sched_poll = 1;
+                    } else if (sys_strcmp(arg, "false") == 0) {
+                        *use_sched_poll = 0;
+                    } else  {
+                        erts_fprintf(stderr,"bad +IOs boolean argument: %s\n", arg);
+                        erts_usage();
+                    }
                 }
                 break;
             }
@@ -2229,6 +2237,8 @@ void
 erts_init_check_io(int *argc, char **argv)
 {
     int j, concurrent_waiters, no_poll_threads;
+    int use_sched_poll = ERTS_POLL_USE_SCHEDULER_POLLING;
+
     ERTS_CT_ASSERT((INT_MIN & (ERL_NIF_SELECT_STOP_CALLED |
                                ERL_NIF_SELECT_STOP_SCHEDULED |
                                ERL_NIF_SELECT_INVALID_EVENT |
@@ -2240,7 +2250,7 @@ erts_init_check_io(int *argc, char **argv)
     erts_poll_init_flbk(NULL);
 #endif
 
-    parse_args(argc, argv, concurrent_waiters);
+    parse_args(argc, argv, concurrent_waiters, &use_sched_poll);
 
     /* Create the actual pollsets */
     pollsetv = erts_alloc(ERTS_ALC_T_POLLSET,sizeof(ErtsPollSet *) * erts_no_pollsets);
@@ -2252,10 +2262,16 @@ erts_init_check_io(int *argc, char **argv)
 
     j = -1;
 
+    if (use_sched_poll) {
 #if ERTS_POLL_USE_SCHEDULER_POLLING
-    sched_pollset = erts_poll_create_pollset(j--);
-    no_poll_threads++;
+        sched_pollset = erts_poll_create_pollset(j--);
+        ASSERT(erts_sched_poll_enabled());
+        no_poll_threads++;
+#else
+        erts_fprintf(stderr,"+IOs true: not supported by this emulator\n");
+        erts_usage();
 #endif
+    }
 
 #if ERTS_POLL_USE_FALLBACK
     flbk_pollset = erts_poll_create_pollset_flbk(j--);
@@ -2272,13 +2288,13 @@ erts_init_check_io(int *argc, char **argv)
     psiv++;
 #endif
 
-#if ERTS_POLL_USE_SCHEDULER_POLLING
-    psiv[0].pollres_len = ERTS_CHECK_IO_POLL_RES_LEN;
-    psiv[0].pollres = erts_alloc(ERTS_ALC_T_POLLSET,
-        sizeof(ErtsPollResFd) * ERTS_CHECK_IO_POLL_RES_LEN);
-    psiv[0].ps = get_scheduler_pollset(0);
-    psiv++;
-#endif
+    if (erts_sched_poll_enabled()) {
+        psiv[0].pollres_len = ERTS_CHECK_IO_POLL_RES_LEN;
+        psiv[0].pollres = erts_alloc(ERTS_ALC_T_POLLSET,
+                                     sizeof(ErtsPollResFd) * ERTS_CHECK_IO_POLL_RES_LEN);
+        psiv[0].ps = get_scheduler_pollset(0);
+        psiv++;
+    }
 
     for (j = 0; j < erts_no_poll_threads; j++) {
         psiv[j].pollres_len = ERTS_CHECK_IO_POLL_RES_LEN;
@@ -2336,12 +2352,12 @@ erts_check_io_size(void)
     erts_poll_info(get_fallback_pollset(), &pi);
     res += pi.memory_size;
 #endif
-
 #if ERTS_POLL_USE_SCHEDULER_POLLING
-    erts_poll_info(get_scheduler_pollset(0), &pi);
-    res += pi.memory_size;
+    if (erts_sched_poll_enabled()) {
+        erts_poll_info(sched_pollset, &pi);
+        res += pi.memory_size;
+    }
 #endif
-
     for (i = 0; i < erts_no_pollsets; i++) {
         erts_poll_info(pollsetv[i], &pi);
         res += pi.memory_size;
@@ -2370,9 +2386,9 @@ erts_check_io_info(void *proc)
     Uint sz, *szp, *hp, **hpp;
     ErtsPollInfo *piv;
     Sint i, j = 0, len;
-    int no_pollsets = erts_no_pollsets + ERTS_POLL_USE_FALLBACK + ERTS_POLL_USE_SCHEDULER_POLLING;
+    int no_pollsets = erts_no_pollsets + ERTS_POLL_USE_FALLBACK + erts_sched_poll_enabled();
     ERTS_CT_ASSERT(ERTS_POLL_USE_FALLBACK == 0 || ERTS_POLL_USE_FALLBACK == 1);
-    ERTS_CT_ASSERT(ERTS_POLL_USE_SCHEDULER_POLLING == 0 || ERTS_POLL_USE_SCHEDULER_POLLING == 1);
+    ERTS_ASSERT(erts_sched_poll_enabled() == 0 || erts_sched_poll_enabled() == 1);
 
     piv = erts_alloc(ERTS_ALC_T_TMP, sizeof(ErtsPollInfo) * no_pollsets);
 
@@ -2382,14 +2398,14 @@ erts_check_io_info(void *proc)
     piv[0].active_fds = 0;
     piv++;
 #endif
-
 #if ERTS_POLL_USE_SCHEDULER_POLLING
-    erts_poll_info(get_scheduler_pollset(0), &piv[0]);
-    piv[0].poll_threads = 0;
-    piv[0].active_fds = 0;
-    piv++;
+    if (erts_sched_poll_enabled()) {
+        erts_poll_info(sched_pollset, &piv[0]);
+        piv[0].poll_threads = 0;
+        piv[0].active_fds = 0;
+        piv++;
+    }
 #endif
-
     for (j = 0; j < erts_no_pollsets; j++) {
         erts_poll_info(pollsetv[j], &piv[j]);
         piv[j].active_fds = 0;
@@ -2438,7 +2454,7 @@ erts_check_io_info(void *proc)
     sz = 0;
 
     piv -= ERTS_POLL_USE_FALLBACK;
-    piv -= ERTS_POLL_USE_SCHEDULER_POLLING;
+    piv -= erts_sched_poll_enabled();
 
  bld_it:
 
@@ -2919,18 +2935,19 @@ erts_check_io_debug(ErtsCheckIoDebugInfo *ciodip)
     }
 #endif
 #if ERTS_POLL_USE_SCHEDULER_POLLING
-    erts_dsprintf(dsbufp, "--- fds in scheduler pollset ----------------------------\n");
-    erts_poll_get_selected_events(get_scheduler_pollset(0), counters.epep,
-                                  drv_ev_state.max_fds);
-    for (fd = 0; fd < len; fd++) {
-        if (drv_ev_state.v[fd].flags & ERTS_EV_FLAG_SCHEDULER) {
-            if (drv_ev_state.v[fd].events && drv_ev_state.v[fd].events != ERTS_POLL_EV_NONE)
-                counters.epep[fd] &= ~ERTS_POLL_EV_OUT;
-            doit_erts_check_io_debug(&drv_ev_state.v[fd], &counters, dsbufp);
+    if (erts_sched_poll_enabled()) {
+        erts_dsprintf(dsbufp, "--- fds in scheduler pollset ----------------------------\n");
+        erts_poll_get_selected_events(sched_pollset, counters.epep,
+                                      drv_ev_state.max_fds);
+        for (fd = 0; fd < len; fd++) {
+            if (drv_ev_state.v[fd].flags & ERTS_EV_FLAG_SCHEDULER) {
+                if (drv_ev_state.v[fd].events && drv_ev_state.v[fd].events != ERTS_POLL_EV_NONE)
+                    counters.epep[fd] &= ~ERTS_POLL_EV_OUT;
+                doit_erts_check_io_debug(&drv_ev_state.v[fd], &counters, dsbufp);
+            }
         }
     }
 #endif
-
     erts_dsprintf(dsbufp, "--- fds in pollset --------------------------------------\n");
 
     for (i = 0; i < erts_no_pollsets; i++) {
@@ -2942,6 +2959,7 @@ erts_check_io_debug(ErtsCheckIoDebugInfo *ciodip)
                 && get_pollset_id(fd) == i) {
                 if (counters.epep[fd] != ERTS_POLL_EV_NONE &&
                     drv_ev_state.v[fd].flags & ERTS_EV_FLAG_IN_SCHEDULER) {
+                    ERTS_ASSERT(erts_sched_poll_enabled());
                     /* We add the in flag if it is enabled in the scheduler pollset
                        and get_selected_events works on the platform */
                     counters.epep[fd] |= ERTS_POLL_EV_IN;
@@ -2950,6 +2968,7 @@ erts_check_io_debug(ErtsCheckIoDebugInfo *ciodip)
             }
         }
     }
+
     for (fd = len ; fd < drv_ev_state.max_fds; fd++) {
         null_des.fd = fd;
         doit_erts_check_io_debug(&null_des, &counters, dsbufp);
diff --git a/erts/emulator/sys/common/erl_check_io.h b/erts/emulator/sys/common/erl_check_io.h
index b96f4f9609..7af984fea9 100644
--- a/erts/emulator/sys/common/erl_check_io.h
+++ b/erts/emulator/sys/common/erl_check_io.h
@@ -112,6 +112,22 @@ typedef struct {
 } ErtsIoTask;
 
 
+ERTS_GLB_INLINE int erts_sched_poll_enabled(void);
+
+#if ERTS_GLB_INLINE_INCL_FUNC_DEF
+
+ERTS_GLB_INLINE int erts_sched_poll_enabled(void)
+{
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+    extern ErtsPollSet *sched_pollset;
+    return (sched_pollset != NULL);
+#else
+    return 0;
+#endif
+}
+
+#endif /* ERTS_GLB_INLINE_INCL_FUNC_DEF */
+
 #endif /*  ERL_CHECK_IO_H__ */
 
 #if !defined(ERL_CHECK_IO_C__) && !defined(ERTS_ALLOC_C__)
@@ -133,6 +149,7 @@ extern int erts_no_poll_threads;
 #include "erl_poll.h"
 #include "erl_port_task.h"
 
+
 typedef struct {
     Eterm inport;
     Eterm outport;
@@ -152,3 +169,4 @@ typedef struct {
 } ErtsNifSelectDataState;
 
 #endif /* #ifndef ERL_CHECK_IO_INTERNAL__ */
+
diff --git a/erts/emulator/test/scheduler_SUITE.erl b/erts/emulator/test/scheduler_SUITE.erl
index 817062c397..44000667a0 100644
--- a/erts/emulator/test/scheduler_SUITE.erl
+++ b/erts/emulator/test/scheduler_SUITE.erl
@@ -58,6 +58,7 @@
 	 scheduler_suspend_basic/1,
 	 scheduler_suspend/1,
 	 dirty_scheduler_threads/1,
+         sched_poll/1,
          poll_threads/1,
 	 reader_groups/1,
          otp_16446/1,
@@ -77,6 +78,7 @@ all() ->
      {group, scheduler_bind}, scheduler_threads,
      scheduler_suspend_basic, scheduler_suspend,
      dirty_scheduler_threads,
+     sched_poll,
      poll_threads,
      reader_groups,
      otp_16446,
@@ -1528,6 +1530,54 @@ sst5_loop(N) ->
     erlang:system_flag(multi_scheduling, unblock_normal),
     sst5_loop(N-1).
 
+%% Test scheduler polling: +IOs true|false
+sched_poll(Config) when is_list(Config) ->
+
+    Env = case os:getenv("ERL_AFLAGS") of
+              false ->
+                  [];
+              AFLAGS1 ->
+                  %% Remove any +IOs
+                  AFLAGS2 = list_to_binary(re:replace(AFLAGS1,
+                                                      "\\+IOs (true|false)",
+                                                      "", [global])),
+                  [{"ERL_AFLAGS", binary_to_list(AFLAGS2)}]
+          end,
+
+    [PS | _] = get_iostate(""),
+    HaveSchedPoll = proplists:get_value(concurrent_updates, PS),
+
+    0 = get_sched_pollsets(["+IOs", "false"]),
+    if
+        HaveSchedPoll ->
+            1 = get_sched_pollsets(["+IOs", "true"]),
+            1 = get_sched_pollsets([], Env);
+
+        not HaveSchedPoll ->
+            fail = get_sched_pollsets(["+IOs", "true"]),
+            0 = get_sched_pollsets([], Env)
+    end,
+    fail = get_sched_pollsets(["+IOs", "bad"]),
+    ok.
+
+get_sched_pollsets(Cmd) ->
+    get_sched_pollsets(Cmd, []).
+
+get_sched_pollsets(Cmd, Env)->
+    try
+        {ok, Peer, Node} = ?CT_PEER(#{connection => standard_io, args => Cmd,
+                                      env => [{"ERL_LIBS", false} | Env]}),
+        [IOStates] = mcall(Node,[fun () -> erlang:system_info(check_io) end]),
+        IO = [IOState || IOState <- IOStates,
+            %% We assume non-fallbacks without threads are scheduler pollsets
+            proplists:get_value(fallback, IOState) == false,
+            proplists:get_value(poll_threads, IOState) == 0],
+        length(IO) % number of scheduler pollsets
+    catch
+        exit:{boot_failed, _} ->
+            fail
+    end.
+
 poll_threads(Config) when is_list(Config) ->
     {Conc, PollType, KP} = get_ioconfig(Config),
     {Sched, SchedOnln, _} = get_sstate(Config, ""),
diff --git a/erts/etc/common/erlexec.c b/erts/etc/common/erlexec.c
index 8dc9363af6..78c4440a3a 100644
--- a/erts/etc/common/erlexec.c
+++ b/erts/etc/common/erlexec.c
@@ -875,6 +875,16 @@ int main(int argc, char **argv)
                           i++;
                           break;
                       }
+                      if (argv[i][2] == 'O' && argv[i][3] == 's') {
+                          if (argv[i][4] != '\0')
+                              goto the_default;
+                          NEXT_ARG_CHECK();
+                          argv[i][0] = '-';
+                          add_Eargs(argv[i]);
+                          add_Eargs(argv[i+1]);
+                          i++;
+                          break;
+                      }
                       usage(argv[i]);
                       break;
                   case 'J':
-- 
2.34.1

openSUSE Build Service is sponsored by