File 1129-AArch64-Optimize-init_yregs-using-128-bit-vector-reg.patch of Package erlang

From 01bc36d4048dcda228e8fa0ba05a170ed50d70b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Wed, 6 Sep 2023 16:32:13 +0200
Subject: [PATCH 09/25] AArch64: Optimize init_yregs using 128-bit vector
 registers

---
 erts/emulator/beam/jit/arm/instr_common.cpp   | 50 +++++++++-
 erts/emulator/test/beam_SUITE.erl             | 36 +++++++-
 .../test/beam_SUITE_data/beam_init_yregs.S    | 91 +++++++++++++++++++
 3 files changed, 169 insertions(+), 8 deletions(-)
 create mode 100644 erts/emulator/test/beam_SUITE_data/beam_init_yregs.S

diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp
index 7abaff09ee..117c17a098 100644
--- a/erts/emulator/beam/jit/arm/instr_common.cpp
+++ b/erts/emulator/beam/jit/arm/instr_common.cpp
@@ -418,10 +418,9 @@ void BeamModuleAssembler::emit_init_yregs(const ArgWord &Size,
                                           const Span<ArgVal> &args) {
     unsigned count = Size.get();
     ASSERT(count == args.size());
-
     unsigned i = 0;
-
-    mov_imm(TMP1, NIL);
+    bool x_initialized = false;
+    bool q_initialized = false;
 
     while (i < count) {
         unsigned first_y = args[i].as<ArgYRegister>().get();
@@ -440,7 +439,42 @@ void BeamModuleAssembler::emit_init_yregs(const ArgWord &Size,
 
         /* Now first_y is the number of the first y register to be initialized
          * and slots is the number of y registers to be initialized. */
+
+        while (slots >= 4 && first_y % 2 == 0 &&
+               first_y <= 2 * MAX_LDP_STP_DISPLACEMENT) {
+            /* `stp` (with vector registers) can only address the
+             *  first 128 Y registers. */
+            if (!q_initialized) {
+                ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == 0x0f);
+                a.movi(a64::v0.d2(), imm(-1));
+                q_initialized = true;
+            }
+            a.stp(a64::q0, a64::q0, getYRef(first_y));
+            first_y += 4;
+            slots -= 4;
+        }
+
+        while (slots >= 2 && q_initialized &&
+               (first_y % 2 == 0 ||
+                first_y * sizeof(Eterm) <= MAX_LDUR_STUR_DISPLACEMENT)) {
+            /* Note that the STR instruction for a vector register
+             * requires the offset to be 16-byte aligned. If it is
+             * not, the STUR instruction must be used. AsmJit
+             * automatically turns STR into STUR when necessary. */
+            a.str(a64::v0, getYRef(first_y));
+            first_y += 2;
+            slots -= 2;
+        }
+
         while (slots >= 2) {
+            /* Either the vector register is not initialized, or first_y
+             * is either not 16-byte aligned or it is out of reach for the
+             * STUR instruction. */
+            if (!x_initialized) {
+                mov_imm(TMP1, NIL);
+                x_initialized = true;
+            }
+
             /* `stp` can only address the first 64 Y registers. */
             if (first_y <= MAX_LDP_STP_DISPLACEMENT) {
                 a.stp(TMP1, TMP1, getYRef(first_y));
@@ -454,7 +488,15 @@ void BeamModuleAssembler::emit_init_yregs(const ArgWord &Size,
         }
 
         if (slots == 1) {
-            a.str(TMP1, getYRef(first_y));
+            if (q_initialized) {
+                a.str(a64::d0, getYRef(first_y));
+            } else {
+                if (!x_initialized) {
+                    mov_imm(TMP1, NIL);
+                    x_initialized = true;
+                }
+                a.str(TMP1, getYRef(first_y));
+            }
         }
     }
 }
diff --git a/erts/emulator/test/beam_SUITE.erl b/erts/emulator/test/beam_SUITE.erl
index 8a9e70e4d4..c38aa9a1ab 100644
--- a/erts/emulator/test/beam_SUITE.erl
+++ b/erts/emulator/test/beam_SUITE.erl
@@ -23,8 +23,9 @@
 -export([all/0, suite/0,groups/0,init_per_suite/1, end_per_suite/1, 
 	 init_per_group/2,end_per_group/2, 
 	 packed_registers/1, apply_last/1, apply_last_bif/1,
-         heap_sizes/1, big_lists/1, fconv/1,
-	 select_val/1, swap_temp_apply/1]).
+	 heap_sizes/1, big_lists/1, fconv/1,
+         select_val/1,
+         swap_temp_apply/1, beam_init_yregs/1]).
 
 -export([applied/2,swap_temp_applied/1]).
 
@@ -36,7 +37,8 @@ suite() -> [{ct_hooks,[ts_install_cth]}].
 all() -> 
     [packed_registers, apply_last, apply_last_bif,
      heap_sizes, big_lists, select_val,
-     swap_temp_apply].
+     swap_temp_apply,
+     beam_init_yregs].
 
 groups() -> 
     [].
@@ -148,8 +150,10 @@ packed_registers(Config) when is_list(Config) ->
             io:put_chars(Dis)
     end,
 
+    %% Executing the generated code in freshly spawned process makes it much
+    %% more likely to crash if there is a bug in init_yregs.
     CombinedSeq = Seq ++ Seq ++ Seq,
-    CombinedSeq = Mod:f(),
+    CombinedSeq = spawn_exec(fun Mod:f/0),
 
     %% Clean up.
     true = code:delete(Mod),
@@ -430,3 +434,27 @@ swap_temp_apply_function(_) ->
 
 swap_temp_applied(Int) ->
     Int+1.
+
+beam_init_yregs(Config) ->
+    DataDir = proplists:get_value(data_dir, Config),
+    Mod = ?FUNCTION_NAME,
+    File = filename:join(DataDir, Mod),
+    {ok,Mod,Code} = compile:file(File, [from_asm,no_postopt,binary]),
+    {module,Mod} = code:load_binary(Mod, Mod, Code),
+
+    _ = [ok = spawn_exec(fun Mod:Mod/0) || _ <- lists:seq(1, 10)],
+
+    %% Clean up.
+    true = code:delete(Mod),
+    false = code:purge(Mod),
+
+    ok.
+
+%%% Common utilities.
+spawn_exec(F) ->
+    {Pid,Ref} = spawn_monitor(fun() ->
+                                      exit(F())
+                              end),
+    receive
+        {'DOWN',Ref,process,Pid,Result} -> Result
+    end.
diff --git a/erts/emulator/test/beam_SUITE_data/beam_init_yregs.S b/erts/emulator/test/beam_SUITE_data/beam_init_yregs.S
new file mode 100644
index 0000000000..15f6661a38
--- /dev/null
+++ b/erts/emulator/test/beam_SUITE_data/beam_init_yregs.S
@@ -0,0 +1,91 @@
+{module, beam_init_yregs}.  %% version = 0
+
+{exports, [{beam_init_yregs,0},{module_info,0},{module_info,1}]}.
+
+{attributes, []}.
+
+{labels, 7}.
+
+
+{function, beam_init_yregs, 0, 2}.
+  {label,1}.
+    {line,[{location,"beam_init_yregs.erl",4}]}.
+    {func_info,{atom,beam_init_yregs},{atom,beam_init_yregs},0}.
+  {label,2}.
+    {allocate,53,0}.
+    {init_yregs,{list,[{y,0},
+                       {y,1},
+                       {y,2},
+                       {y,3},
+                       {y,4},
+                       {y,5},
+                       {y,6},
+                       {y,7},
+                       {y,8},
+                       {y,9},
+                       {y,10},
+                       {y,11},
+                       {y,12},
+                       {y,13},
+                       {y,14},
+                       {y,15},
+                       {y,16},
+                       {y,17},
+                       {y,18},
+                       {y,19},
+                       {y,20},
+                       {y,21},
+                       {y,22},
+                       {y,25},
+                       {y,26},
+                       {y,27},
+                       {y,28},
+                       {y,29},
+                       {y,30},
+                       {y,31},
+                       {y,32},
+                       {y,33},
+                       {y,35},
+                       {y,36},
+                       {y,37},
+                       {y,38},
+                       {y,40},
+                       {y,41},
+                       {y,42},
+                       {y,43},
+                       {y,44},
+                       {y,45},
+                       {y,46},
+                       {y,47},
+                       {y,48},
+                       {y,49},
+                       {y,50},
+                       {y,51},
+                       {y,52}]}}.
+    {move,{atom,ok},{y,23}}.
+    {move,{atom,ok},{y,24}}.
+    {move,{atom,ok},{y,34}}.
+    {move,{atom,ok},{y,39}}.
+    {call_ext,0,{extfunc,erlang,garbage_collect,0}}.
+    {move,{atom,ok},{x,0}}.
+    {deallocate,53}.
+    return.
+
+
+{function, module_info, 0, 4}.
+  {label,3}.
+    {line,[]}.
+    {func_info,{atom,beam_init_yregs},{atom,module_info},0}.
+  {label,4}.
+    {move,{atom,beam_init_yregs},{x,0}}.
+    {call_ext_only,1,{extfunc,erlang,get_module_info,1}}.
+
+
+{function, module_info, 1, 6}.
+  {label,5}.
+    {line,[]}.
+    {func_info,{atom,beam_init_yregs},{atom,module_info},1}.
+  {label,6}.
+    {move,{x,0},{x,1}}.
+    {move,{atom,beam_init_yregs},{x,0}}.
+    {call_ext_only,2,{extfunc,erlang,get_module_info,2}}.
-- 
2.35.3

openSUSE Build Service is sponsored by