File 1129-AArch64-Optimize-init_yregs-using-128-bit-vector-reg.patch of Package erlang
From 01bc36d4048dcda228e8fa0ba05a170ed50d70b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Wed, 6 Sep 2023 16:32:13 +0200
Subject: [PATCH 09/25] AArch64: Optimize init_yregs using 128-bit vector
registers
---
erts/emulator/beam/jit/arm/instr_common.cpp | 50 +++++++++-
erts/emulator/test/beam_SUITE.erl | 36 +++++++-
.../test/beam_SUITE_data/beam_init_yregs.S | 91 +++++++++++++++++++
3 files changed, 169 insertions(+), 8 deletions(-)
create mode 100644 erts/emulator/test/beam_SUITE_data/beam_init_yregs.S
diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp
index 7abaff09ee..117c17a098 100644
--- a/erts/emulator/beam/jit/arm/instr_common.cpp
+++ b/erts/emulator/beam/jit/arm/instr_common.cpp
@@ -418,10 +418,9 @@ void BeamModuleAssembler::emit_init_yregs(const ArgWord &Size,
const Span<ArgVal> &args) {
unsigned count = Size.get();
ASSERT(count == args.size());
-
unsigned i = 0;
-
- mov_imm(TMP1, NIL);
+ bool x_initialized = false;
+ bool q_initialized = false;
while (i < count) {
unsigned first_y = args[i].as<ArgYRegister>().get();
@@ -440,7 +439,42 @@ void BeamModuleAssembler::emit_init_yregs(const ArgWord &Size,
/* Now first_y is the number of the first y register to be initialized
* and slots is the number of y registers to be initialized. */
+
+ while (slots >= 4 && first_y % 2 == 0 &&
+ first_y <= 2 * MAX_LDP_STP_DISPLACEMENT) {
+ /* `stp` (with vector registers) can only address the
+ * first 128 Y registers. */
+ if (!q_initialized) {
+ ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == 0x0f);
+ a.movi(a64::v0.d2(), imm(-1));
+ q_initialized = true;
+ }
+ a.stp(a64::q0, a64::q0, getYRef(first_y));
+ first_y += 4;
+ slots -= 4;
+ }
+
+ while (slots >= 2 && q_initialized &&
+ (first_y % 2 == 0 ||
+ first_y * sizeof(Eterm) <= MAX_LDUR_STUR_DISPLACEMENT)) {
+ /* Note that the STR instruction for a vector register
+ * requires the offset to be 16-byte aligned. If it is
+ * not, the STUR instruction must be used. AsmJit
+ * automatically turns STR into STUR when necessary. */
+ a.str(a64::v0, getYRef(first_y));
+ first_y += 2;
+ slots -= 2;
+ }
+
while (slots >= 2) {
+ /* Either the vector register is not initialized, or first_y
+ * is either not 16-byte aligned or it is out of reach for the
+ * STUR instruction. */
+ if (!x_initialized) {
+ mov_imm(TMP1, NIL);
+ x_initialized = true;
+ }
+
/* `stp` can only address the first 64 Y registers. */
if (first_y <= MAX_LDP_STP_DISPLACEMENT) {
a.stp(TMP1, TMP1, getYRef(first_y));
@@ -454,7 +488,15 @@ void BeamModuleAssembler::emit_init_yregs(const ArgWord &Size,
}
if (slots == 1) {
- a.str(TMP1, getYRef(first_y));
+ if (q_initialized) {
+ a.str(a64::d0, getYRef(first_y));
+ } else {
+ if (!x_initialized) {
+ mov_imm(TMP1, NIL);
+ x_initialized = true;
+ }
+ a.str(TMP1, getYRef(first_y));
+ }
}
}
}
diff --git a/erts/emulator/test/beam_SUITE.erl b/erts/emulator/test/beam_SUITE.erl
index 8a9e70e4d4..c38aa9a1ab 100644
--- a/erts/emulator/test/beam_SUITE.erl
+++ b/erts/emulator/test/beam_SUITE.erl
@@ -23,8 +23,9 @@
-export([all/0, suite/0,groups/0,init_per_suite/1, end_per_suite/1,
init_per_group/2,end_per_group/2,
packed_registers/1, apply_last/1, apply_last_bif/1,
- heap_sizes/1, big_lists/1, fconv/1,
- select_val/1, swap_temp_apply/1]).
+ heap_sizes/1, big_lists/1, fconv/1,
+ select_val/1,
+ swap_temp_apply/1, beam_init_yregs/1]).
-export([applied/2,swap_temp_applied/1]).
@@ -36,7 +37,8 @@ suite() -> [{ct_hooks,[ts_install_cth]}].
all() ->
[packed_registers, apply_last, apply_last_bif,
heap_sizes, big_lists, select_val,
- swap_temp_apply].
+ swap_temp_apply,
+ beam_init_yregs].
groups() ->
[].
@@ -148,8 +150,10 @@ packed_registers(Config) when is_list(Config) ->
io:put_chars(Dis)
end,
+ %% Executing the generated code in freshly spawned process makes it much
+ %% more likely to crash if there is a bug in init_yregs.
CombinedSeq = Seq ++ Seq ++ Seq,
- CombinedSeq = Mod:f(),
+ CombinedSeq = spawn_exec(fun Mod:f/0),
%% Clean up.
true = code:delete(Mod),
@@ -430,3 +434,27 @@ swap_temp_apply_function(_) ->
swap_temp_applied(Int) ->
Int+1.
+
+beam_init_yregs(Config) ->
+ DataDir = proplists:get_value(data_dir, Config),
+ Mod = ?FUNCTION_NAME,
+ File = filename:join(DataDir, Mod),
+ {ok,Mod,Code} = compile:file(File, [from_asm,no_postopt,binary]),
+ {module,Mod} = code:load_binary(Mod, Mod, Code),
+
+ _ = [ok = spawn_exec(fun Mod:Mod/0) || _ <- lists:seq(1, 10)],
+
+ %% Clean up.
+ true = code:delete(Mod),
+ false = code:purge(Mod),
+
+ ok.
+
+%%% Common utilities.
+spawn_exec(F) ->
+ {Pid,Ref} = spawn_monitor(fun() ->
+ exit(F())
+ end),
+ receive
+ {'DOWN',Ref,process,Pid,Result} -> Result
+ end.
diff --git a/erts/emulator/test/beam_SUITE_data/beam_init_yregs.S b/erts/emulator/test/beam_SUITE_data/beam_init_yregs.S
new file mode 100644
index 0000000000..15f6661a38
--- /dev/null
+++ b/erts/emulator/test/beam_SUITE_data/beam_init_yregs.S
@@ -0,0 +1,91 @@
+{module, beam_init_yregs}. %% version = 0
+
+{exports, [{beam_init_yregs,0},{module_info,0},{module_info,1}]}.
+
+{attributes, []}.
+
+{labels, 7}.
+
+
+{function, beam_init_yregs, 0, 2}.
+ {label,1}.
+ {line,[{location,"beam_init_yregs.erl",4}]}.
+ {func_info,{atom,beam_init_yregs},{atom,beam_init_yregs},0}.
+ {label,2}.
+ {allocate,53,0}.
+ {init_yregs,{list,[{y,0},
+ {y,1},
+ {y,2},
+ {y,3},
+ {y,4},
+ {y,5},
+ {y,6},
+ {y,7},
+ {y,8},
+ {y,9},
+ {y,10},
+ {y,11},
+ {y,12},
+ {y,13},
+ {y,14},
+ {y,15},
+ {y,16},
+ {y,17},
+ {y,18},
+ {y,19},
+ {y,20},
+ {y,21},
+ {y,22},
+ {y,25},
+ {y,26},
+ {y,27},
+ {y,28},
+ {y,29},
+ {y,30},
+ {y,31},
+ {y,32},
+ {y,33},
+ {y,35},
+ {y,36},
+ {y,37},
+ {y,38},
+ {y,40},
+ {y,41},
+ {y,42},
+ {y,43},
+ {y,44},
+ {y,45},
+ {y,46},
+ {y,47},
+ {y,48},
+ {y,49},
+ {y,50},
+ {y,51},
+ {y,52}]}}.
+ {move,{atom,ok},{y,23}}.
+ {move,{atom,ok},{y,24}}.
+ {move,{atom,ok},{y,34}}.
+ {move,{atom,ok},{y,39}}.
+ {call_ext,0,{extfunc,erlang,garbage_collect,0}}.
+ {move,{atom,ok},{x,0}}.
+ {deallocate,53}.
+ return.
+
+
+{function, module_info, 0, 4}.
+ {label,3}.
+ {line,[]}.
+ {func_info,{atom,beam_init_yregs},{atom,module_info},0}.
+ {label,4}.
+ {move,{atom,beam_init_yregs},{x,0}}.
+ {call_ext_only,1,{extfunc,erlang,get_module_info,1}}.
+
+
+{function, module_info, 1, 6}.
+ {label,5}.
+ {line,[]}.
+ {func_info,{atom,beam_init_yregs},{atom,module_info},1}.
+ {label,6}.
+ {move,{x,0},{x,1}}.
+ {move,{atom,beam_init_yregs},{x,0}}.
+ {call_ext_only,2,{extfunc,erlang,get_module_info,2}}.
--
2.35.3