File 1139-AArch64-Enhance-optimizations-of-two-move-instructio.patch of Package erlang
From 77cfe46376058624d82314f4b16fc183ebf6e88c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Wed, 27 Sep 2023 08:01:32 +0200
Subject: [PATCH 19/25] AArch64: Enhance optimizations of two move instructions
* Replace the instructions `store_two_xregs` and `move_two_yregs` with
the more general `store_two_values`, which also supports storing
any value into Y registers.
* Replace an `init_yreg` instruction that initializes a single
register with a `move` instruction. That gives more opportunities for
optimizations, such as combining it with another `move` instruction or
with a `trim` instruction.
---
erts/emulator/beam/jit/arm/generators.tab | 15 +++++++++++
erts/emulator/beam/jit/arm/instr_common.cpp | 30 +++------------------
erts/emulator/beam/jit/arm/ops.tab | 30 +++++++++------------
3 files changed, 31 insertions(+), 44 deletions(-)
diff --git a/erts/emulator/beam/jit/arm/generators.tab b/erts/emulator/beam/jit/arm/generators.tab
index 9e0a686ea6..7a4aa84028 100644
--- a/erts/emulator/beam/jit/arm/generators.tab
+++ b/erts/emulator/beam/jit/arm/generators.tab
@@ -450,6 +450,21 @@ gen.func_end(Func_Label, Entry_Label) {
return op;
}
+gen.init_yregs_move(Size, Rest) {
+ BeamOp* op;
+
+ ASSERT(Size.val == 1);
+
+ $NewBeamOp(S, op);
+ $BeamOpNameArity(op, move, 2);
+ op->next = NULL;
+ op->a[0].type = TAG_n;
+ op->a[0].val = 0;
+ op->a[1] = Rest[0];
+
+ return op;
+}
+
gen.create_bin(Fail, Alloc, Live, Unit, Dst, N, Segments) {
BeamOp* op;
int fixed_args;
diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp
index 63214821db..755124dcf9 100644
--- a/erts/emulator/beam/jit/arm/instr_common.cpp
+++ b/erts/emulator/beam/jit/arm/instr_common.cpp
@@ -558,10 +558,10 @@ void BeamModuleAssembler::emit_move_trim(const ArgSource &Src,
}
}
-void BeamModuleAssembler::emit_store_two_xregs(const ArgXRegister &Src1,
- const ArgYRegister &Dst1,
- const ArgXRegister &Src2,
- const ArgYRegister &Dst2) {
+void BeamModuleAssembler::emit_store_two_values(const ArgSource &Src1,
+ const ArgYRegister &Dst1,
+ const ArgSource &Src2,
+ const ArgYRegister &Dst2) {
auto [src1, src2] = load_sources(Src1, TMP1, Src2, TMP2);
auto dst1 = init_destination(Dst1, src1.reg);
auto dst2 = init_destination(Dst2, src2.reg);
@@ -582,28 +582,6 @@ void BeamModuleAssembler::emit_load_two_xregs(const ArgYRegister &Src1,
flush_vars(dst1, dst2);
}
-void BeamModuleAssembler::emit_move_two_yregs(const ArgYRegister &Src1,
- const ArgYRegister &Dst1,
- const ArgYRegister &Src2,
- const ArgYRegister &Dst2) {
- /* Optimize fetching of source Y registers. */
- switch (ArgVal::memory_relation(Src1, Src2)) {
- case ArgVal::Relation::consecutive:
- safe_ldp(TMP1, TMP2, Src1, Src2);
- break;
- case ArgVal::Relation::reverse_consecutive:
- safe_ldp(TMP2, TMP1, Src2, Src1);
- break;
- case ArgVal::Relation::none:
- a.ldr(TMP1, getArgRef(Src1));
- a.ldr(TMP2, getArgRef(Src2));
- break;
- }
-
- /* Destination registers are always in consecutive order. */
- safe_stp(TMP1, TMP2, Dst1, Dst2);
-}
-
void BeamModuleAssembler::emit_swap(const ArgRegister &R1,
const ArgRegister &R2) {
if (isRegisterBacked(R1)) {
diff --git a/erts/emulator/beam/jit/arm/ops.tab b/erts/emulator/beam/jit/arm/ops.tab
index 5e75154e81..cd22696347 100644
--- a/erts/emulator/beam/jit/arm/ops.tab
+++ b/erts/emulator/beam/jit/arm/ops.tab
@@ -95,6 +95,11 @@ trim t t
allocate_zero Ns Live => allocate_heap_zero Ns u Live
allocate_heap_zero Ns Nh Live => allocate_heap_zero(Ns, Nh, Live)
+# Use a move instruction for initializing a single Y register, since
+# that can open up more optimizations (for example, combining a move +
+# trim to move_trim).
+init_yregs Size=u==1 Regs=* => init_yregs_move(Size, Regs)
+
init_yregs I *
# Selecting values.
@@ -321,19 +326,10 @@ system_limit_body
#
# Optimize moves of consecutive memory addresses.
#
-move Src=c Dst => i_move Src Dst
move Src SrcDst | move SrcDst2 Dst |
equal(SrcDst, SrcDst2) =>
i_move Src SrcDst | move SrcDst Dst
-# Optimize two moves from X registers to Y registers when destination
-# Y registers are consecutive.
-
-move S1=x D1=y | move S2=x D2=y | consecutive_words(D1, D2) =>
- store_two_xregs S1 D1 S2 D2
-move S1=x D1=y | move S2=x D2=y | consecutive_words(D2, D1) =>
- store_two_xregs S2 D2 S1 D1
-
# Optimize two moves from Y registers to X registers when source Y
# registers are consecutive.
@@ -347,14 +343,13 @@ move S1=y D1=x | move S2=y D2=x |
distinct(D1, D2) =>
load_two_xregs S2 D2 S1 D1
-# Optimize two moves of Y registers when destinations are consecutive.
-move S1=y D1=y | move S2=y D2=y |
- consecutive_words(D1, D2) =>
- move_two_yregs S1 D1 S2 D2
+# Optimize storing two values in Y registers when destination Y
+# registers are consecutive.
-move S1=y D1=y | move S2=y D2=y |
- consecutive_words(D2, D1) =>
- move_two_yregs S2 D2 S1 D1
+move S1 D1=y | move S2 D2=y | consecutive_words(D1, D2) =>
+ store_two_values S1 D1 S2 D2
+move S1 D1=y | move S2 D2=y | consecutive_words(D2, D1) =>
+ store_two_values S2 D2 S1 D1
move Src Dst | trim N u => move_trim Src Dst N
@@ -363,9 +358,8 @@ move_trim s d t
move Src Dst => i_move Src Dst
i_move s d
-store_two_xregs x y x y
+store_two_values s y s y
load_two_xregs y x y x
-move_two_yregs y y y y
#
# Swap instructions.
--
2.35.3