File 1134-Optimize-tuple-construction-with-repeated-elements.patch of Package erlang
From 3c90c8bfff0c89de56de84e678a24a0c5e55c684 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Sat, 9 Sep 2023 17:12:28 +0200
Subject: [PATCH 14/25] Optimize tuple construction with repeated elements
While at it, on AArch64, shorten tuple construction by one instruction
when the destination register is also one of the source registers.
---
erts/emulator/beam/jit/arm/instr_common.cpp | 49 ++++++++++++---------
erts/emulator/beam/jit/x86/instr_common.cpp | 25 ++++++++++-
2 files changed, 50 insertions(+), 24 deletions(-)
diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp
index 15e5d54553..1963263a19 100644
--- a/erts/emulator/beam/jit/arm/instr_common.cpp
+++ b/erts/emulator/beam/jit/arm/instr_common.cpp
@@ -720,41 +720,46 @@ void BeamModuleAssembler::emit_put_tuple2(const ArgRegister &Dst,
std::vector<ArgVal> data;
data.reserve(args.size() + 1);
data.push_back(Arity);
-
- bool dst_is_src = false;
- for (auto arg : args) {
- data.push_back(arg);
- dst_is_src |= (arg == Dst);
- }
-
- if (dst_is_src) {
- a.add(TMP1, HTOP, TAG_PRIMARY_BOXED);
- } else {
- auto ptr = init_destination(Dst, TMP1);
- a.add(ptr.reg, HTOP, TAG_PRIMARY_BOXED);
- flush_var(ptr);
- }
+ data.insert(data.end(), std::begin(args), std::end(args));
size_t size = data.size();
unsigned i;
+ ArgVal value = ArgWord(0);
for (i = 0; i < size - 1; i += 2) {
if ((i % 128) == 0) {
check_pending_stubs();
}
- auto [first, second] = load_sources(data[i], TMP2, data[i + 1], TMP3);
- a.stp(first.reg, second.reg, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+ if (!data[i].isRegister() && data[i] == data[i + 1]) {
+ if (data[i] != value) {
+ value = data[i];
+ mov_arg(TMP1, value);
+ }
+ a.stp(TMP1, TMP1, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+ } else if (data[i] == value) {
+ auto second = load_source(data[i + 1], TMP3);
+ a.stp(TMP1, second.reg, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+ } else if (data[i + 1] == value) {
+ auto first = load_source(data[i], TMP2);
+ a.stp(first.reg, TMP1, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+ } else {
+ auto [first, second] =
+ load_sources(data[i], TMP2, data[i + 1], TMP3);
+ a.stp(first.reg, second.reg, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+ }
}
if (i < size) {
- mov_arg(arm::Mem(HTOP).post(sizeof(Eterm)), data[i]);
+ if (data[i] == value) {
+ a.str(TMP1, arm::Mem(HTOP).post(sizeof(Eterm)));
+ } else {
+ mov_arg(arm::Mem(HTOP).post(sizeof(Eterm)), data[i]);
+ }
}
- if (dst_is_src) {
- auto ptr = init_destination(Dst, TMP1);
- mov_var(ptr, TMP1);
- flush_var(ptr);
- }
+ auto ptr = init_destination(Dst, TMP1);
+ sub(ptr.reg, HTOP, size * sizeof(Eterm) - TAG_PRIMARY_BOXED);
+ flush_var(ptr);
}
void BeamModuleAssembler::emit_self(const ArgRegister &Dst) {
diff --git a/erts/emulator/beam/jit/x86/instr_common.cpp b/erts/emulator/beam/jit/x86/instr_common.cpp
index 48b21e8bc6..bbf05396f1 100644
--- a/erts/emulator/beam/jit/x86/instr_common.cpp
+++ b/erts/emulator/beam/jit/x86/instr_common.cpp
@@ -746,6 +746,7 @@ void BeamModuleAssembler::emit_put_tuple2(const ArgRegister &Dst,
const ArgWord &Arity,
const Span<ArgVal> &args) {
size_t size = args.size();
+ ArgVal value = ArgWord(0);
ASSERT(arityval(Arity.get()) == size);
@@ -784,10 +785,30 @@ void BeamModuleAssembler::emit_put_tuple2(const ArgRegister &Dst,
}
break;
}
- case ArgVal::none:
- mov_arg(dst_ptr, args[i]);
+ case ArgVal::none: {
+ unsigned j;
+ if (value == args[i]) {
+ a.mov(dst_ptr, RET);
+ break;
+ }
+ for (j = i + 1; j < size && args[i] == args[j]; j++) {
+ ;
+ }
+ if (j - i < 2) {
+ mov_arg(dst_ptr, args[i]);
+ } else {
+ value = args[i];
+ mov_arg(RET, value);
+ while (i < j) {
+ dst_ptr = x86::qword_ptr(HTOP, (i + 1) * sizeof(Eterm));
+ a.mov(dst_ptr, RET);
+ i++;
+ }
+ i--;
+ }
break;
}
+ }
}
}
--
2.35.3