File 2151-Inline-creation-of-small-maps-with-literal-keys.patch of Package erlang
From 42f7dfb2f4a3dd10b82454a544c3e93af5a2cd3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Fri, 29 Jul 2022 07:34:20 +0200
Subject: [PATCH] Inline creation of small maps with literal keys
Avoiding calling a helper function improves performance of
map creation.
---
.../beam/jit/arm/beam_asm_global.hpp.pl | 1 -
erts/emulator/beam/jit/arm/instr_map.cpp | 63 +++++++++-----
erts/emulator/beam/jit/x86/beam_asm.hpp | 9 ++
.../beam/jit/x86/beam_asm_global.hpp.pl | 1 -
erts/emulator/beam/jit/x86/instr_map.cpp | 87 ++++++++++++++-----
5 files changed, 115 insertions(+), 46 deletions(-)
diff --git a/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl b/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl
index 3b7b2c9fa1..1cb8a155c3 100644
--- a/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl
+++ b/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl
@@ -83,7 +83,6 @@ my @beam_global_funcs = qw(
i_length_guard_shared
i_length_body_shared
i_loop_rec_shared
- i_new_small_map_lit_shared
i_test_yield_shared
i_bxor_body_shared
int_div_rem_body_shared
diff --git a/erts/emulator/beam/jit/arm/instr_map.cpp b/erts/emulator/beam/jit/arm/instr_map.cpp
index fd837cf93f..fb3db42431 100644
--- a/erts/emulator/beam/jit/arm/instr_map.cpp
+++ b/erts/emulator/beam/jit/arm/instr_map.cpp
@@ -263,22 +263,6 @@ void BeamModuleAssembler::emit_new_map(const ArgRegister &Dst,
mov_arg(Dst, ARG1);
}
-void BeamGlobalAssembler::emit_i_new_small_map_lit_shared() {
- emit_enter_runtime_frame();
- emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
- Update::eReductions>();
-
- a.mov(ARG1, c_p);
- load_x_reg_array(ARG2);
- runtime_call<5>(erts_gc_new_small_map_lit);
-
- emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
- Update::eReductions>();
- emit_leave_runtime_frame();
-
- a.ret(a64::x30);
-}
-
void BeamModuleAssembler::emit_i_new_small_map_lit(const ArgRegister &Dst,
const ArgWord &Live,
const ArgLiteral &Keys,
@@ -286,15 +270,50 @@ void BeamModuleAssembler::emit_i_new_small_map_lit(const ArgRegister &Dst,
const Span<ArgVal> &args) {
ASSERT(Size.get() == args.size());
- embed_vararg_rodata(args, ARG5);
+ emit_gc_test(ArgWord(0),
+ ArgWord(args.size() + MAP_HEADER_FLATMAP_SZ + 1),
+ Live);
- ASSERT(Keys.isLiteral());
- mov_arg(ARG3, Keys);
- mov_arg(ARG4, Live);
+ std::vector<ArgVal> data;
+ data.reserve(args.size() + MAP_HEADER_FLATMAP_SZ + 1);
+ data.push_back(ArgWord(MAP_HEADER_FLATMAP));
+ data.push_back(Size);
+ data.push_back(Keys);
- fragment_call(ga->get_i_new_small_map_lit_shared());
+ bool dst_is_src = false;
+ for (auto arg : args) {
+ data.push_back(arg);
+ dst_is_src |= (arg == Dst);
+ }
- mov_arg(Dst, ARG1);
+ if (dst_is_src) {
+ a.add(TMP1, HTOP, TAG_PRIMARY_BOXED);
+ } else {
+ auto ptr = init_destination(Dst, TMP1);
+ a.add(ptr.reg, HTOP, TAG_PRIMARY_BOXED);
+ flush_var(ptr);
+ }
+
+ size_t size = data.size();
+ unsigned i;
+ for (i = 0; i < size - 1; i += 2) {
+ if ((i % 128) == 0) {
+ check_pending_stubs();
+ }
+
+ auto [first, second] = load_sources(data[i], TMP2, data[i + 1], TMP3);
+ a.stp(first.reg, second.reg, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+ }
+
+ if (i < size) {
+ mov_arg(arm::Mem(HTOP).post(sizeof(Eterm)), data[i]);
+ }
+
+ if (dst_is_src) {
+ auto ptr = init_destination(Dst, TMP1);
+ mov_var(ptr, TMP1);
+ flush_var(ptr);
+ }
}
/* ARG1 = map
diff --git a/erts/emulator/beam/jit/x86/beam_asm.hpp b/erts/emulator/beam/jit/x86/beam_asm.hpp
index 71294190b9..dccee87af4 100644
--- a/erts/emulator/beam/jit/x86/beam_asm.hpp
+++ b/erts/emulator/beam/jit/x86/beam_asm.hpp
@@ -1559,6 +1559,15 @@ protected:
if (from.isImmed()) {
auto val = from.as<ArgImmed>().get();
+ if (Support::isInt32((Sint)val)) {
+ a.mov(to, imm(val));
+ } else {
+ a.mov(spill, imm(val));
+ a.mov(to, spill);
+ }
+ } else if (from.isWord()) {
+ auto val = from.as<ArgWord>().get();
+
if (Support::isInt32((Sint)val)) {
a.mov(to, imm(val));
} else {
diff --git a/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl b/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl
index 2fa14f3ad9..7f2ca91c03 100755
--- a/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl
+++ b/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl
@@ -75,7 +75,6 @@ my @beam_global_funcs = qw(
i_length_guard_shared
i_length_body_shared
i_loop_rec_shared
- i_new_small_map_lit_shared
i_test_yield_shared
increment_body_shared
int_div_rem_body_shared
diff --git a/erts/emulator/beam/jit/x86/instr_map.cpp b/erts/emulator/beam/jit/x86/instr_map.cpp
index 93847259de..94f13e3e0c 100644
--- a/erts/emulator/beam/jit/x86/instr_map.cpp
+++ b/erts/emulator/beam/jit/x86/instr_map.cpp
@@ -260,37 +260,80 @@ void BeamModuleAssembler::emit_new_map(const ArgRegister &Dst,
mov_arg(Dst, RET);
}
-void BeamGlobalAssembler::emit_i_new_small_map_lit_shared() {
- emit_enter_frame();
- emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>();
-
- a.mov(ARG1, c_p);
- load_x_reg_array(ARG2);
- runtime_call<5>(erts_gc_new_small_map_lit);
-
- emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>();
- emit_leave_frame();
-
- a.ret();
-}
-
void BeamModuleAssembler::emit_i_new_small_map_lit(const ArgRegister &Dst,
const ArgWord &Live,
const ArgLiteral &Keys,
const ArgWord &Size,
const Span<ArgVal> &args) {
- Label data = embed_vararg_rodata(args, CP_SIZE);
-
ASSERT(Size.get() == args.size());
- ASSERT(Keys.isLiteral());
- mov_arg(ARG3, Keys);
- mov_imm(ARG4, Live.get());
- a.lea(ARG5, x86::qword_ptr(data));
+ emit_gc_test(ArgWord(0),
+ ArgWord(args.size() + MAP_HEADER_FLATMAP_SZ + 1),
+ Live);
- fragment_call(ga->get_i_new_small_map_lit_shared());
+ std::vector<ArgVal> data;
+ data.reserve(args.size() + MAP_HEADER_FLATMAP_SZ + 1);
+ data.push_back(ArgWord(MAP_HEADER_FLATMAP));
+ data.push_back(Size);
+ data.push_back(Keys);
- mov_arg(Dst, RET);
+ for (auto arg : args) {
+ data.push_back(arg);
+ }
+
+ size_t size = data.size();
+ unsigned i;
+
+ mov_arg(x86::qword_ptr(HTOP), data[0]);
+
+ /* Starting from 1 instead of 0 gives more opportunities for
+ * applying the SSE/AVX two-element moves below. */
+ for (i = 1; i < size - 1; i += 2) {
+ x86::Mem dst_ptr0 = x86::qword_ptr(HTOP, i * sizeof(Eterm));
+ x86::Mem dst_ptr1 = x86::qword_ptr(HTOP, (i + 1) * sizeof(Eterm));
+ auto first = data[i];
+ auto second = data[i + 1];
+
+ switch (ArgVal::memory_relation(first, second)) {
+ case ArgVal::consecutive: {
+ x86::Mem src_ptr = getArgRef(first, 16);
+
+ comment("(initializing two elements at once)");
+ dst_ptr0.setSize(16);
+ a.movups(x86::xmm0, src_ptr);
+ a.movups(dst_ptr0, x86::xmm0);
+ break;
+ }
+ case ArgVal::reverse_consecutive: {
+ if (!hasCpuFeature(CpuFeatures::X86::kAVX)) {
+ mov_arg(dst_ptr0, first);
+ mov_arg(dst_ptr1, second);
+ } else {
+ x86::Mem src_ptr = getArgRef(second, 16);
+
+ comment("(initializing with two swapped elements at once)");
+ dst_ptr0.setSize(16);
+ a.vpermilpd(x86::xmm0, src_ptr, 1); /* Load and swap */
+ a.vmovups(dst_ptr0, x86::xmm0);
+ }
+ break;
+ }
+ case ArgVal::none:
+ mov_arg(dst_ptr0, first);
+ mov_arg(dst_ptr1, second);
+ break;
+ }
+ }
+
+ if (i < size) {
+ x86::Mem dst_ptr = x86::qword_ptr(HTOP, i * sizeof(Eterm));
+ mov_arg(dst_ptr, data[i]);
+ }
+
+ a.lea(ARG1, x86::byte_ptr(HTOP, TAG_PRIMARY_BOXED));
+ a.add(HTOP, imm(size * sizeof(Eterm)));
+
+ mov_arg(Dst, ARG1);
}
/* ARG1 = map, ARG2 = key
--
2.35.3