File 2151-Inline-creation-of-small-maps-with-literal-keys.patch of Package erlang

From 42f7dfb2f4a3dd10b82454a544c3e93af5a2cd3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Fri, 29 Jul 2022 07:34:20 +0200
Subject: [PATCH] Inline creation of small maps with literal keys

Avoiding a call to a helper function improves the performance of
map creation.
---
 .../beam/jit/arm/beam_asm_global.hpp.pl       |  1 -
 erts/emulator/beam/jit/arm/instr_map.cpp      | 63 +++++++++-----
 erts/emulator/beam/jit/x86/beam_asm.hpp       |  9 ++
 .../beam/jit/x86/beam_asm_global.hpp.pl       |  1 -
 erts/emulator/beam/jit/x86/instr_map.cpp      | 87 ++++++++++++++-----
 5 files changed, 115 insertions(+), 46 deletions(-)

diff --git a/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl b/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl
index 3b7b2c9fa1..1cb8a155c3 100644
--- a/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl
+++ b/erts/emulator/beam/jit/arm/beam_asm_global.hpp.pl
@@ -83,7 +83,6 @@ my @beam_global_funcs = qw(
     i_length_guard_shared
     i_length_body_shared
     i_loop_rec_shared
-    i_new_small_map_lit_shared
     i_test_yield_shared
     i_bxor_body_shared
     int_div_rem_body_shared
diff --git a/erts/emulator/beam/jit/arm/instr_map.cpp b/erts/emulator/beam/jit/arm/instr_map.cpp
index fd837cf93f..fb3db42431 100644
--- a/erts/emulator/beam/jit/arm/instr_map.cpp
+++ b/erts/emulator/beam/jit/arm/instr_map.cpp
@@ -263,22 +263,6 @@ void BeamModuleAssembler::emit_new_map(const ArgRegister &Dst,
     mov_arg(Dst, ARG1);
 }
 
-void BeamGlobalAssembler::emit_i_new_small_map_lit_shared() {
-    emit_enter_runtime_frame();
-    emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
-                       Update::eReductions>();
-
-    a.mov(ARG1, c_p);
-    load_x_reg_array(ARG2);
-    runtime_call<5>(erts_gc_new_small_map_lit);
-
-    emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
-                       Update::eReductions>();
-    emit_leave_runtime_frame();
-
-    a.ret(a64::x30);
-}
-
 void BeamModuleAssembler::emit_i_new_small_map_lit(const ArgRegister &Dst,
                                                    const ArgWord &Live,
                                                    const ArgLiteral &Keys,
@@ -286,15 +270,50 @@ void BeamModuleAssembler::emit_i_new_small_map_lit(const ArgRegister &Dst,
                                                    const Span<ArgVal> &args) {
     ASSERT(Size.get() == args.size());
 
-    embed_vararg_rodata(args, ARG5);
+    emit_gc_test(ArgWord(0),
+                 ArgWord(args.size() + MAP_HEADER_FLATMAP_SZ + 1),
+                 Live);
 
-    ASSERT(Keys.isLiteral());
-    mov_arg(ARG3, Keys);
-    mov_arg(ARG4, Live);
+    std::vector<ArgVal> data;
+    data.reserve(args.size() + MAP_HEADER_FLATMAP_SZ + 1);
+    data.push_back(ArgWord(MAP_HEADER_FLATMAP));
+    data.push_back(Size);
+    data.push_back(Keys);
 
-    fragment_call(ga->get_i_new_small_map_lit_shared());
+    bool dst_is_src = false;
+    for (auto arg : args) {
+        data.push_back(arg);
+        dst_is_src |= (arg == Dst);
+    }
 
-    mov_arg(Dst, ARG1);
+    if (dst_is_src) {
+        a.add(TMP1, HTOP, TAG_PRIMARY_BOXED);
+    } else {
+        auto ptr = init_destination(Dst, TMP1);
+        a.add(ptr.reg, HTOP, TAG_PRIMARY_BOXED);
+        flush_var(ptr);
+    }
+
+    size_t size = data.size();
+    unsigned i;
+    for (i = 0; i < size - 1; i += 2) {
+        if ((i % 128) == 0) {
+            check_pending_stubs();
+        }
+
+        auto [first, second] = load_sources(data[i], TMP2, data[i + 1], TMP3);
+        a.stp(first.reg, second.reg, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+    }
+
+    if (i < size) {
+        mov_arg(arm::Mem(HTOP).post(sizeof(Eterm)), data[i]);
+    }
+
+    if (dst_is_src) {
+        auto ptr = init_destination(Dst, TMP1);
+        mov_var(ptr, TMP1);
+        flush_var(ptr);
+    }
 }
 
 /* ARG1 = map
diff --git a/erts/emulator/beam/jit/x86/beam_asm.hpp b/erts/emulator/beam/jit/x86/beam_asm.hpp
index 71294190b9..dccee87af4 100644
--- a/erts/emulator/beam/jit/x86/beam_asm.hpp
+++ b/erts/emulator/beam/jit/x86/beam_asm.hpp
@@ -1559,6 +1559,15 @@ protected:
         if (from.isImmed()) {
             auto val = from.as<ArgImmed>().get();
 
+            if (Support::isInt32((Sint)val)) {
+                a.mov(to, imm(val));
+            } else {
+                a.mov(spill, imm(val));
+                a.mov(to, spill);
+            }
+        } else if (from.isWord()) {
+            auto val = from.as<ArgWord>().get();
+
             if (Support::isInt32((Sint)val)) {
                 a.mov(to, imm(val));
             } else {
diff --git a/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl b/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl
index 2fa14f3ad9..7f2ca91c03 100755
--- a/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl
+++ b/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl
@@ -75,7 +75,6 @@ my @beam_global_funcs = qw(
     i_length_guard_shared
     i_length_body_shared
     i_loop_rec_shared
-    i_new_small_map_lit_shared
     i_test_yield_shared
     increment_body_shared
     int_div_rem_body_shared
diff --git a/erts/emulator/beam/jit/x86/instr_map.cpp b/erts/emulator/beam/jit/x86/instr_map.cpp
index 93847259de..94f13e3e0c 100644
--- a/erts/emulator/beam/jit/x86/instr_map.cpp
+++ b/erts/emulator/beam/jit/x86/instr_map.cpp
@@ -260,37 +260,80 @@ void BeamModuleAssembler::emit_new_map(const ArgRegister &Dst,
     mov_arg(Dst, RET);
 }
 
-void BeamGlobalAssembler::emit_i_new_small_map_lit_shared() {
-    emit_enter_frame();
-    emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>();
-
-    a.mov(ARG1, c_p);
-    load_x_reg_array(ARG2);
-    runtime_call<5>(erts_gc_new_small_map_lit);
-
-    emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>();
-    emit_leave_frame();
-
-    a.ret();
-}
-
 void BeamModuleAssembler::emit_i_new_small_map_lit(const ArgRegister &Dst,
                                                    const ArgWord &Live,
                                                    const ArgLiteral &Keys,
                                                    const ArgWord &Size,
                                                    const Span<ArgVal> &args) {
-    Label data = embed_vararg_rodata(args, CP_SIZE);
-
     ASSERT(Size.get() == args.size());
 
-    ASSERT(Keys.isLiteral());
-    mov_arg(ARG3, Keys);
-    mov_imm(ARG4, Live.get());
-    a.lea(ARG5, x86::qword_ptr(data));
+    emit_gc_test(ArgWord(0),
+                 ArgWord(args.size() + MAP_HEADER_FLATMAP_SZ + 1),
+                 Live);
 
-    fragment_call(ga->get_i_new_small_map_lit_shared());
+    std::vector<ArgVal> data;
+    data.reserve(args.size() + MAP_HEADER_FLATMAP_SZ + 1);
+    data.push_back(ArgWord(MAP_HEADER_FLATMAP));
+    data.push_back(Size);
+    data.push_back(Keys);
 
-    mov_arg(Dst, RET);
+    for (auto arg : args) {
+        data.push_back(arg);
+    }
+
+    size_t size = data.size();
+    unsigned i;
+
+    mov_arg(x86::qword_ptr(HTOP), data[0]);
+
+    /* Starting from 1 instead of 0 gives more opportunities for
+     * applying the SSE/AVX optimizations. */
+    for (i = 1; i < size - 1; i += 2) {
+        x86::Mem dst_ptr0 = x86::qword_ptr(HTOP, i * sizeof(Eterm));
+        x86::Mem dst_ptr1 = x86::qword_ptr(HTOP, (i + 1) * sizeof(Eterm));
+        auto first = data[i];
+        auto second = data[i + 1];
+
+        switch (ArgVal::memory_relation(first, second)) {
+        case ArgVal::consecutive: {
+            x86::Mem src_ptr = getArgRef(first, 16);
+
+            comment("(initializing two elements at once)");
+            dst_ptr0.setSize(16);
+            a.movups(x86::xmm0, src_ptr);
+            a.movups(dst_ptr0, x86::xmm0);
+            break;
+        }
+        case ArgVal::reverse_consecutive: {
+            if (!hasCpuFeature(CpuFeatures::X86::kAVX)) {
+                mov_arg(dst_ptr0, first);
+                mov_arg(dst_ptr1, second);
+            } else {
+                x86::Mem src_ptr = getArgRef(second, 16);
+
+                comment("(initializing with two swapped elements at once)");
+                dst_ptr0.setSize(16);
+                a.vpermilpd(x86::xmm0, src_ptr, 1); /* Load and swap */
+                a.vmovups(dst_ptr0, x86::xmm0);
+            }
+            break;
+        }
+        case ArgVal::none:
+            mov_arg(dst_ptr0, first);
+            mov_arg(dst_ptr1, second);
+            break;
+        }
+    }
+
+    if (i < size) {
+        x86::Mem dst_ptr = x86::qword_ptr(HTOP, i * sizeof(Eterm));
+        mov_arg(dst_ptr, data[i]);
+    }
+
+    a.lea(ARG1, x86::byte_ptr(HTOP, TAG_PRIMARY_BOXED));
+    a.add(HTOP, imm(size * sizeof(Eterm)));
+
+    mov_arg(Dst, ARG1);
 }
 
 /* ARG1 = map, ARG2 = key
-- 
2.35.3

openSUSE Build Service is sponsored by