File 1134-Optimize-tuple-construction-with-repeated-elements.patch of Package erlang

From 3c90c8bfff0c89de56de84e678a24a0c5e55c684 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Sat, 9 Sep 2023 17:12:28 +0200
Subject: [PATCH 14/25] Optimize tuple construction with repeated elements

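When the same value is repeated in adjacent tuple elements, load it
into a temporary register once and reuse that register for the stores,
instead of materializing the value separately for each element.

While at it, on AArch64, shorten tuple construction by one instruction
when the destination register is also a source register.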
---
 erts/emulator/beam/jit/arm/instr_common.cpp | 49 ++++++++++++---------
 erts/emulator/beam/jit/x86/instr_common.cpp | 25 ++++++++++-
 2 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp
index 15e5d54553..1963263a19 100644
--- a/erts/emulator/beam/jit/arm/instr_common.cpp
+++ b/erts/emulator/beam/jit/arm/instr_common.cpp
@@ -720,41 +720,46 @@ void BeamModuleAssembler::emit_put_tuple2(const ArgRegister &Dst,
     std::vector<ArgVal> data;
     data.reserve(args.size() + 1);
     data.push_back(Arity);
-
-    bool dst_is_src = false;
-    for (auto arg : args) {
-        data.push_back(arg);
-        dst_is_src |= (arg == Dst);
-    }
-
-    if (dst_is_src) {
-        a.add(TMP1, HTOP, TAG_PRIMARY_BOXED);
-    } else {
-        auto ptr = init_destination(Dst, TMP1);
-        a.add(ptr.reg, HTOP, TAG_PRIMARY_BOXED);
-        flush_var(ptr);
-    }
+    data.insert(data.end(), std::begin(args), std::end(args));
 
     size_t size = data.size();
     unsigned i;
+    ArgVal value = ArgWord(0);
     for (i = 0; i < size - 1; i += 2) {
         if ((i % 128) == 0) {
             check_pending_stubs();
         }
 
-        auto [first, second] = load_sources(data[i], TMP2, data[i + 1], TMP3);
-        a.stp(first.reg, second.reg, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+        if (!data[i].isRegister() && data[i] == data[i + 1]) {
+            if (data[i] != value) {
+                value = data[i];
+                mov_arg(TMP1, value);
+            }
+            a.stp(TMP1, TMP1, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+        } else if (data[i] == value) {
+            auto second = load_source(data[i + 1], TMP3);
+            a.stp(TMP1, second.reg, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+        } else if (data[i + 1] == value) {
+            auto first = load_source(data[i], TMP2);
+            a.stp(first.reg, TMP1, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+        } else {
+            auto [first, second] =
+                    load_sources(data[i], TMP2, data[i + 1], TMP3);
+            a.stp(first.reg, second.reg, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+        }
     }
 
     if (i < size) {
-        mov_arg(arm::Mem(HTOP).post(sizeof(Eterm)), data[i]);
+        if (data[i] == value) {
+            a.str(TMP1, arm::Mem(HTOP).post(sizeof(Eterm)));
+        } else {
+            mov_arg(arm::Mem(HTOP).post(sizeof(Eterm)), data[i]);
+        }
     }
 
-    if (dst_is_src) {
-        auto ptr = init_destination(Dst, TMP1);
-        mov_var(ptr, TMP1);
-        flush_var(ptr);
-    }
+    auto ptr = init_destination(Dst, TMP1);
+    sub(ptr.reg, HTOP, size * sizeof(Eterm) - TAG_PRIMARY_BOXED);
+    flush_var(ptr);
 }
 
 void BeamModuleAssembler::emit_self(const ArgRegister &Dst) {
diff --git a/erts/emulator/beam/jit/x86/instr_common.cpp b/erts/emulator/beam/jit/x86/instr_common.cpp
index 48b21e8bc6..bbf05396f1 100644
--- a/erts/emulator/beam/jit/x86/instr_common.cpp
+++ b/erts/emulator/beam/jit/x86/instr_common.cpp
@@ -746,6 +746,7 @@ void BeamModuleAssembler::emit_put_tuple2(const ArgRegister &Dst,
                                           const ArgWord &Arity,
                                           const Span<ArgVal> &args) {
     size_t size = args.size();
+    ArgVal value = ArgWord(0);
 
     ASSERT(arityval(Arity.get()) == size);
 
@@ -784,10 +785,30 @@ void BeamModuleAssembler::emit_put_tuple2(const ArgRegister &Dst,
                 }
                 break;
             }
-            case ArgVal::none:
-                mov_arg(dst_ptr, args[i]);
+            case ArgVal::none: {
+                unsigned j;
+                if (value == args[i]) {
+                    a.mov(dst_ptr, RET);
+                    break;
+                }
+                for (j = i + 1; j < size && args[i] == args[j]; j++) {
+                    ;
+                }
+                if (j - i < 2) {
+                    mov_arg(dst_ptr, args[i]);
+                } else {
+                    value = args[i];
+                    mov_arg(RET, value);
+                    while (i < j) {
+                        dst_ptr = x86::qword_ptr(HTOP, (i + 1) * sizeof(Eterm));
+                        a.mov(dst_ptr, RET);
+                        i++;
+                    }
+                    i--;
+                }
                 break;
             }
+            }
         }
     }
 
-- 
2.35.3

openSUSE Build Service is sponsored by