File 1138-AArch64-Optimize-move_call_last-move_ext_call_last.patch of Package erlang

From 439d30ad9480e02800ac8e91d904f8944e18fe0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Mon, 18 Sep 2023 15:05:42 +0200
Subject: [PATCH 18/25] AArch64: Optimize move_call_last & move_call_ext_last

There are more than 4700 occurrences of the following instruction
sequence in the OTP code base:

    ldr x25, [x20], 8
    ldr x30, [x20], 8

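Both loads post-increment the same base register by 8, so they can be
merged into a single load-pair instruction that post-increments by 16: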

    ldp x25, x30, [x20], 16
---
 erts/emulator/beam/jit/arm/instr_call.cpp | 24 +++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/erts/emulator/beam/jit/arm/instr_call.cpp b/erts/emulator/beam/jit/arm/instr_call.cpp
index 2699c30962..8f6f587698 100644
--- a/erts/emulator/beam/jit/arm/instr_call.cpp
+++ b/erts/emulator/beam/jit/arm/instr_call.cpp
@@ -79,16 +79,23 @@ void BeamModuleAssembler::emit_move_call_last(const ArgYRegister &Src,
     auto src_index = Src.get();
     Sint deallocate = Deallocate.get() * sizeof(Eterm);
 
-    if (src_index == 0 && Support::isInt9(deallocate)) {
+    if (src_index == 0 && deallocate == 8) {
+        auto dst = init_destination(Dst, TMP1);
+        const arm::Mem src_ref = arm::Mem(E).post(2 * deallocate);
+        a.ldp(dst.reg, a64::x30, src_ref);
+        flush_var(dst);
+        a.b(resolve_beam_label(CallTarget, disp128MB));
+    } else if (src_index == 0 && Support::isInt9(deallocate)) {
         auto dst = init_destination(Dst, TMP1);
         const arm::Mem src_ref = arm::Mem(E).post(deallocate);
         a.ldr(dst.reg, src_ref);
         flush_var(dst);
+        emit_i_call_only(CallTarget);
     } else {
         mov_arg(Dst, Src);
         emit_deallocate(Deallocate);
+        emit_i_call_only(CallTarget);
     }
-    emit_i_call_only(CallTarget);
 }
 
 void BeamModuleAssembler::emit_i_call_only(const ArgLabel &CallTarget) {
@@ -151,16 +158,25 @@ void BeamModuleAssembler::emit_move_call_ext_last(const ArgYRegister &Src,
     auto src_index = Src.get();
     Sint deallocate = Deallocate.get() * sizeof(Eterm);
 
-    if (src_index == 0 && Support::isInt9(deallocate)) {
+    if (src_index == 0 && deallocate == 8) {
+        auto dst = init_destination(Dst, TMP1);
+        const arm::Mem src_ref = arm::Mem(E).post(2 * deallocate);
+        mov_arg(ARG1, Exp);
+        arm::Mem target = emit_setup_dispatchable_call(ARG1);
+        a.ldp(dst.reg, a64::x30, src_ref);
+        flush_var(dst);
+        branch(target);
+    } else if (src_index == 0 && Support::isInt9(deallocate)) {
         auto dst = init_destination(Dst, TMP1);
         const arm::Mem src_ref = arm::Mem(E).post(deallocate);
         a.ldr(dst.reg, src_ref);
         flush_var(dst);
+        emit_i_call_ext_only(Exp);
     } else {
         mov_arg(Dst, Src);
         emit_deallocate(Deallocate);
+        emit_i_call_ext_only(Exp);
     }
-    emit_i_call_ext_only(Exp);
 }
 
 static ErtsCodeMFA apply3_mfa = {am_erlang, am_apply, 3};
-- 
2.35.3

openSUSE Build Service is sponsored by