File 1145-Optimize-select_val-with-consecutive-identical-label.patch of Package erlang

From 57bc32d84cbce65a4e6103ac88c3c22ad9a059fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Sat, 22 Jul 2023 08:27:50 +0200
Subject: [PATCH 25/25] Optimize select_val with consecutive identical labels

---
 erts/emulator/beam/jit/arm/beam_asm.hpp     |   7 +-
 erts/emulator/beam/jit/arm/instr_select.cpp | 150 +++++++++++++-------
 erts/emulator/beam/jit/x86/beam_asm.hpp     |   6 +-
 erts/emulator/beam/jit/x86/instr_select.cpp |  58 ++++----
 erts/emulator/test/beam_SUITE.erl           |  47 +++++-
 5 files changed, 174 insertions(+), 94 deletions(-)

diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp
index 11b796dfbf..7ab52fb5c0 100644
--- a/erts/emulator/beam/jit/arm/beam_asm.hpp
+++ b/erts/emulator/beam/jit/arm/beam_asm.hpp
@@ -1289,9 +1289,10 @@ protected:
                               Label fail,
                               const Span<ArgVal> &args);
 
-    bool emit_optimized_three_way_select(arm::Gp reg,
-                                         Label fail,
-                                         const Span<ArgVal> &args);
+    void emit_optimized_two_way_select(arm::Gp reg,
+                                       const ArgVal &value1,
+                                       const ArgVal &value2,
+                                       const ArgVal &label);
 
 #ifdef DEBUG
     void emit_tuple_assertion(const ArgSource &Src, arm::Gp tuple_reg);
diff --git a/erts/emulator/beam/jit/arm/instr_select.cpp b/erts/emulator/beam/jit/arm/instr_select.cpp
index 9037d63bf1..494c8e5233 100644
--- a/erts/emulator/beam/jit/arm/instr_select.cpp
+++ b/erts/emulator/beam/jit/arm/instr_select.cpp
@@ -198,17 +198,51 @@ void BeamModuleAssembler::emit_linear_search(arm::Gp comparand,
                                              const Span<ArgVal> &args) {
     int count = args.size() / 2;
 
+    ASSERT(count < 128);
+    check_pending_stubs();
+
     for (int i = 0; i < count; i++) {
         const ArgVal &value = args[i];
         const ArgVal &label = args[i + count];
+        int j;
+        int n = 1;
 
-        if ((i % 128) == 0) {
-            /* Checking veneers on the first element is intentional. */
-            check_pending_stubs();
+        for (j = i + 1; j < count && args[j + count] == label; j++) {
+            n++;
         }
+        if (n < 2) {
+            cmp_arg(comparand, value);
+            a.b_eq(resolve_beam_label(label, disp1MB));
+        } else {
+            int in_range = 1;
+
+            for (j = i + 1; j < n; j++) {
+                if (!(value.isWord() && value.as<ArgWord>().get() + in_range ==
+                                                args[j].as<ArgWord>().get())) {
+                    break;
+                }
+                in_range++;
+            }
 
-        cmp_arg(comparand, value);
-        a.b_eq(resolve_beam_label(label, disp1MB));
+            if (in_range > 2) {
+                uint64_t first = value.as<ArgWord>().get();
+
+                if (first == 0) {
+                    a.cmp(comparand, imm(in_range));
+                } else {
+                    sub(TMP6, comparand, first);
+                    a.cmp(TMP6, imm(in_range));
+                }
+                a.b_lo(resolve_beam_label(label, disp1MB));
+                i += in_range - 1;
+            } else {
+                emit_optimized_two_way_select(comparand,
+                                              value,
+                                              args[i + 1],
+                                              label);
+                i++;
+            }
+        }
     }
 
     /* An invalid label means fallthrough to the next instruction. */
@@ -239,7 +273,7 @@ void BeamModuleAssembler::emit_i_select_tuple_arity(const ArgRegister &Src,
     }
 
     Label fail = rawLabels[Fail.get()];
-    emit_linear_search(TMP1, fail, args);
+    emit_binsearch_nodes(TMP1, 0, args.size() / 2 - 1, fail, args);
 }
 
 void BeamModuleAssembler::emit_i_select_val_lins(const ArgSource &Src,
@@ -284,16 +318,11 @@ void BeamModuleAssembler::emit_i_select_val_lins(const ArgSource &Src,
     auto shift = plan.second;
 
     if (base == 0 && shift == 0) {
-        if (!emit_optimized_three_way_select(src.reg, fail, args)) {
-            emit_linear_search(src.reg, fail, args);
-        }
+        emit_linear_search(src.reg, fail, args);
     } else {
         auto untagged =
                 emit_select_untag(Src, args, src.reg, next, base, shift);
-
-        if (!emit_optimized_three_way_select(ARG1, fail, untagged)) {
-            emit_linear_search(ARG1, fail, untagged);
-        }
+        emit_linear_search(ARG1, fail, untagged);
     }
 
     if (!Fail.isLabel()) {
@@ -467,7 +496,7 @@ void BeamModuleAssembler::emit_i_jump_on_val(const ArgSource &Src,
     a.bind(data);
     if (embedInText) {
         for (const ArgVal &arg : args) {
-            ASSERT(arg.getType() ==  ArgVal::Label);
+            ASSERT(arg.getType() == ArgVal::Label);
             a.embedLabel(rawLabels[arg.as<ArgLabel>().get()]);
         }
     }
@@ -478,52 +507,65 @@ void BeamModuleAssembler::emit_i_jump_on_val(const ArgSource &Src,
 }
 
 /*
- * Attempt to optimize the case when a select_val has exactly two
- * values which only differ by one bit and they both branch to the
- * same label.
+ * Optimize the case when a select_val has exactly two values that
+ * both branch to the same label.
  *
- * The optimization makes use of the observation that (V == X || V ==
- * Y) is equivalent to (V | (X ^ Y)) == (X | Y) when (X ^ Y) has only
- * one bit set.
+ * If the values only differ by one bit, the optimization makes use of
+ * the observation that (V == X || V == Y) is equivalent to (V | (X ^
+ * Y)) == (X | Y) when (X ^ Y) has only one bit set.
+ *
+ * If more than one bit differ, one conditional branch instruction can
+ * still be eliminated by using the CCMP instruction.
  *
  * Return true if the optimization was possible.
  */
-bool BeamModuleAssembler::emit_optimized_three_way_select(
-        arm::Gp reg,
-        Label fail,
-        const Span<ArgVal> &args) {
-    if (args.size() != 4 || (args[2] != args[3])) {
-        return false;
-    }
-
-    uint64_t x = args[0].isImmed() ? args[0].as<ArgImmed>().get()
-                                   : args[0].as<ArgWord>().get();
-    uint64_t y = args[1].isImmed() ? args[1].as<ArgImmed>().get()
-                                   : args[1].as<ArgWord>().get();
-    uint64_t combined = x | y;
+void BeamModuleAssembler::emit_optimized_two_way_select(arm::Gp reg,
+                                                        const ArgVal &value1,
+                                                        const ArgVal &value2,
+                                                        const ArgVal &label) {
+    uint64_t x = value1.isImmed() ? value1.as<ArgImmed>().get()
+                                  : value1.as<ArgWord>().get();
+    uint64_t y = value2.isImmed() ? value2.as<ArgImmed>().get()
+                                  : value2.as<ArgWord>().get();
     uint64_t diff = x ^ y;
 
-    ArgWord val(combined);
-
-    if ((diff & (diff - 1)) != 0) {
-        return false;
-    }
-
-    comment("(Src == 0x%x || Src == 0x%x) <=> (Src | 0x%x) == 0x%x",
-            x,
-            y,
-            diff,
-            combined);
+    /* Be sure to use a register not used by any caller. */
+    arm::Gp tmp = TMP6;
 
-    a.orr(TMP1, reg, imm(diff));
-    cmp_arg(TMP1, val);
-    a.b_eq(resolve_beam_label(args[2], disp1MB));
-
-    /* An invalid label means fallthrough to the next instruction. */
-    if (fail.isValid()) {
-        a.b(resolve_label(fail, disp128MB));
-        mark_unreachable_check_pending_stubs();
+    if (x + 1 == y) {
+        comment("(Src == %ld || Src == %ld) <=> (Src - %ld) < 2", x, y, x);
+        if (x == 0) {
+            a.cmp(reg, imm(2));
+        } else {
+            sub(tmp, reg, x);
+            a.cmp(tmp, imm(2));
+        }
+        a.b_lo(resolve_beam_label(label, disp1MB));
+    } else if ((diff & (diff - 1)) == 0) {
+        uint64_t combined = x | y;
+        ArgWord val(combined);
+
+        comment("(Src == 0x%x || Src == 0x%x) <=> (Src | 0x%x) == 0x%x",
+                x,
+                y,
+                diff,
+                combined);
+
+        a.orr(tmp, reg, imm(diff));
+        cmp_arg(tmp, val);
+        a.b_eq(resolve_beam_label(label, disp1MB));
+    } else {
+        if (x < 32) {
+            cmp(reg, y);
+            a.ccmp(reg, imm(x), imm(NZCV::kEqual), imm(arm::CondCode::kNE));
+        } else if (-y < 32) {
+            cmp(reg, x);
+            a.ccmn(reg, imm(-y), imm(NZCV::kEqual), imm(arm::CondCode::kNE));
+        } else {
+            cmp(reg, x);
+            a.mov(tmp, y);
+            a.ccmp(reg, tmp, imm(NZCV::kEqual), imm(arm::CondCode::kNE));
+        }
+        a.b_eq(resolve_beam_label(label, disp1MB));
     }
-
-    return true;
 }
diff --git a/erts/emulator/beam/jit/x86/beam_asm.hpp b/erts/emulator/beam/jit/x86/beam_asm.hpp
index 907209bb30..648a4bbc73 100644
--- a/erts/emulator/beam/jit/x86/beam_asm.hpp
+++ b/erts/emulator/beam/jit/x86/beam_asm.hpp
@@ -1364,8 +1364,10 @@ protected:
                               const ArgVal &Fail,
                               const Span<ArgVal> &args);
 
-    bool emit_optimized_three_way_select(const ArgVal &Fail,
-                                         const Span<ArgVal> &args);
+    bool emit_optimized_two_way_select(bool destructive,
+                                       const ArgVal &value1,
+                                       const ArgVal &value2,
+                                       const ArgVal &label);
 
 #ifdef DEBUG
     void emit_tuple_assertion(const ArgSource &Src, x86::Gp tuple_reg);
diff --git a/erts/emulator/beam/jit/x86/instr_select.cpp b/erts/emulator/beam/jit/x86/instr_select.cpp
index 529e1f4f01..fe1736fc7b 100644
--- a/erts/emulator/beam/jit/x86/instr_select.cpp
+++ b/erts/emulator/beam/jit/x86/instr_select.cpp
@@ -31,6 +31,16 @@ void BeamModuleAssembler::emit_linear_search(x86::Gp comparand,
         const ArgImmed &value = args[i];
         const ArgLabel &label = args[i + count];
 
+        if (i < count - 1 && label == args[i + count + 1]) {
+            if (emit_optimized_two_way_select(i == count - 2,
+                                              value,
+                                              args[i + 1],
+                                              label)) {
+                i++;
+                continue;
+            }
+        }
+
         cmp_arg(comparand, value, ARG1);
         a.je(resolve_beam_label(label));
     }
@@ -86,10 +96,6 @@ void BeamModuleAssembler::emit_i_select_val_lins(const ArgSource &Src,
 
     mov_arg(ARG2, Src);
 
-    if (emit_optimized_three_way_select(Fail, args)) {
-        return;
-    }
-
     emit_linear_search(ARG2, Fail, args);
 }
 
@@ -157,9 +163,7 @@ void BeamModuleAssembler::emit_binsearch_nodes(size_t Left,
                       args.begin() + Left + count,
                       args.begin() + count + Left + remaining);
 
-        if (!emit_optimized_three_way_select(Fail, shrunk)) {
-            emit_linear_search(ARG2, Fail, shrunk);
-        }
+        emit_linear_search(ARG2, Fail, shrunk);
 
         return;
     }
@@ -249,18 +253,17 @@ void BeamModuleAssembler::emit_i_jump_on_val(const ArgSource &Src,
  * one bit set.
  *
  * ARG2 contains the value.
- * Return true if the optimization was possible, in
- * which case ARG1 should be considered trashed.
+ *
+ * Return true if the optimization was possible, in which case ARG1
+ * and RET should be considered trashed. If the destructive argument
+ * is true, ARG2 will also be trashed.
  */
-bool BeamModuleAssembler::emit_optimized_three_way_select(
-        const ArgVal &Fail,
-        const Span<ArgVal> &args) {
-    if (args.size() != 4 || (args[2] != args[3])) {
-        return false;
-    }
-
-    uint64_t x = args[0].as<ArgImmed>().get();
-    uint64_t y = args[1].as<ArgImmed>().get();
+bool BeamModuleAssembler::emit_optimized_two_way_select(bool destructive,
+                                                        const ArgVal &value1,
+                                                        const ArgVal &value2,
+                                                        const ArgVal &label) {
+    uint64_t x = value1.as<ArgImmed>().get();
+    uint64_t y = value2.as<ArgImmed>().get();
     uint64_t combined = x | y;
     uint64_t diff = x ^ y;
     ArgVal val(ArgVal::Immediate, combined);
@@ -273,22 +276,15 @@ bool BeamModuleAssembler::emit_optimized_three_way_select(
             diff,
             combined);
 
-    if (Support::isInt32((Sint)diff)) {
+    if (destructive && Support::isInt32((Sint)diff)) {
         a.or_(ARG2, imm(diff));
+        cmp_arg(ARG2, val, RET);
     } else {
-        a.mov(ARG1, imm(diff));
-        a.or_(ARG2, ARG1);
-    }
-
-    cmp_arg(ARG2, val, ARG1);
-    a.je(resolve_beam_label(args[2]));
-
-    if (Fail.isLabel()) {
-        a.jmp(resolve_beam_label(Fail));
-    } else {
-        /* NIL means fallthrough to the next instruction. */
-        ASSERT(Fail.isNil());
+        mov_imm(RET, diff);
+        a.or_(RET, ARG2);
+        cmp_arg(RET, val, ARG1);
     }
+    a.je(resolve_beam_label(label));
 
     return true;
 }
diff --git a/erts/emulator/test/beam_SUITE.erl b/erts/emulator/test/beam_SUITE.erl
index c38aa9a1ab..eff8c6b426 100644
--- a/erts/emulator/test/beam_SUITE.erl
+++ b/erts/emulator/test/beam_SUITE.erl
@@ -24,7 +24,7 @@
 	 init_per_group/2,end_per_group/2, 
 	 packed_registers/1, apply_last/1, apply_last_bif/1,
 	 heap_sizes/1, big_lists/1, fconv/1,
-         select_val/1,
+         select_val/1, select_tuple_arity/1,
          swap_temp_apply/1, beam_init_yregs/1]).
 
 -export([applied/2,swap_temp_applied/1]).
@@ -36,9 +36,9 @@ suite() -> [{ct_hooks,[ts_install_cth]}].
 
 all() -> 
     [packed_registers, apply_last, apply_last_bif,
-     heap_sizes, big_lists, select_val,
-     swap_temp_apply,
-     beam_init_yregs].
+     heap_sizes, big_lists,
+     select_val, select_tuple_arity,
+     swap_temp_apply, beam_init_yregs].
 
 groups() -> 
     [].
@@ -397,6 +397,45 @@ make_func(Name, List) ->
     Cs = [?Q(["(_@I@) -> _@Body@"]) || {I,Body} <- List],
     erl_syntax:function(erl_syntax:atom(Name), Cs).
 
+select_tuple_arity(_Config) ->
+    Mod = ?FUNCTION_NAME,
+
+    {Vs,Cs} = make_tuple_tests(300, [], []),
+    Name = erl_syntax:atom(match_tuple),
+    F = erl_syntax:function(Name, Cs),
+    Code = ?Q(["-module('@Mod@').\n"
+               "-export([match_tuple/1]).\n"]) ++ [F],
+
+    merl:compile_and_load(Code, []),
+
+    %% %% Uncomment the following line to print the generated code.
+    %% merl:print(Code),
+
+    verify_tuple_match(Vs, Mod),
+
+    %% Clean up.
+    true = code:delete(Mod),
+    false = code:purge(Mod),
+
+    ok.
+
+make_tuple_tests(0, Clauses, Acc) ->
+    {Acc,Clauses};
+make_tuple_tests(Size, Clauses, Acc) ->
+    V = erlang:phash2(Size),
+    Es = lists:duplicate(Size, erl_syntax:underscore()),
+    Tuple = erl_syntax:tuple(Es),
+    Value = erl_syntax:integer(V),
+    Clause = erl_syntax:clause([Tuple], [], [Value]),
+    make_tuple_tests(Size-1, [Clause|Clauses], [{Size,V}|Acc]).
+
+verify_tuple_match([{Size,Result}|T], Mod) ->
+    Tuple = erlang:make_tuple(Size, a),
+    Result = Mod:match_tuple(Tuple),
+    verify_tuple_match(T, Mod);
+verify_tuple_match([], _) ->
+    ok.
+
 swap_temp_apply(_Config) ->
     {swap_temp_applied,42} = do_swap_temp_apply(41),
     not_an_integer = do_swap_temp_apply(not_an_integer),
-- 
2.35.3

openSUSE Build Service is sponsored by