File 1145-Optimize-select_val-with-consecutive-identical-label.patch of Package erlang
From 57bc32d84cbce65a4e6103ac88c3c22ad9a059fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Sat, 22 Jul 2023 08:27:50 +0200
Subject: [PATCH 25/25] Optimize select_val with consecutive identical labels
---
erts/emulator/beam/jit/arm/beam_asm.hpp | 7 +-
erts/emulator/beam/jit/arm/instr_select.cpp | 150 +++++++++++++-------
erts/emulator/beam/jit/x86/beam_asm.hpp | 6 +-
erts/emulator/beam/jit/x86/instr_select.cpp | 58 ++++----
erts/emulator/test/beam_SUITE.erl | 47 +++++-
5 files changed, 174 insertions(+), 94 deletions(-)
diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp
index 11b796dfbf..7ab52fb5c0 100644
--- a/erts/emulator/beam/jit/arm/beam_asm.hpp
+++ b/erts/emulator/beam/jit/arm/beam_asm.hpp
@@ -1289,9 +1289,10 @@ protected:
Label fail,
const Span<ArgVal> &args);
- bool emit_optimized_three_way_select(arm::Gp reg,
- Label fail,
- const Span<ArgVal> &args);
+ void emit_optimized_two_way_select(arm::Gp reg,
+ const ArgVal &value1,
+ const ArgVal &value2,
+ const ArgVal &label);
#ifdef DEBUG
void emit_tuple_assertion(const ArgSource &Src, arm::Gp tuple_reg);
diff --git a/erts/emulator/beam/jit/arm/instr_select.cpp b/erts/emulator/beam/jit/arm/instr_select.cpp
index 9037d63bf1..494c8e5233 100644
--- a/erts/emulator/beam/jit/arm/instr_select.cpp
+++ b/erts/emulator/beam/jit/arm/instr_select.cpp
@@ -198,17 +198,51 @@ void BeamModuleAssembler::emit_linear_search(arm::Gp comparand,
const Span<ArgVal> &args) {
int count = args.size() / 2;
+ ASSERT(count < 128);
+ check_pending_stubs();
+
for (int i = 0; i < count; i++) {
const ArgVal &value = args[i];
const ArgVal &label = args[i + count];
+ int j;
+ int n = 1;
- if ((i % 128) == 0) {
- /* Checking veneers on the first element is intentional. */
- check_pending_stubs();
+ for (j = i + 1; j < count && args[j + count] == label; j++) {
+ n++;
}
+ if (n < 2) {
+ cmp_arg(comparand, value);
+ a.b_eq(resolve_beam_label(label, disp1MB));
+ } else {
+ int in_range = 1;
+
+ for (j = i + 1; j < n; j++) {
+ if (!(value.isWord() && value.as<ArgWord>().get() + in_range ==
+ args[j].as<ArgWord>().get())) {
+ break;
+ }
+ in_range++;
+ }
- cmp_arg(comparand, value);
- a.b_eq(resolve_beam_label(label, disp1MB));
+ if (in_range > 2) {
+ uint64_t first = value.as<ArgWord>().get();
+
+ if (first == 0) {
+ a.cmp(comparand, imm(in_range));
+ } else {
+ sub(TMP6, comparand, first);
+ a.cmp(TMP6, imm(in_range));
+ }
+ a.b_lo(resolve_beam_label(label, disp1MB));
+ i += in_range - 1;
+ } else {
+ emit_optimized_two_way_select(comparand,
+ value,
+ args[i + 1],
+ label);
+ i++;
+ }
+ }
}
/* An invalid label means fallthrough to the next instruction. */
@@ -239,7 +273,7 @@ void BeamModuleAssembler::emit_i_select_tuple_arity(const ArgRegister &Src,
}
Label fail = rawLabels[Fail.get()];
- emit_linear_search(TMP1, fail, args);
+ emit_binsearch_nodes(TMP1, 0, args.size() / 2 - 1, fail, args);
}
void BeamModuleAssembler::emit_i_select_val_lins(const ArgSource &Src,
@@ -284,16 +318,11 @@ void BeamModuleAssembler::emit_i_select_val_lins(const ArgSource &Src,
auto shift = plan.second;
if (base == 0 && shift == 0) {
- if (!emit_optimized_three_way_select(src.reg, fail, args)) {
- emit_linear_search(src.reg, fail, args);
- }
+ emit_linear_search(src.reg, fail, args);
} else {
auto untagged =
emit_select_untag(Src, args, src.reg, next, base, shift);
-
- if (!emit_optimized_three_way_select(ARG1, fail, untagged)) {
- emit_linear_search(ARG1, fail, untagged);
- }
+ emit_linear_search(ARG1, fail, untagged);
}
if (!Fail.isLabel()) {
@@ -467,7 +496,7 @@ void BeamModuleAssembler::emit_i_jump_on_val(const ArgSource &Src,
a.bind(data);
if (embedInText) {
for (const ArgVal &arg : args) {
- ASSERT(arg.getType() == ArgVal::Label);
+ ASSERT(arg.getType() == ArgVal::Label);
a.embedLabel(rawLabels[arg.as<ArgLabel>().get()]);
}
}
@@ -478,52 +507,65 @@ void BeamModuleAssembler::emit_i_jump_on_val(const ArgSource &Src,
}
/*
- * Attempt to optimize the case when a select_val has exactly two
- * values which only differ by one bit and they both branch to the
- * same label.
+ * Optimize the case when a select_val has exactly two values that
+ * both branch to the same label.
*
- * The optimization makes use of the observation that (V == X || V ==
- * Y) is equivalent to (V | (X ^ Y)) == (X | Y) when (X ^ Y) has only
- * one bit set.
+ * If the values only differ by one bit, the optimization makes use of
+ * the observation that (V == X || V == Y) is equivalent to (V | (X ^
+ * Y)) == (X | Y) when (X ^ Y) has only one bit set.
+ *
+ * If more than one bit differ, one conditional branch instruction can
+ * still be eliminated by using the CCMP instruction.
*
* Return true if the optimization was possible.
*/
-bool BeamModuleAssembler::emit_optimized_three_way_select(
- arm::Gp reg,
- Label fail,
- const Span<ArgVal> &args) {
- if (args.size() != 4 || (args[2] != args[3])) {
- return false;
- }
-
- uint64_t x = args[0].isImmed() ? args[0].as<ArgImmed>().get()
- : args[0].as<ArgWord>().get();
- uint64_t y = args[1].isImmed() ? args[1].as<ArgImmed>().get()
- : args[1].as<ArgWord>().get();
- uint64_t combined = x | y;
+void BeamModuleAssembler::emit_optimized_two_way_select(arm::Gp reg,
+ const ArgVal &value1,
+ const ArgVal &value2,
+ const ArgVal &label) {
+ uint64_t x = value1.isImmed() ? value1.as<ArgImmed>().get()
+ : value1.as<ArgWord>().get();
+ uint64_t y = value2.isImmed() ? value2.as<ArgImmed>().get()
+ : value2.as<ArgWord>().get();
uint64_t diff = x ^ y;
- ArgWord val(combined);
-
- if ((diff & (diff - 1)) != 0) {
- return false;
- }
-
- comment("(Src == 0x%x || Src == 0x%x) <=> (Src | 0x%x) == 0x%x",
- x,
- y,
- diff,
- combined);
+ /* Be sure to use a register not used by any caller. */
+ arm::Gp tmp = TMP6;
- a.orr(TMP1, reg, imm(diff));
- cmp_arg(TMP1, val);
- a.b_eq(resolve_beam_label(args[2], disp1MB));
-
- /* An invalid label means fallthrough to the next instruction. */
- if (fail.isValid()) {
- a.b(resolve_label(fail, disp128MB));
- mark_unreachable_check_pending_stubs();
+ if (x + 1 == y) {
+ comment("(Src == %ld || Src == %ld) <=> (Src - %ld) < 2", x, y, x);
+ if (x == 0) {
+ a.cmp(reg, imm(2));
+ } else {
+ sub(tmp, reg, x);
+ a.cmp(tmp, imm(2));
+ }
+ a.b_lo(resolve_beam_label(label, disp1MB));
+ } else if ((diff & (diff - 1)) == 0) {
+ uint64_t combined = x | y;
+ ArgWord val(combined);
+
+ comment("(Src == 0x%x || Src == 0x%x) <=> (Src | 0x%x) == 0x%x",
+ x,
+ y,
+ diff,
+ combined);
+
+ a.orr(tmp, reg, imm(diff));
+ cmp_arg(tmp, val);
+ a.b_eq(resolve_beam_label(label, disp1MB));
+ } else {
+ if (x < 32) {
+ cmp(reg, y);
+ a.ccmp(reg, imm(x), imm(NZCV::kEqual), imm(arm::CondCode::kNE));
+ } else if (-y < 32) {
+ cmp(reg, x);
+ a.ccmn(reg, imm(-y), imm(NZCV::kEqual), imm(arm::CondCode::kNE));
+ } else {
+ cmp(reg, x);
+ a.mov(tmp, y);
+ a.ccmp(reg, tmp, imm(NZCV::kEqual), imm(arm::CondCode::kNE));
+ }
+ a.b_eq(resolve_beam_label(label, disp1MB));
}
-
- return true;
}
diff --git a/erts/emulator/beam/jit/x86/beam_asm.hpp b/erts/emulator/beam/jit/x86/beam_asm.hpp
index 907209bb30..648a4bbc73 100644
--- a/erts/emulator/beam/jit/x86/beam_asm.hpp
+++ b/erts/emulator/beam/jit/x86/beam_asm.hpp
@@ -1364,8 +1364,10 @@ protected:
const ArgVal &Fail,
const Span<ArgVal> &args);
- bool emit_optimized_three_way_select(const ArgVal &Fail,
- const Span<ArgVal> &args);
+ bool emit_optimized_two_way_select(bool destructive,
+ const ArgVal &value1,
+ const ArgVal &value2,
+ const ArgVal &label);
#ifdef DEBUG
void emit_tuple_assertion(const ArgSource &Src, x86::Gp tuple_reg);
diff --git a/erts/emulator/beam/jit/x86/instr_select.cpp b/erts/emulator/beam/jit/x86/instr_select.cpp
index 529e1f4f01..fe1736fc7b 100644
--- a/erts/emulator/beam/jit/x86/instr_select.cpp
+++ b/erts/emulator/beam/jit/x86/instr_select.cpp
@@ -31,6 +31,16 @@ void BeamModuleAssembler::emit_linear_search(x86::Gp comparand,
const ArgImmed &value = args[i];
const ArgLabel &label = args[i + count];
+ if (i < count - 1 && label == args[i + count + 1]) {
+ if (emit_optimized_two_way_select(i == count - 2,
+ value,
+ args[i + 1],
+ label)) {
+ i++;
+ continue;
+ }
+ }
+
cmp_arg(comparand, value, ARG1);
a.je(resolve_beam_label(label));
}
@@ -86,10 +96,6 @@ void BeamModuleAssembler::emit_i_select_val_lins(const ArgSource &Src,
mov_arg(ARG2, Src);
- if (emit_optimized_three_way_select(Fail, args)) {
- return;
- }
-
emit_linear_search(ARG2, Fail, args);
}
@@ -157,9 +163,7 @@ void BeamModuleAssembler::emit_binsearch_nodes(size_t Left,
args.begin() + Left + count,
args.begin() + count + Left + remaining);
- if (!emit_optimized_three_way_select(Fail, shrunk)) {
- emit_linear_search(ARG2, Fail, shrunk);
- }
+ emit_linear_search(ARG2, Fail, shrunk);
return;
}
@@ -249,18 +253,17 @@ void BeamModuleAssembler::emit_i_jump_on_val(const ArgSource &Src,
* one bit set.
*
* ARG2 contains the value.
- * Return true if the optimization was possible, in
- * which case ARG1 should be considered trashed.
+ *
+ * Return true if the optimization was possible, in which case ARG1
+ * and RET should be considered trashed. If the destructive argument
+ * is true, ARG2 will also be trashed.
*/
-bool BeamModuleAssembler::emit_optimized_three_way_select(
- const ArgVal &Fail,
- const Span<ArgVal> &args) {
- if (args.size() != 4 || (args[2] != args[3])) {
- return false;
- }
-
- uint64_t x = args[0].as<ArgImmed>().get();
- uint64_t y = args[1].as<ArgImmed>().get();
+bool BeamModuleAssembler::emit_optimized_two_way_select(bool destructive,
+ const ArgVal &value1,
+ const ArgVal &value2,
+ const ArgVal &label) {
+ uint64_t x = value1.as<ArgImmed>().get();
+ uint64_t y = value2.as<ArgImmed>().get();
uint64_t combined = x | y;
uint64_t diff = x ^ y;
ArgVal val(ArgVal::Immediate, combined);
@@ -273,22 +276,15 @@ bool BeamModuleAssembler::emit_optimized_three_way_select(
diff,
combined);
- if (Support::isInt32((Sint)diff)) {
+ if (destructive && Support::isInt32((Sint)diff)) {
a.or_(ARG2, imm(diff));
+ cmp_arg(ARG2, val, RET);
} else {
- a.mov(ARG1, imm(diff));
- a.or_(ARG2, ARG1);
- }
-
- cmp_arg(ARG2, val, ARG1);
- a.je(resolve_beam_label(args[2]));
-
- if (Fail.isLabel()) {
- a.jmp(resolve_beam_label(Fail));
- } else {
- /* NIL means fallthrough to the next instruction. */
- ASSERT(Fail.isNil());
+ mov_imm(RET, diff);
+ a.or_(RET, ARG2);
+ cmp_arg(RET, val, ARG1);
}
+ a.je(resolve_beam_label(label));
return true;
}
diff --git a/erts/emulator/test/beam_SUITE.erl b/erts/emulator/test/beam_SUITE.erl
index c38aa9a1ab..eff8c6b426 100644
--- a/erts/emulator/test/beam_SUITE.erl
+++ b/erts/emulator/test/beam_SUITE.erl
@@ -24,7 +24,7 @@
init_per_group/2,end_per_group/2,
packed_registers/1, apply_last/1, apply_last_bif/1,
heap_sizes/1, big_lists/1, fconv/1,
- select_val/1,
+ select_val/1, select_tuple_arity/1,
swap_temp_apply/1, beam_init_yregs/1]).
-export([applied/2,swap_temp_applied/1]).
@@ -36,9 +36,9 @@ suite() -> [{ct_hooks,[ts_install_cth]}].
all() ->
[packed_registers, apply_last, apply_last_bif,
- heap_sizes, big_lists, select_val,
- swap_temp_apply,
- beam_init_yregs].
+ heap_sizes, big_lists,
+ select_val, select_tuple_arity,
+ swap_temp_apply, beam_init_yregs].
groups() ->
[].
@@ -397,6 +397,45 @@ make_func(Name, List) ->
Cs = [?Q(["(_@I@) -> _@Body@"]) || {I,Body} <- List],
erl_syntax:function(erl_syntax:atom(Name), Cs).
+select_tuple_arity(_Config) ->
+ Mod = ?FUNCTION_NAME,
+
+ {Vs,Cs} = make_tuple_tests(300, [], []),
+ Name = erl_syntax:atom(match_tuple),
+ F = erl_syntax:function(Name, Cs),
+ Code = ?Q(["-module('@Mod@').\n"
+ "-export([match_tuple/1]).\n"]) ++ [F],
+
+ merl:compile_and_load(Code, []),
+
+ %% %% Uncomment the following line to print the generated code.
+ %% merl:print(Code),
+
+ verify_tuple_match(Vs, Mod),
+
+ %% Clean up.
+ true = code:delete(Mod),
+ false = code:purge(Mod),
+
+ ok.
+
+make_tuple_tests(0, Clauses, Acc) ->
+ {Acc,Clauses};
+make_tuple_tests(Size, Clauses, Acc) ->
+ V = erlang:phash2(Size),
+ Es = lists:duplicate(Size, erl_syntax:underscore()),
+ Tuple = erl_syntax:tuple(Es),
+ Value = erl_syntax:integer(V),
+ Clause = erl_syntax:clause([Tuple], [], [Value]),
+ make_tuple_tests(Size-1, [Clause|Clauses], [{Size,V}|Acc]).
+
+verify_tuple_match([{Size,Result}|T], Mod) ->
+ Tuple = erlang:make_tuple(Size, a),
+ Result = Mod:match_tuple(Tuple),
+ verify_tuple_match(T, Mod);
+verify_tuple_match([], _) ->
+ ok.
+
swap_temp_apply(_Config) ->
{swap_temp_applied,42} = do_swap_temp_apply(41),
not_an_integer = do_swap_temp_apply(not_an_integer),
--
2.35.3