From 55813dd0a871bf1aa2619e12e98fb8dc4315c97a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Wed, 2 Aug 2023 07:05:02 +0200
Subject: [PATCH 4/7] Optimize bsr for small operands
Inline the code for right-shifting a small operand by any number of steps.
We used to call a helper routine when the shift count exceeded the
number of bits in a small.
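
As a sketch of why a single hardware shift now covers every
non-negative shift count (assuming the usual 64-bit emulator word
size): shifting a small right by 63 or more bit positions always
yields 0 for a non-negative operand and -1 for a negative one, which
is also what a 64-bit arithmetic right shift by exactly 63 produces
for the tagged word. For example, in the Erlang shell:

    1> 12345 bsr 63.
    0
    2> 12345 bsr 1000.
    0
    3> -12345 bsr 63.
    -1
    4> -12345 bsr 1000.
    -1

That is why the constant-shift path can clamp the immediate with
std::min<Sint>(shift, 63), and the variable-shift path can clamp the
untagged count with csel (ARM) or cmova (x86) instead of falling back
to the generic helper.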
---
erts/emulator/beam/jit/arm/instr_arith.cpp | 42 ++++++++--
erts/emulator/beam/jit/x86/instr_arith.cpp | 29 ++++++-
erts/emulator/test/small_SUITE.erl | 91 +++++++++++++++++++++-
3 files changed, 149 insertions(+), 13 deletions(-)
diff --git a/erts/emulator/beam/jit/arm/instr_arith.cpp b/erts/emulator/beam/jit/arm/instr_arith.cpp
index a14ad5cbaf..dbdb05b86a 100644
--- a/erts/emulator/beam/jit/arm/instr_arith.cpp
+++ b/erts/emulator/beam/jit/arm/instr_arith.cpp
@@ -1485,34 +1485,62 @@ void BeamModuleAssembler::emit_i_bsr(const ArgLabel &Fail,
if (RHS.isSmall()) {
Sint shift = RHS.as<ArgSmall>().getSigned();
- if (shift >= 0 && shift < SMALL_BITS - 1) {
+ if (shift >= 0) {
+ arm::Gp small_tag = TMP1;
if (always_small(LHS)) {
comment("skipped test for small left operand because it is "
"always small");
need_generic = false;
+ mov_imm(small_tag, _TAG_IMMED1_SMALL);
} else if (always_one_of<BeamTypeId::Number>(LHS)) {
comment("simplified test for small operand since it is a "
"number");
emit_is_not_boxed(generic, lhs.reg);
+ mov_imm(small_tag, _TAG_IMMED1_SMALL);
} else {
- a.and_(TMP1, lhs.reg, imm(_TAG_IMMED1_MASK));
- a.cmp(TMP1, imm(_TAG_IMMED1_SMALL));
+ a.and_(small_tag, lhs.reg, imm(_TAG_IMMED1_MASK));
+ a.cmp(small_tag, imm(_TAG_IMMED1_SMALL));
a.b_ne(generic);
}
/* We don't need to clear the mask after shifting because
* _TAG_IMMED1_SMALL will set all the bits anyway. */
ERTS_CT_ASSERT(_TAG_IMMED1_MASK == _TAG_IMMED1_SMALL);
- a.asr(TMP1, lhs.reg, imm(shift));
- a.orr(dst.reg, TMP1, imm(_TAG_IMMED1_SMALL));
+ shift = std::min<Sint>(shift, 63);
+ a.orr(dst.reg, small_tag, lhs.reg, arm::asr(shift));
if (need_generic) {
a.b(next);
}
} else {
- /* Constant shift is negative or too big to fit the `asr`
- * instruction; fall back to the generic path. */
+ /* Constant shift is negative; fall back to the generic
+ * path. */
}
+ } else {
+ auto rhs = load_source(RHS, ARG3);
+
+ /* Ensure that both operands are small and that the shift
+ * count is positive. */
+ ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+ a.ands(TMP1, rhs.reg, imm((1ull << 63) | _TAG_IMMED1_MASK));
+ a.and_(TMP1, lhs.reg, TMP1);
+ a.ccmp(TMP1,
+ imm(_TAG_IMMED1_SMALL),
+ imm(NZCV::kNone),
+ arm::CondCode::kPL);
+ a.b_ne(generic);
+
+ /* Calculate shift count. */
+ a.asr(TMP1, rhs.reg, imm(_TAG_IMMED1_SIZE));
+ mov_imm(TMP2, 63);
+ a.cmp(TMP1, TMP2);
+ a.csel(TMP1, TMP1, TMP2, imm(arm::CondCode::kLE));
+
+ /* Shift right. */
+ ERTS_CT_ASSERT(_TAG_IMMED1_MASK == _TAG_IMMED1_SMALL);
+ a.asr(dst.reg, lhs.reg, TMP1);
+ a.orr(dst.reg, dst.reg, imm(_TAG_IMMED1_SMALL));
+ a.b(next);
}
a.bind(generic);
diff --git a/erts/emulator/beam/jit/x86/instr_arith.cpp b/erts/emulator/beam/jit/x86/instr_arith.cpp
index 56c4eb06d3..35976cc048 100644
--- a/erts/emulator/beam/jit/x86/instr_arith.cpp
+++ b/erts/emulator/beam/jit/x86/instr_arith.cpp
@@ -1512,13 +1512,14 @@ void BeamModuleAssembler::emit_i_bsr(const ArgSource &LHS,
const ArgRegister &Dst) {
Label generic = a.newLabel(), next = a.newLabel();
bool need_generic = true;
+ bool need_register_load = true;
mov_arg(ARG2, LHS);
if (RHS.isSmall()) {
Sint shift = RHS.as<ArgSmall>().getSigned();
- if (shift >= 0 && shift < SMALL_BITS - 1) {
+ if (shift >= 0) {
if (always_small(LHS)) {
comment("skipped test for small left operand because it is "
"always small");
@@ -1532,6 +1533,7 @@ void BeamModuleAssembler::emit_i_bsr(const ArgSource &LHS,
/* We don't need to clear the mask after shifting because
* _TAG_IMMED1_SMALL will set all the bits anyway. */
ERTS_CT_ASSERT(_TAG_IMMED1_MASK == _TAG_IMMED1_SMALL);
+ shift = std::min<Sint>(shift, 63);
a.sar(RET, imm(shift));
a.or_(RET, imm(_TAG_IMMED1_SMALL));
@@ -1539,14 +1541,33 @@ void BeamModuleAssembler::emit_i_bsr(const ArgSource &LHS,
a.short_().jmp(next);
}
} else {
- /* Constant shift is negative or too big to fit the `sar`
- * instruction, fall back to the generic path. */
+ /* Constant shift is negative; fall back to the generic
+ * path. */
}
+ } else if (hasCpuFeature(CpuFeatures::X86::kBMI2)) {
+ mov_arg(RET, RHS);
+ need_register_load = false;
+
+ emit_are_both_small(generic, LHS, ARG2, RHS, RET);
+
+ a.mov(ARG1, RET);
+ a.sar(ARG1, imm(_TAG_IMMED1_SIZE));
+ a.js(generic);
+
+ mov_imm(RET, 63);
+ a.cmp(ARG1, RET);
+ a.cmova(ARG1, RET);
+
+ a.sarx(RET, ARG2, ARG1);
+ a.or_(RET, imm(_TAG_IMMED1_SMALL));
+ a.short_().jmp(next);
}
a.bind(generic);
if (need_generic) {
- mov_arg(RET, RHS);
+ if (need_register_load) {
+ mov_arg(RET, RHS);
+ }
if (Fail.get() != 0) {
safe_fragment_call(ga->get_i_bsr_guard_shared());
diff --git a/erts/emulator/test/small_SUITE.erl b/erts/emulator/test/small_SUITE.erl
index bb4d69e355..aa732c1e24 100644
--- a/erts/emulator/test/small_SUITE.erl
+++ b/erts/emulator/test/small_SUITE.erl
@@ -25,7 +25,7 @@
-export([edge_cases/1,
addition/1, subtraction/1, negation/1,
multiplication/1, mul_add/1, division/1,
- test_bitwise/1, test_bsl/1,
+ test_bitwise/1, test_bsl/1, test_bsr/1,
element/1,
range_optimization/1]).
-export([mul_add/0, division/0]).
@@ -43,7 +43,7 @@ groups() ->
[{p, [parallel],
[edge_cases,
addition, subtraction, negation, multiplication, mul_add, division,
- test_bitwise, test_bsl,
+ test_bitwise, test_bsl, test_bsr,
element,
range_optimization]}].
@@ -1219,6 +1219,93 @@ test_bsl([{Name,{N,S}}|T], Mod) ->
test_bsl([], _) ->
ok.
+test_bsr(_Config) ->
+ _ = rand:uniform(), %Seed generator
+ io:format("Seed: ~p", [rand:export_seed()]),
+ Mod = list_to_atom(lists:concat([?MODULE,"_",?FUNCTION_NAME])),
+ Pairs = bsr_gen_pairs(),
+ Fs0 = gen_func_names(Pairs, 0),
+ Fs = [gen_bsr_function(F) || F <- Fs0],
+ Tree = ?Q(["-module('@Mod@').",
+ "-compile([export_all,nowarn_export_all]).",
+ "id(I) -> I."]) ++ Fs,
+ %% merl:print(Tree),
+ {ok,_Bin} = merl:compile_and_load(Tree, []),
+ test_bsr(Fs0, Mod),
+ unload(Mod),
+ ok.
+
+bsr_gen_pairs() ->
+ {_MinSmall, MaxSmall} = determine_small_limits(0),
+ SmallBits = num_bits(MaxSmall),
+
+ {Powers,Shifts} =
+ if
+ SmallBits < 32 ->
+ {lists:seq(15, SmallBits+2),
+ lists:seq(0, 7) ++ lists:seq(24, 36)};
+ true ->
+ {lists:seq(30, SmallBits+2),
+ lists:seq(0, 7) ++ lists:seq(56, 72)}
+ end,
+
+ [{N,S} ||
+ P <- Powers,
+ N <- [rand:uniform(1 bsl P), (1 bsl P)-1],
+ S <- Shifts].
+
+gen_bsr_function({Name,{N,S}}) ->
+ Mask = (1 bsl num_bits(N)) - 1,
+ ?Q("'@Name@'(N0, fixed, More) ->
+ Res = N0 bsr _@S@,
+ if
+ More ->
+ N = N0 band _@Mask@,
+ Res = N0 bsr _@S@,
+ Res = N bsr _@S@;
+ true ->
+ Res
+ end;
+ '@Name@'(N0, S, More) ->
+ Res = id(N0 bsr S),
+ if
+ More ->
+ N = N0 band _@Mask@,
+ Res = id(N0 bsr S),
+ Res = id(N bsr S),
+ if
+ S >= 0 ->
+ Res = id(N bsr S);
+ true ->
+ Res
+ end;
+ true ->
+ Res
+ end. ").
+
+test_bsr([{Name,{N,S}}|T], Mod) ->
+ try
+ Res = N bsr S,
+ Res = Mod:Name(N, fixed, true),
+ Res = Mod:Name(N, S, true),
+
+ NegRes = -N bsr S,
+ NegRes = Mod:Name(-N, fixed, false),
+
+ NegRes = -N bsr S,
+ NegRes = Mod:Name(-N, S, false),
+
+ BslRes = N bsr -S,
+ BslRes = Mod:Name(N, -S, false)
+ catch
+ C:R:Stk ->
+ io:format("~p failed. numbers: ~p ~p\n", [Name,N,S]),
+ erlang:raise(C, R, Stk)
+ end,
+ test_bsr(T, Mod);
+test_bsr([], _) ->
+ ok.
+
element(_Config) ->
%% Test element_1: Can't fail for integer arguments.
zero = element_1(0),
--
2.35.3