File 1104-Optimize-bsr-for-small-operands.patch of Package erlang

From 55813dd0a871bf1aa2619e12e98fb8dc4315c97a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Wed, 2 Aug 2023 07:05:02 +0200
Subject: [PATCH 4/7] Optimize bsr for small operands

Inline the code for right-shifting a small operand by any number of
steps. We used to call a helper routine when the shift count exceeded
the number of bits in a small.
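
For illustration (a sketch, not part of the patch; the exact small
size depends on the word size), all of the following shifts are now
handled by the inlined code, including those whose count exceeds the
number of bits in a small:

    %% On a 64-bit emulator a small has 60 bits including the sign,
    %% so shift counts of 60 or more used to reach the helper routine.
    0 = 123456789 bsr 100,      %% positive small, count > SMALL_BITS
    -1 = -123456789 bsr 100,    %% arithmetic shift keeps the sign
    1 = (1 bsl 58) bsr 58       %% in-range count, was already inlined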
---
 erts/emulator/beam/jit/arm/instr_arith.cpp | 42 ++++++++--
 erts/emulator/beam/jit/x86/instr_arith.cpp | 29 ++++++-
 erts/emulator/test/small_SUITE.erl         | 91 +++++++++++++++++++++-
 3 files changed, 149 insertions(+), 13 deletions(-)

diff --git a/erts/emulator/beam/jit/arm/instr_arith.cpp b/erts/emulator/beam/jit/arm/instr_arith.cpp
index a14ad5cbaf..dbdb05b86a 100644
--- a/erts/emulator/beam/jit/arm/instr_arith.cpp
+++ b/erts/emulator/beam/jit/arm/instr_arith.cpp
@@ -1485,34 +1485,62 @@ void BeamModuleAssembler::emit_i_bsr(const ArgLabel &Fail,
     if (RHS.isSmall()) {
         Sint shift = RHS.as<ArgSmall>().getSigned();
 
-        if (shift >= 0 && shift < SMALL_BITS - 1) {
+        if (shift >= 0) {
+            arm::Gp small_tag = TMP1;
             if (always_small(LHS)) {
                 comment("skipped test for small left operand because it is "
                         "always small");
                 need_generic = false;
+                mov_imm(small_tag, _TAG_IMMED1_SMALL);
             } else if (always_one_of<BeamTypeId::Number>(LHS)) {
                 comment("simplified test for small operand since it is a "
                         "number");
                 emit_is_not_boxed(generic, lhs.reg);
+                mov_imm(small_tag, _TAG_IMMED1_SMALL);
             } else {
-                a.and_(TMP1, lhs.reg, imm(_TAG_IMMED1_MASK));
-                a.cmp(TMP1, imm(_TAG_IMMED1_SMALL));
+                a.and_(small_tag, lhs.reg, imm(_TAG_IMMED1_MASK));
+                a.cmp(small_tag, imm(_TAG_IMMED1_SMALL));
                 a.b_ne(generic);
             }
 
             /* We don't need to clear the mask after shifting because
              * _TAG_IMMED1_SMALL will set all the bits anyway. */
             ERTS_CT_ASSERT(_TAG_IMMED1_MASK == _TAG_IMMED1_SMALL);
-            a.asr(TMP1, lhs.reg, imm(shift));
-            a.orr(dst.reg, TMP1, imm(_TAG_IMMED1_SMALL));
+            shift = std::min<Sint>(shift, 63);
+            a.orr(dst.reg, small_tag, lhs.reg, arm::asr(shift));
 
             if (need_generic) {
                 a.b(next);
             }
         } else {
-            /* Constant shift is negative or too big to fit the `asr`
-             * instruction; fall back to the generic path. */
+            /* Constant shift is negative; fall back to the generic
+             * path. */
         }
+    } else {
+        auto rhs = load_source(RHS, ARG3);
+
+        /* Ensure that both operands are small and that the shift
+         * count is positive. */
+        ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+        a.ands(TMP1, rhs.reg, imm((1ull << 63) | _TAG_IMMED1_MASK));
+        a.and_(TMP1, lhs.reg, TMP1);
+        a.ccmp(TMP1,
+               imm(_TAG_IMMED1_SMALL),
+               imm(NZCV::kNone),
+               arm::CondCode::kPL);
+        a.b_ne(generic);
+
+        /* Calculate shift count. */
+        a.asr(TMP1, rhs.reg, imm(_TAG_IMMED1_SIZE));
+        mov_imm(TMP2, 63);
+        a.cmp(TMP1, TMP2);
+        a.csel(TMP1, TMP1, TMP2, imm(arm::CondCode::kLE));
+
+        /* Shift right. */
+        ERTS_CT_ASSERT(_TAG_IMMED1_MASK == _TAG_IMMED1_SMALL);
+        a.asr(dst.reg, lhs.reg, TMP1);
+        a.orr(dst.reg, dst.reg, imm(_TAG_IMMED1_SMALL));
+        a.b(next);
     }
 
     a.bind(generic);
diff --git a/erts/emulator/beam/jit/x86/instr_arith.cpp b/erts/emulator/beam/jit/x86/instr_arith.cpp
index 56c4eb06d3..35976cc048 100644
--- a/erts/emulator/beam/jit/x86/instr_arith.cpp
+++ b/erts/emulator/beam/jit/x86/instr_arith.cpp
@@ -1512,13 +1512,14 @@ void BeamModuleAssembler::emit_i_bsr(const ArgSource &LHS,
                                      const ArgRegister &Dst) {
     Label generic = a.newLabel(), next = a.newLabel();
     bool need_generic = true;
+    bool need_register_load = true;
 
     mov_arg(ARG2, LHS);
 
     if (RHS.isSmall()) {
         Sint shift = RHS.as<ArgSmall>().getSigned();
 
-        if (shift >= 0 && shift < SMALL_BITS - 1) {
+        if (shift >= 0) {
             if (always_small(LHS)) {
                 comment("skipped test for small left operand because it is "
                         "always small");
@@ -1532,6 +1533,7 @@ void BeamModuleAssembler::emit_i_bsr(const ArgSource &LHS,
             /* We don't need to clear the mask after shifting because
              * _TAG_IMMED1_SMALL will set all the bits anyway. */
             ERTS_CT_ASSERT(_TAG_IMMED1_MASK == _TAG_IMMED1_SMALL);
+            shift = std::min<Sint>(shift, 63);
             a.sar(RET, imm(shift));
             a.or_(RET, imm(_TAG_IMMED1_SMALL));
 
@@ -1539,14 +1541,33 @@ void BeamModuleAssembler::emit_i_bsr(const ArgSource &LHS,
                 a.short_().jmp(next);
             }
         } else {
-            /* Constant shift is negative or too big to fit the `sar`
-             * instruction, fall back to the generic path. */
+            /* Constant shift is negative; fall back to the generic
+             * path. */
         }
+    } else if (hasCpuFeature(CpuFeatures::X86::kBMI2)) {
+        mov_arg(RET, RHS);
+        need_register_load = false;
+
+        emit_are_both_small(generic, LHS, ARG2, RHS, RET);
+
+        a.mov(ARG1, RET);
+        a.sar(ARG1, imm(_TAG_IMMED1_SIZE));
+        a.js(generic);
+
+        mov_imm(RET, 63);
+        a.cmp(ARG1, RET);
+        a.cmova(ARG1, RET);
+
+        a.sarx(RET, ARG2, ARG1);
+        a.or_(RET, imm(_TAG_IMMED1_SMALL));
+        a.short_().jmp(next);
     }
 
     a.bind(generic);
     if (need_generic) {
-        mov_arg(RET, RHS);
+        if (need_register_load) {
+            mov_arg(RET, RHS);
+        }
 
         if (Fail.get() != 0) {
             safe_fragment_call(ga->get_i_bsr_guard_shared());
diff --git a/erts/emulator/test/small_SUITE.erl b/erts/emulator/test/small_SUITE.erl
index bb4d69e355..aa732c1e24 100644
--- a/erts/emulator/test/small_SUITE.erl
+++ b/erts/emulator/test/small_SUITE.erl
@@ -25,7 +25,7 @@
 -export([edge_cases/1,
          addition/1, subtraction/1, negation/1,
          multiplication/1, mul_add/1, division/1,
-         test_bitwise/1, test_bsl/1,
+         test_bitwise/1, test_bsl/1, test_bsr/1,
          element/1,
          range_optimization/1]).
 -export([mul_add/0, division/0]).
@@ -43,7 +43,7 @@ groups() ->
     [{p, [parallel],
       [edge_cases,
        addition, subtraction, negation, multiplication, mul_add, division,
-       test_bitwise, test_bsl,
+       test_bitwise, test_bsl, test_bsr,
        element,
        range_optimization]}].
 
@@ -1219,6 +1219,93 @@ test_bsl([{Name,{N,S}}|T], Mod) ->
 test_bsl([], _) ->
     ok.
 
+test_bsr(_Config) ->
+    _ = rand:uniform(),				%Seed generator
+    io:format("Seed: ~p", [rand:export_seed()]),
+    Mod = list_to_atom(lists:concat([?MODULE,"_",?FUNCTION_NAME])),
+    Pairs = bsr_gen_pairs(),
+    Fs0 = gen_func_names(Pairs, 0),
+    Fs = [gen_bsr_function(F) || F <- Fs0],
+    Tree = ?Q(["-module('@Mod@').",
+               "-compile([export_all,nowarn_export_all]).",
+               "id(I) -> I."]) ++ Fs,
+    %% merl:print(Tree),
+    {ok,_Bin} = merl:compile_and_load(Tree, []),
+    test_bsr(Fs0, Mod),
+    unload(Mod),
+    ok.
+
+bsr_gen_pairs() ->
+    {_MinSmall, MaxSmall} = determine_small_limits(0),
+    SmallBits = num_bits(MaxSmall),
+
+    {Powers,Shifts} =
+        if
+            SmallBits < 32 ->
+                {lists:seq(15, SmallBits+2),
+                 lists:seq(0, 7) ++ lists:seq(24, 36)};
+            true ->
+                {lists:seq(30, SmallBits+2),
+                 lists:seq(0, 7) ++ lists:seq(56, 72)}
+        end,
+
+    [{N,S} ||
+        P <- Powers,
+        N <- [rand:uniform(1 bsl P), (1 bsl P)-1],
+        S <- Shifts].
+
+gen_bsr_function({Name,{N,S}}) ->
+    Mask = (1 bsl num_bits(N)) - 1,
+    ?Q("'@Name@'(N0, fixed, More) ->
+           Res = N0 bsr _@S@,
+           if
+               More ->
+                   N = N0 band _@Mask@,
+                   Res = N0 bsr _@S@,
+                   Res = N bsr _@S@;
+               true ->
+                   Res
+           end;
+        '@Name@'(N0, S, More) ->
+           Res = id(N0 bsr S),
+           if
+               More ->
+                   N = N0 band _@Mask@,
+                   Res = id(N0 bsr S),
+                   Res = id(N bsr S),
+                   if
+                      S >= 0 ->
+                          Res = id(N bsr S);
+                      true ->
+                           Res
+                   end;
+               true ->
+                   Res
+           end. ").
+
+test_bsr([{Name,{N,S}}|T], Mod) ->
+    try
+        Res = N bsr S,
+        Res = Mod:Name(N, fixed, true),
+        Res = Mod:Name(N, S, true),
+
+        NegRes = -N bsr S,
+        NegRes = Mod:Name(-N, fixed, false),
+
+        NegRes = -N bsr S,
+        NegRes = Mod:Name(-N, S, false),
+
+        BslRes = N bsr -S,
+        BslRes = Mod:Name(N, -S, false)
+    catch
+        C:R:Stk ->
+            io:format("~p failed. numbers: ~p ~p\n", [Name,N,S]),
+            erlang:raise(C, R, Stk)
+    end,
+    test_bsr(T, Mod);
+test_bsr([], _) ->
+    ok.
+
 element(_Config) ->
     %% Test element_1: Can't fail for integer arguments.
     zero = element_1(0),
-- 
2.35.3
