File 1103-Optimize-division-by-powers-of-two.patch of Package erlang

From 16dacd03bd0334f09d52383513f7bd9e45f1d6fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Sat, 29 Jul 2023 14:17:18 +0200
Subject: [PATCH 3/7] Optimize division by powers of two

We used to replace division by a power of two with a right shift
only when the dividend was known to be a positive integer. Extend
the implementation to do right shift when the range of the dividend
is unknown.
---
 erts/emulator/beam/jit/arm/beam_asm.hpp    |   9 ++
 erts/emulator/beam/jit/arm/instr_arith.cpp | 159 +++++++++++++++------
 erts/emulator/beam/jit/x86/instr_arith.cpp |  55 ++++++-
 erts/emulator/test/small_SUITE.erl         |   5 +-
 4 files changed, 181 insertions(+), 47 deletions(-)

diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp
index ab162f951a..38f1c6875f 100644
--- a/erts/emulator/beam/jit/arm/beam_asm.hpp
+++ b/erts/emulator/beam/jit/arm/beam_asm.hpp
@@ -1128,6 +1128,15 @@ protected:
                              const a64::Gp rhs_reg,
                              const Label next);
 
+    void emit_div_rem_literal(Sint divisor,
+                              const ArgSource &Dividend,
+                              arm::Gp dividend,
+                              arm::Gp quotient,
+                              arm::Gp remainder,
+                              const Label &generic,
+                              bool need_div,
+                              bool need_rem);
+
     void emit_div_rem(const ArgLabel &Fail,
                       const ArgSource &LHS,
                       const ArgSource &RHS,
diff --git a/erts/emulator/beam/jit/arm/instr_arith.cpp b/erts/emulator/beam/jit/arm/instr_arith.cpp
index 8ca898b675..a14ad5cbaf 100644
--- a/erts/emulator/beam/jit/arm/instr_arith.cpp
+++ b/erts/emulator/beam/jit/arm/instr_arith.cpp
@@ -853,6 +853,97 @@ void BeamGlobalAssembler::emit_int_div_rem_body_shared() {
     }
 }
 
+void BeamModuleAssembler::emit_div_rem_literal(Sint divisor,
+                                               const ArgSource &Dividend,
+                                               arm::Gp dividend,
+                                               arm::Gp quotient,
+                                               arm::Gp remainder,
+                                               const Label &generic,
+                                               bool need_div,
+                                               bool need_rem) {
+    arm::Gp small_tag = TMP6;
+    bool small_dividend = !generic.isValid();
+
+    ASSERT(divisor != (Sint)0);
+
+    if (!small_dividend) {
+        a.and_(small_tag, dividend, imm(_TAG_IMMED1_MASK));
+        a.cmp(small_tag, imm(_TAG_IMMED1_SMALL));
+        a.b_ne(generic);
+    }
+
+    if (Support::isPowerOf2(divisor)) {
+        arm::Gp original_dividend = dividend;
+        int shift = Support::ctz<Eterm>(divisor);
+
+        if (need_div && small_dividend) {
+            mov_imm(small_tag, _TAG_IMMED1_SMALL);
+        }
+
+        ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+        if (std::get<0>(getClampedRange(Dividend)) >= 0) {
+            /* Positive dividend. */
+            if (need_div) {
+                comment("optimized div by replacing with right shift");
+                if (need_rem && quotient == dividend) {
+                    original_dividend = TMP5;
+                    a.mov(original_dividend, dividend);
+                }
+                a.orr(quotient, small_tag, dividend, arm::lsr(shift));
+            }
+            if (need_rem) {
+                auto mask = Support::lsbMask<Uint>(shift + _TAG_IMMED1_SIZE);
+                comment("optimized rem by replacing with masking");
+                a.and_(remainder, original_dividend, imm(mask));
+            }
+        } else {
+            /* Negative dividend. */
+            if (need_div) {
+                comment("optimized div by replacing with right shift");
+            }
+            if (divisor == 2) {
+                ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+                a.add(TMP3, dividend, dividend, arm::lsr(63));
+            } else {
+                add(TMP1, dividend, (divisor - 1) << _TAG_IMMED1_SIZE);
+                a.cmp(dividend, imm(0));
+                a.csel(TMP3, TMP1, dividend, imm(arm::CondCode::kLT));
+            }
+            if (need_div) {
+                if (need_rem && quotient == dividend) {
+                    original_dividend = TMP5;
+                    a.mov(original_dividend, dividend);
+                }
+                a.orr(quotient, small_tag, TMP3, arm::asr(shift));
+            }
+            if (need_rem) {
+                Uint mask = (Uint)-1 << (shift + _TAG_IMMED1_SIZE);
+                comment("optimized rem by replacing with subtraction");
+                a.and_(TMP1, TMP3, imm(mask));
+                a.sub(remainder, original_dividend, TMP1);
+            }
+        }
+    } else {
+        a.asr(TMP1, dividend, imm(_TAG_IMMED1_SIZE));
+        mov_imm(TMP2, divisor);
+        a.sdiv(quotient, TMP1, TMP2);
+        if (need_rem) {
+            a.msub(remainder, quotient, TMP2, TMP1);
+        }
+
+        if (small_dividend) {
+            mov_imm(small_tag, _TAG_IMMED1_SMALL);
+        }
+        const arm::Shift tagShift = arm::lsl(_TAG_IMMED1_SIZE);
+        if (need_div) {
+            a.orr(quotient, small_tag, quotient, tagShift);
+        }
+        if (need_rem) {
+            a.orr(remainder, small_tag, remainder, tagShift);
+        }
+    }
+}
+
 void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
                                        const ArgSource &LHS,
                                        const ArgSource &RHS,
@@ -865,52 +956,26 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
 
     if (RHS.isSmall()) {
         divisor = RHS.as<ArgSmall>().getSigned();
+        if (divisor == -1) {
+            divisor = 0;
+        }
     }
 
-    if (always_small(LHS) && divisor != (Sint)0 && divisor != (Sint)-1) {
+    if (always_small(LHS) && divisor != 0) {
         auto lhs = load_source(LHS, ARG3);
         auto quotient = init_destination(Quotient, ARG1);
         auto remainder = init_destination(Remainder, ARG2);
+        Label invalidLabel; /* Intentionally not initialized */
 
         comment("skipped test for smalls operands and overflow");
-        if (Support::isPowerOf2(divisor) &&
-            std::get<0>(getClampedRange(LHS)) >= 0) {
-            int trailing_bits = Support::ctz<Eterm>(divisor);
-            arm::Gp LHS_reg = lhs.reg;
-            if (need_div) {
-                comment("optimized div by replacing with right shift");
-                ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
-                if (need_rem && quotient.reg == lhs.reg) {
-                    LHS_reg = TMP1;
-                    a.mov(LHS_reg, lhs.reg);
-                }
-                a.lsr(quotient.reg, lhs.reg, imm(trailing_bits));
-                a.orr(quotient.reg, quotient.reg, imm(_TAG_IMMED1_SMALL));
-            }
-            if (need_rem) {
-                comment("optimized rem by replacing with masking");
-                auto mask = Support::lsbMask<Uint>(trailing_bits +
-                                                   _TAG_IMMED1_SIZE);
-                a.and_(remainder.reg, LHS_reg, imm(mask));
-            }
-        } else {
-            a.asr(TMP1, lhs.reg, imm(_TAG_IMMED1_SIZE));
-            mov_imm(TMP2, divisor);
-            a.sdiv(quotient.reg, TMP1, TMP2);
-            if (need_rem) {
-                a.msub(remainder.reg, quotient.reg, TMP2, TMP1);
-            }
-
-            mov_imm(TMP3, _TAG_IMMED1_SMALL);
-            const arm::Shift tagShift = arm::lsl(_TAG_IMMED1_SIZE);
-            if (need_div) {
-                a.orr(quotient.reg, TMP3, quotient.reg, tagShift);
-            }
-            if (need_rem) {
-                a.orr(remainder.reg, TMP3, remainder.reg, tagShift);
-            }
-        }
-
+        emit_div_rem_literal(divisor,
+                             LHS,
+                             lhs.reg,
+                             quotient.reg,
+                             remainder.reg,
+                             invalidLabel,
+                             need_div,
+                             need_rem);
         if (need_div) {
             flush_var(quotient);
         }
@@ -918,11 +983,24 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
             flush_var(remainder);
         }
     } else {
+        Label generic = a.newLabel(), done = a.newLabel();
         auto [lhs, rhs] = load_sources(LHS, ARG2, RHS, ARG3);
 
+        if (divisor != (Sint)0) {
+            emit_div_rem_literal(divisor,
+                                 LHS,
+                                 lhs.reg,
+                                 ARG1,
+                                 ARG2,
+                                 generic,
+                                 need_div,
+                                 need_rem);
+            a.b(done);
+        }
+
+        a.bind(generic);
         mov_var(ARG2, lhs);
         mov_var(ARG3, rhs);
-
         if (Fail.get() != 0) {
             fragment_call(ga->get_int_div_rem_guard_shared());
             a.b_eq(resolve_beam_label(Fail, disp1MB));
@@ -931,6 +1009,7 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
             fragment_call(ga->get_int_div_rem_body_shared());
         }
 
+        a.bind(done);
         if (need_div) {
             mov_arg(Quotient, ARG1);
         }
diff --git a/erts/emulator/beam/jit/x86/instr_arith.cpp b/erts/emulator/beam/jit/x86/instr_arith.cpp
index fdb021fa7c..56c4eb06d3 100644
--- a/erts/emulator/beam/jit/x86/instr_arith.cpp
+++ b/erts/emulator/beam/jit/x86/instr_arith.cpp
@@ -652,10 +652,10 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
         divisor = RHS.as<ArgSmall>().getSigned();
     }
 
-    if (divisor != (Sint)0 && divisor != (Sint)-1) {
+    mov_arg(x86::rax, LHS);
+
+    if (divisor != 0 && divisor != -1) {
         /* There is no possibility of overflow. */
-        a.mov(ARG6, imm(divisor));
-        mov_arg(x86::rax, LHS);
         if (always_small(LHS)) {
             comment("skipped test for small dividend since it is always small");
             need_generic = false;
@@ -672,10 +672,9 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
             a.short_().jne(generic_div);
         }
 
-        /* Sign-extend and divide. The result is implicitly placed in
-         * RAX and the remainder in RDX (ARG3). */
         if (Support::isPowerOf2(divisor) &&
             std::get<0>(getClampedRange(LHS)) >= 0) {
+            /* Unsigned integer division. */
             int trailing_bits = Support::ctz<Eterm>(divisor);
 
             if (need_rem) {
@@ -692,8 +691,52 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
                 a.shr(x86::rax, imm(trailing_bits));
                 a.or_(x86::rax, imm(_TAG_IMMED1_SMALL));
             }
+        } else if (Support::isPowerOf2(divisor)) {
+            /* Signed integer division. */
+            int shift = Support::ctz<Eterm>(divisor);
+            Sint offset = (divisor - 1) << _TAG_IMMED1_SIZE;
+
+            if (need_rem) {
+                a.mov(x86::rdx, x86::rax);
+                ASSERT(x86::rdx != ARG1);
+            }
+
+            if (need_div) {
+                comment("optimized div by replacing with right shift");
+            }
+
+            if (divisor == 2) {
+                ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+                a.mov(ARG1, x86::rax);
+                a.shr(ARG1, imm(63));
+                a.add(x86::rax, ARG1);
+            } else {
+                if (Support::isInt32(offset)) {
+                    a.lea(ARG1, x86::qword_ptr(x86::rax, offset));
+                } else {
+                    a.mov(ARG1, offset);
+                    a.add(ARG1, x86::rax);
+                }
+                a.test(x86::rax, x86::rax);
+                a.cmovs(x86::rax, ARG1);
+            }
+
+            if (need_rem) {
+                Uint mask = (Uint)-1 << (shift + _TAG_IMMED1_SIZE);
+                comment("optimized rem by replacing with subtraction");
+                mov_imm(ARG1, mask);
+                a.and_(ARG1, x86::rax);
+                a.sub(x86::rdx, ARG1);
+            }
+
+            if (need_div) {
+                ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+                a.sar(x86::rax, imm(shift));
+                a.or_(x86::rax, imm(_TAG_IMMED1_SMALL));
+            }
         } else {
             comment("divide with inlined code");
+            a.mov(ARG6, imm(divisor));
             a.sar(x86::rax, imm(_TAG_IMMED1_SIZE));
             a.cqo();
             a.idiv(ARG6);
@@ -723,7 +766,7 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
     a.bind(generic_div);
     if (need_generic) {
         mov_arg(ARG4, RHS); /* Done first as mov_arg may clobber ARG1 */
-        mov_arg(ARG1, LHS);
+        a.mov(ARG1, x86::rax);
 
         if (Fail.get() != 0) {
             safe_fragment_call(ga->get_int_div_rem_guard_shared());
diff --git a/erts/emulator/test/small_SUITE.erl b/erts/emulator/test/small_SUITE.erl
index c8a1b2fbf2..bb4d69e355 100644
--- a/erts/emulator/test/small_SUITE.erl
+++ b/erts/emulator/test/small_SUITE.erl
@@ -28,7 +28,7 @@
          test_bitwise/1, test_bsl/1,
          element/1,
          range_optimization/1]).
--export([mul_add/0]).
+-export([mul_add/0, division/0]).
 
 -include_lib("common_test/include/ct.hrl").
 
@@ -713,6 +713,8 @@ madd(_, _, _, _) -> error.
 
 
 %% Test that the JIT only omits the overflow check when it's safe.
+division() ->
+    [{timetrap, {minutes, 5}}].
 division(_Config) ->
     _ = rand:uniform(),				%Seed generator
     io:format("Seed: ~p", [rand:export_seed()]),
@@ -945,6 +947,7 @@ gen_div_function({Name,{A,B}}) ->
            R = X rem Y,
            {Q, R}. ").
 
+
 test_division([{Name,{A,B}}|T], Mod) ->
     F = fun Mod:Name/3,
     try
-- 
2.35.3

openSUSE Build Service is sponsored by