File 1103-Optimize-division-by-powers-of-two.patch of Package erlang
From 16dacd03bd0334f09d52383513f7bd9e45f1d6fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Sat, 29 Jul 2023 14:17:18 +0200
Subject: [PATCH 3/7] Optimize division by powers of two
We used to replace division by a power of two with a right shift
only when the dividend was known to be a positive integer. Extend
the implementation to do right shift when the range of the dividend
is unknown.
---
erts/emulator/beam/jit/arm/beam_asm.hpp | 9 ++
erts/emulator/beam/jit/arm/instr_arith.cpp | 159 +++++++++++++++------
erts/emulator/beam/jit/x86/instr_arith.cpp | 55 ++++++-
erts/emulator/test/small_SUITE.erl | 5 +-
4 files changed, 181 insertions(+), 47 deletions(-)
diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp
index ab162f951a..38f1c6875f 100644
--- a/erts/emulator/beam/jit/arm/beam_asm.hpp
+++ b/erts/emulator/beam/jit/arm/beam_asm.hpp
@@ -1128,6 +1128,15 @@ protected:
const a64::Gp rhs_reg,
const Label next);
+ void emit_div_rem_literal(Sint divisor,
+ const ArgSource &Dividend,
+ arm::Gp dividend,
+ arm::Gp quotient,
+ arm::Gp remainder,
+ const Label &generic,
+ bool need_div,
+ bool need_rem);
+
void emit_div_rem(const ArgLabel &Fail,
const ArgSource &LHS,
const ArgSource &RHS,
diff --git a/erts/emulator/beam/jit/arm/instr_arith.cpp b/erts/emulator/beam/jit/arm/instr_arith.cpp
index 8ca898b675..a14ad5cbaf 100644
--- a/erts/emulator/beam/jit/arm/instr_arith.cpp
+++ b/erts/emulator/beam/jit/arm/instr_arith.cpp
@@ -853,6 +853,97 @@ void BeamGlobalAssembler::emit_int_div_rem_body_shared() {
}
}
+void BeamModuleAssembler::emit_div_rem_literal(Sint divisor,
+ const ArgSource &Dividend,
+ arm::Gp dividend,
+ arm::Gp quotient,
+ arm::Gp remainder,
+ const Label &generic,
+ bool need_div,
+ bool need_rem) {
+ arm::Gp small_tag = TMP6;
+ bool small_dividend = !generic.isValid();
+
+ ASSERT(divisor != (Sint)0);
+
+ if (!small_dividend) {
+ a.and_(small_tag, dividend, imm(_TAG_IMMED1_MASK));
+ a.cmp(small_tag, imm(_TAG_IMMED1_SMALL));
+ a.b_ne(generic);
+ }
+
+ if (Support::isPowerOf2(divisor)) {
+ arm::Gp original_dividend = dividend;
+ int shift = Support::ctz<Eterm>(divisor);
+
+ if (need_div && small_dividend) {
+ mov_imm(small_tag, _TAG_IMMED1_SMALL);
+ }
+
+ ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+ if (std::get<0>(getClampedRange(Dividend)) >= 0) {
+ /* Positive dividend. */
+ if (need_div) {
+ comment("optimized div by replacing with right shift");
+ if (need_rem && quotient == dividend) {
+ original_dividend = TMP5;
+ a.mov(original_dividend, dividend);
+ }
+ a.orr(quotient, small_tag, dividend, arm::lsr(shift));
+ }
+ if (need_rem) {
+ auto mask = Support::lsbMask<Uint>(shift + _TAG_IMMED1_SIZE);
+ comment("optimized rem by replacing with masking");
+ a.and_(remainder, original_dividend, imm(mask));
+ }
+ } else {
+ /* Negative dividend. */
+ if (need_div) {
+ comment("optimized div by replacing with right shift");
+ }
+ if (divisor == 2) {
+ ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+ a.add(TMP3, dividend, dividend, arm::lsr(63));
+ } else {
+ add(TMP1, dividend, (divisor - 1) << _TAG_IMMED1_SIZE);
+ a.cmp(dividend, imm(0));
+ a.csel(TMP3, TMP1, dividend, imm(arm::CondCode::kLT));
+ }
+ if (need_div) {
+ if (need_rem && quotient == dividend) {
+ original_dividend = TMP5;
+ a.mov(original_dividend, dividend);
+ }
+ a.orr(quotient, small_tag, TMP3, arm::asr(shift));
+ }
+ if (need_rem) {
+ Uint mask = (Uint)-1 << (shift + _TAG_IMMED1_SIZE);
+ comment("optimized rem by replacing with subtraction");
+ a.and_(TMP1, TMP3, imm(mask));
+ a.sub(remainder, original_dividend, TMP1);
+ }
+ }
+ } else {
+ a.asr(TMP1, dividend, imm(_TAG_IMMED1_SIZE));
+ mov_imm(TMP2, divisor);
+ a.sdiv(quotient, TMP1, TMP2);
+ if (need_rem) {
+ a.msub(remainder, quotient, TMP2, TMP1);
+ }
+
+ if (small_dividend) {
+ mov_imm(small_tag, _TAG_IMMED1_SMALL);
+ }
+ const arm::Shift tagShift = arm::lsl(_TAG_IMMED1_SIZE);
+ if (need_div) {
+ a.orr(quotient, small_tag, quotient, tagShift);
+ }
+ if (need_rem) {
+ a.orr(remainder, small_tag, remainder, tagShift);
+ }
+ }
+}
+
void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
const ArgSource &LHS,
const ArgSource &RHS,
@@ -865,52 +956,26 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
if (RHS.isSmall()) {
divisor = RHS.as<ArgSmall>().getSigned();
+ if (divisor == -1) {
+ divisor = 0;
+ }
}
- if (always_small(LHS) && divisor != (Sint)0 && divisor != (Sint)-1) {
+ if (always_small(LHS) && divisor != 0) {
auto lhs = load_source(LHS, ARG3);
auto quotient = init_destination(Quotient, ARG1);
auto remainder = init_destination(Remainder, ARG2);
+ Label invalidLabel; /* Intentionally not initialized */
comment("skipped test for smalls operands and overflow");
- if (Support::isPowerOf2(divisor) &&
- std::get<0>(getClampedRange(LHS)) >= 0) {
- int trailing_bits = Support::ctz<Eterm>(divisor);
- arm::Gp LHS_reg = lhs.reg;
- if (need_div) {
- comment("optimized div by replacing with right shift");
- ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
- if (need_rem && quotient.reg == lhs.reg) {
- LHS_reg = TMP1;
- a.mov(LHS_reg, lhs.reg);
- }
- a.lsr(quotient.reg, lhs.reg, imm(trailing_bits));
- a.orr(quotient.reg, quotient.reg, imm(_TAG_IMMED1_SMALL));
- }
- if (need_rem) {
- comment("optimized rem by replacing with masking");
- auto mask = Support::lsbMask<Uint>(trailing_bits +
- _TAG_IMMED1_SIZE);
- a.and_(remainder.reg, LHS_reg, imm(mask));
- }
- } else {
- a.asr(TMP1, lhs.reg, imm(_TAG_IMMED1_SIZE));
- mov_imm(TMP2, divisor);
- a.sdiv(quotient.reg, TMP1, TMP2);
- if (need_rem) {
- a.msub(remainder.reg, quotient.reg, TMP2, TMP1);
- }
-
- mov_imm(TMP3, _TAG_IMMED1_SMALL);
- const arm::Shift tagShift = arm::lsl(_TAG_IMMED1_SIZE);
- if (need_div) {
- a.orr(quotient.reg, TMP3, quotient.reg, tagShift);
- }
- if (need_rem) {
- a.orr(remainder.reg, TMP3, remainder.reg, tagShift);
- }
- }
-
+ emit_div_rem_literal(divisor,
+ LHS,
+ lhs.reg,
+ quotient.reg,
+ remainder.reg,
+ invalidLabel,
+ need_div,
+ need_rem);
if (need_div) {
flush_var(quotient);
}
@@ -918,11 +983,24 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
flush_var(remainder);
}
} else {
+ Label generic = a.newLabel(), done = a.newLabel();
auto [lhs, rhs] = load_sources(LHS, ARG2, RHS, ARG3);
+ if (divisor != (Sint)0) {
+ emit_div_rem_literal(divisor,
+ LHS,
+ lhs.reg,
+ ARG1,
+ ARG2,
+ generic,
+ need_div,
+ need_rem);
+ a.b(done);
+ }
+
+ a.bind(generic);
mov_var(ARG2, lhs);
mov_var(ARG3, rhs);
-
if (Fail.get() != 0) {
fragment_call(ga->get_int_div_rem_guard_shared());
a.b_eq(resolve_beam_label(Fail, disp1MB));
@@ -931,6 +1009,7 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
fragment_call(ga->get_int_div_rem_body_shared());
}
+ a.bind(done);
if (need_div) {
mov_arg(Quotient, ARG1);
}
diff --git a/erts/emulator/beam/jit/x86/instr_arith.cpp b/erts/emulator/beam/jit/x86/instr_arith.cpp
index fdb021fa7c..56c4eb06d3 100644
--- a/erts/emulator/beam/jit/x86/instr_arith.cpp
+++ b/erts/emulator/beam/jit/x86/instr_arith.cpp
@@ -652,10 +652,10 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
divisor = RHS.as<ArgSmall>().getSigned();
}
- if (divisor != (Sint)0 && divisor != (Sint)-1) {
+ mov_arg(x86::rax, LHS);
+
+ if (divisor != 0 && divisor != -1) {
/* There is no possibility of overflow. */
- a.mov(ARG6, imm(divisor));
- mov_arg(x86::rax, LHS);
if (always_small(LHS)) {
comment("skipped test for small dividend since it is always small");
need_generic = false;
@@ -672,10 +672,9 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
a.short_().jne(generic_div);
}
- /* Sign-extend and divide. The result is implicitly placed in
- * RAX and the remainder in RDX (ARG3). */
if (Support::isPowerOf2(divisor) &&
std::get<0>(getClampedRange(LHS)) >= 0) {
+ /* Unsigned integer division. */
int trailing_bits = Support::ctz<Eterm>(divisor);
if (need_rem) {
@@ -692,8 +691,52 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
a.shr(x86::rax, imm(trailing_bits));
a.or_(x86::rax, imm(_TAG_IMMED1_SMALL));
}
+ } else if (Support::isPowerOf2(divisor)) {
+ /* Signed integer division. */
+ int shift = Support::ctz<Eterm>(divisor);
+ Sint offset = (divisor - 1) << _TAG_IMMED1_SIZE;
+
+ if (need_rem) {
+ a.mov(x86::rdx, x86::rax);
+ ASSERT(x86::rdx != ARG1);
+ }
+
+ if (need_div) {
+ comment("optimized div by replacing with right shift");
+ }
+
+ if (divisor == 2) {
+ ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+ a.mov(ARG1, x86::rax);
+ a.shr(ARG1, imm(63));
+ a.add(x86::rax, ARG1);
+ } else {
+ if (Support::isInt32(offset)) {
+ a.lea(ARG1, x86::qword_ptr(x86::rax, offset));
+ } else {
+ a.mov(ARG1, offset);
+ a.add(ARG1, x86::rax);
+ }
+ a.test(x86::rax, x86::rax);
+ a.cmovs(x86::rax, ARG1);
+ }
+
+ if (need_rem) {
+ Uint mask = (Uint)-1 << (shift + _TAG_IMMED1_SIZE);
+ comment("optimized rem by replacing with subtraction");
+ mov_imm(ARG1, mask);
+ a.and_(ARG1, x86::rax);
+ a.sub(x86::rdx, ARG1);
+ }
+
+ if (need_div) {
+ ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
+ a.sar(x86::rax, imm(shift));
+ a.or_(x86::rax, imm(_TAG_IMMED1_SMALL));
+ }
} else {
comment("divide with inlined code");
+ a.mov(ARG6, imm(divisor));
a.sar(x86::rax, imm(_TAG_IMMED1_SIZE));
a.cqo();
a.idiv(ARG6);
@@ -723,7 +766,7 @@ void BeamModuleAssembler::emit_div_rem(const ArgLabel &Fail,
a.bind(generic_div);
if (need_generic) {
mov_arg(ARG4, RHS); /* Done first as mov_arg may clobber ARG1 */
- mov_arg(ARG1, LHS);
+ a.mov(ARG1, x86::rax);
if (Fail.get() != 0) {
safe_fragment_call(ga->get_int_div_rem_guard_shared());
diff --git a/erts/emulator/test/small_SUITE.erl b/erts/emulator/test/small_SUITE.erl
index c8a1b2fbf2..bb4d69e355 100644
--- a/erts/emulator/test/small_SUITE.erl
+++ b/erts/emulator/test/small_SUITE.erl
@@ -28,7 +28,7 @@
test_bitwise/1, test_bsl/1,
element/1,
range_optimization/1]).
--export([mul_add/0]).
+-export([mul_add/0, division/0]).
-include_lib("common_test/include/ct.hrl").
@@ -713,6 +713,8 @@ madd(_, _, _, _) -> error.
%% Test that the JIT only omits the overflow check when it's safe.
+division() ->
+ [{timetrap, {minutes, 5}}].
division(_Config) ->
_ = rand:uniform(), %Seed generator
io:format("Seed: ~p", [rand:export_seed()]),
@@ -945,6 +947,7 @@ gen_div_function({Name,{A,B}}) ->
R = X rem Y,
{Q, R}. ").
+
test_division([{Name,{A,B}}|T], Mod) ->
F = fun Mod:Name/3,
try
--
2.35.3