File 1222-Optimize-construction-of-little-endian-segments.patch of Package erlang
From c0d22e700c2bba1c7b0df736e73ad70d019d8307 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Wed, 6 Aug 2025 10:35:48 +0200
Subject: [PATCH 2/3] Optimize construction of little-endian segments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
In Erlang/OTP 26 (in #6031), the JIT learned to optimize binary
construction such as:
<<A:16/big, B:32/big, C:16/big>>
The optimization is done on the native-code level, but the idea behind
it can be illustrated in Erlang by rewriting the construction as
follows:
Acc0 = A,
Acc1 = (Acc0 bsl 32) bor B,
Acc = (Acc1 bsl 16) bor C,
<<Acc:64/big>>
When done in native code, the values of the segments is accumulated
into a CPU register, which is then written to memory. This is faster
than writing each segment to memory one at a time, especially if the
sizes are not byte-sized as in the following example:
<<A:6, B:6, C:6, D:6>>
This commit introduces a similar optimization for little-endian
integer segments. Example:
<<A:16/little, B:32/little, C:16/little>>
This expression can be rewritten as follows:
Acc0 = C,
Acc1 = (Acc0 bsl 32) bor B,
Acc = (Acc1 bsl 16) bor A,
<<Acc:64/little>>
Note that this rewriting is only safe if all segments except the last
one are byte-sized.
Co-authored-by: John Högberg <john@erlang.org>
---
erts/emulator/beam/jit/beam_jit_bs.cpp | 160 +++++++++++++++++-----
erts/emulator/test/bs_construct_SUITE.erl | 33 ++++-
2 files changed, 155 insertions(+), 38 deletions(-)
diff --git a/erts/emulator/beam/jit/beam_jit_bs.cpp b/erts/emulator/beam/jit/beam_jit_bs.cpp
index 470f2a6e8e..1c203278c8 100644
--- a/erts/emulator/beam/jit/beam_jit_bs.cpp
+++ b/erts/emulator/beam/jit/beam_jit_bs.cpp
@@ -28,6 +28,9 @@
#include "beam_jit_common.hpp"
#include "beam_jit_bs.hpp"
+#include <iterator>
+#include <numeric>
+
extern "C"
{
#include "beam_file.h"
@@ -86,70 +89,153 @@ std::vector<BscSegment> beam_jit_bsc_init(const Span<ArgVal> &args) {
return segments;
}
+template<typename It>
+static auto fold_group(std::vector<BscSegment> &segs, It first, It last) {
+ auto &back = segs.emplace_back(*first);
+
+ back.action = BscSegment::action::ACCUMULATE_FIRST;
+
+ return std::accumulate(std::next(first),
+ last,
+ back.effectiveSize,
+ [&segs](Sint acc, const BscSegment &seg) {
+ auto &back = segs.emplace_back(seg);
+
+ back.action = BscSegment::action::ACCUMULATE;
+
+ return acc + back.effectiveSize;
+ });
+}
+
+static void push_group(std::vector<BscSegment> &segs,
+ std::vector<BscSegment>::const_iterator start,
+ std::vector<BscSegment>::const_iterator end) {
+ if (start < end) {
+ auto groupSize = ((start->flags & BSF_LITTLE) != 0)
+ ? fold_group(segs,
+ std::make_reverse_iterator(end),
+ std::make_reverse_iterator(start))
+ : fold_group(segs, start, end);
+
+ auto &seg = segs.emplace_back();
+
+ seg.type = am_integer;
+ seg.action = BscSegment::action::STORE;
+ seg.effectiveSize = groupSize;
+ seg.flags = start->flags;
+ }
+}
+
+/*
+ * Combine small segments into a group so that the values for the
+ * segments can be combined into an accumulator register and then
+ * written to memory. Here is an example in Erlang illustrating the
+ * idea. Consider this binary construction example:
+ *
+ * <<A:16/big, B:32/big, C:16/big>>
+ *
+ * This can be rewritten as follows:
+ *
+ * Acc0 = A,
+ * Acc1 = (Acc0 bsl 32) bor B,
+ * Acc = (Acc1 bsl 16) bor C,
+ * <<Acc:64/big>>
+ *
+ * Translated to native code, this is faster because the accumulating
+ * is done in a CPU register, and then the result is written to memory.
+ * For big-endian segments, this rewrite works even if sizes are not
+ * byte-sized. For example:
+ *
+ * <<A:6, B:6, C:6, D:6>>
+ *
+ * Little-endian segments can be optimized in a similar way. Consider:
+ *
+ * <<A:16/little, B:32/little, C:16/little>>
+ *
+ * This can be rewritten like so:
+ *
+ * Acc0 = C,
+ * Acc1 = (Acc0 bsl 32) bor B,
+ * Acc = (Acc1 bsl 16) bor A,
+ * <<Acc:64/little>>
+ *
+ * However, for little-endian segments, this rewriting will only work
+ * if all segment sizes but the last one are byte-sized.
+ */
+
std::vector<BscSegment> beam_jit_bsc_combine_segments(
const std::vector<BscSegment> segments) {
std::vector<BscSegment> segs;
- for (auto seg : segments) {
+ auto group = segments.cend();
+ Sint combinedSize = 0;
+
+ for (auto it = segments.cbegin(); it != segments.cend(); it++) {
+ auto &seg = *it;
+
switch (seg.type) {
case am_integer: {
if (!(0 < seg.effectiveSize && seg.effectiveSize <= 64)) {
/* Unknown or too large size. Handle using the default
* DIRECT action. */
+ push_group(segs, group, it);
+ group = segments.cend();
+
segs.push_back(seg);
continue;
}
- if (seg.flags & BSF_LITTLE || segs.size() == 0 ||
- segs.back().action == BscSegment::action::DIRECT) {
- /* There are no previous compatible ACCUMULATE / STORE
- * actions. Create the first ones. */
- seg.action = BscSegment::action::ACCUMULATE_FIRST;
- segs.push_back(seg);
- seg.action = BscSegment::action::STORE;
- segs.push_back(seg);
+ /* The current segment has a known size not exceeding 64
+ * bits. Try to add it to the current group. */
+ if (group == segments.cend()) {
+ group = it;
+
+ combinedSize = seg.effectiveSize;
continue;
}
- auto prev = segs.back();
- if (prev.flags & BSF_LITTLE) {
- /* Little-endian segments cannot be combined with other
- * segments. Create new ACCUMULATE_FIRST / STORE actions. */
- seg.action = BscSegment::action::ACCUMULATE_FIRST;
- segs.push_back(seg);
- seg.action = BscSegment::action::STORE;
- segs.push_back(seg);
+ /* There is already at least one segment in the group.
+ * Append the current segment to the group only if it is
+ * compatible and will fit. */
+
+ bool sameEndian =
+ (seg.flags & BSF_LITTLE) == (group->flags & BSF_LITTLE);
+
+ /* Big-endian segments can always be grouped (if the size
+ * does not exceed 64 bits). Little-endian segments can
+ * only be grouped if all but the last segment are
+ * byte-sized. */
+ bool suitableSizes =
+ ((seg.flags & BSF_LITTLE) == 0 || combinedSize % 8 == 0);
+
+ if (sameEndian && combinedSize + seg.effectiveSize <= 64 &&
+ suitableSizes) {
+ combinedSize += seg.effectiveSize;
continue;
}
- /* The current segment is compatible with the previous
- * segment. Try combining them. */
- if (prev.effectiveSize + seg.effectiveSize <= 64) {
- /* The combined values of the segments fit in the
- * accumulator. Insert an ACCUMULATE action for the
- * current segment before the pre-existing STORE
- * action. */
- segs.pop_back();
- prev.effectiveSize += seg.effectiveSize;
- seg.action = BscSegment::action::ACCUMULATE;
- segs.push_back(seg);
- segs.push_back(prev);
- } else {
- /* The size exceeds 64 bits. Can't combine. */
- seg.action = BscSegment::action::ACCUMULATE_FIRST;
- segs.push_back(seg);
- seg.action = BscSegment::action::STORE;
- segs.push_back(seg);
- }
+ /*
+ * Not possible to fit anything more into the group.
+ * Flush the group and start a new group.
+ */
+ push_group(segs, group, it);
+ group = it;
+
+ combinedSize = seg.effectiveSize;
break;
}
default:
+ push_group(segs, group, it);
+ group = segments.cend();
+
segs.push_back(seg);
break;
}
}
- /* Calculate bit offsets for each ACCUMULATE segment. */
+ push_group(segs, group, segments.cend());
+
+ /* Calculate bit offsets for ACCUMULATE and STORE segments. */
Uint offset = 0;
for (int i = segs.size() - 1; i >= 0; i--) {
diff --git a/erts/emulator/test/bs_construct_SUITE.erl b/erts/emulator/test/bs_construct_SUITE.erl
index aceb28b800..7339b82e02 100644
--- a/erts/emulator/test/bs_construct_SUITE.erl
+++ b/erts/emulator/test/bs_construct_SUITE.erl
@@ -212,7 +212,27 @@ l(I_13, I_big1) ->
%% Test non-byte sizes and also that the value does not bleed
%% into the previous segment.
?T(<<17, I_big1:33>>, <<17, 197,49,128,73,1:1>>),
- ?T(<<19, I_big1:39>>, <<19, 11,20,198,1,19:7>>)
+ ?T(<<19, I_big1:39>>, <<19, 11,20,198,1,19:7>>),
+
+ %% Test multiple little-endian segments.
+ ?T(<<I_big1:16/little, I_13:24/little>>,
+ [147,0,13,0,0]),
+ ?T(<<I_big1:13/little, I_13:3/little, I_big1:16/little>>,
+ [147,5,147,0]),
+ ?T(<<I_big1:16/little, I_13:24/little, I_big1:80/little>>,
+ [147,0,13,0,0,147,0,99,138,5,229,249,42,184,98]),
+ ?T(<<I_big1:48/little, (I_big1 bsr 17):16/little>>,
+ [147,0,99,138,5,229,49,197]),
+ ?T(<<I_big1:16/little, (I_big1 bsr 13):16/little,
+ (I_big1 bsr 15):16/little, (I_big1 bsr 23):16/little>>,
+ [147,0,24,83,198,20,20,11]),
+ ?T(<<I_big1:24/little, (I_big1 bsr 11):16/little,
+ (I_big1 bsr 18):16/little, (I_big1 bsr 26):32/little>>,
+ [147,0,99,96,76,152,98,98,65,121,190]),
+ ?T(<<0:5,I_big1:16/little, I_13:3/little>>,
+ [4,152,5]),
+ ?T(<<0:5,I_big1:16/little, (I_big1 bsr 15):19/little>>,
+ [4,152,6,48,163])
].
native_3798() ->
@@ -842,6 +862,7 @@ dynamic_little(Bef, N, Int, Lpad, Rpad) ->
Bin = <<Lpad:Bef/little,Int:N/little,Rpad:(128-Bef-N)/little>>,
if
+ %% Test unusual units.
Bef rem 8 =:= 0 ->
Bin = <<Lpad:(Bef div 8)/little-unit:8,
Int:N/little,Rpad:(128-Bef-N)/little>>;
@@ -851,6 +872,16 @@ dynamic_little(Bef, N, Int, Lpad, Rpad) ->
(128-Bef-N) rem 17 =:= 0 ->
Aft = (128 - Bef - N) div 17,
Bin = <<Lpad:Bef/little,Int:N/little,Rpad:Aft/little-unit:17>>;
+
+ %% Test combinations of little-integer segments of fixed size.
+ Bef =:= 33, N =:= 45 ->
+ Bin = <<Lpad:Bef/little,Int:N/little,Rpad:50/little>>;
+ Bef =:= 16, N =:= 40 ->
+ Bin = <<Lpad:Bef/little,Int:N/little,Rpad:72/little>>;
+ Bef =:= 16, N =:= 48 ->
+ Bin = <<Lpad:Bef/little,Int:N/little,Rpad:64/little>>;
+ Bef =:= 65, N =:= 32 ->
+ Bin = <<Lpad:Bef/little,Int:N/little,Rpad:31/little>>;
true ->
ok
end,
--
2.51.0