File 1222-Optimize-construction-of-little-endian-segments.patch of Package erlang

From c0d22e700c2bba1c7b0df736e73ad70d019d8307 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Wed, 6 Aug 2025 10:35:48 +0200
Subject: [PATCH 2/3] Optimize construction of little-endian segments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In Erlang/OTP 26 (in #6031), the JIT learned to optimize binary
construction such as:

    <<A:16/big, B:32/big, C:16/big>>

The optimization is done on the native-code level, but the idea behind
it can be illustrated in Erlang by rewriting the construction as
follows:

    Acc0 = A,
    Acc1 = (Acc0 bsl 32) bor B,
    Acc = (Acc1 bsl 16) bor C,
    <<Acc:64/big>>

When done in native code, the values of the segments is accumulated
into a CPU register, which is then written to memory. This is faster
than writing each segment to memory one at a time, especially if the
sizes are not byte-sized as in the following example:

    <<A:6, B:6, C:6, D:6>>

This commit introduces a similar optimization for little-endian
integer segments. Example:

    <<A:16/little, B:32/little, C:16/little>>

This expression can be rewritten as follows:

    Acc0 = C,
    Acc1 = (Acc0 bsl 32) bor B,
    Acc = (Acc1 bsl 16) bor A,
    <<Acc:64/little>>

Note that this rewriting is only safe if all segments except the last
one are byte-sized.

Co-authored-by: John Högberg <john@erlang.org>
---
 erts/emulator/beam/jit/beam_jit_bs.cpp    | 160 +++++++++++++++++-----
 erts/emulator/test/bs_construct_SUITE.erl |  33 ++++-
 2 files changed, 155 insertions(+), 38 deletions(-)

diff --git a/erts/emulator/beam/jit/beam_jit_bs.cpp b/erts/emulator/beam/jit/beam_jit_bs.cpp
index 470f2a6e8e..1c203278c8 100644
--- a/erts/emulator/beam/jit/beam_jit_bs.cpp
+++ b/erts/emulator/beam/jit/beam_jit_bs.cpp
@@ -28,6 +28,9 @@
 #include "beam_jit_common.hpp"
 #include "beam_jit_bs.hpp"
 
+#include <iterator>
+#include <numeric>
+
 extern "C"
 {
 #include "beam_file.h"
@@ -86,70 +89,153 @@ std::vector<BscSegment> beam_jit_bsc_init(const Span<ArgVal> &args) {
     return segments;
 }
 
+template<typename It>
+static auto fold_group(std::vector<BscSegment> &segs, It first, It last) {
+    auto &back = segs.emplace_back(*first);
+
+    back.action = BscSegment::action::ACCUMULATE_FIRST;
+
+    return std::accumulate(std::next(first),
+                           last,
+                           back.effectiveSize,
+                           [&segs](Sint acc, const BscSegment &seg) {
+                               auto &back = segs.emplace_back(seg);
+
+                               back.action = BscSegment::action::ACCUMULATE;
+
+                               return acc + back.effectiveSize;
+                           });
+}
+
+static void push_group(std::vector<BscSegment> &segs,
+                       std::vector<BscSegment>::const_iterator start,
+                       std::vector<BscSegment>::const_iterator end) {
+    if (start < end) {
+        auto groupSize = ((start->flags & BSF_LITTLE) != 0)
+                                 ? fold_group(segs,
+                                              std::make_reverse_iterator(end),
+                                              std::make_reverse_iterator(start))
+                                 : fold_group(segs, start, end);
+
+        auto &seg = segs.emplace_back();
+
+        seg.type = am_integer;
+        seg.action = BscSegment::action::STORE;
+        seg.effectiveSize = groupSize;
+        seg.flags = start->flags;
+    }
+}
+
+/*
+ * Combine small segments into a group so that the values for the
+ * segments can be combined into an accumulator register and then
+ * written to memory. Here is an example in Erlang illustrating the
+ * idea. Consider this binary construction example:
+ *
+ *     <<A:16/big, B:32/big, C:16/big>>
+ *
+ * This can be rewritten as follows:
+ *
+ *     Acc0 = A,
+ *     Acc1 = (Acc0 bsl 32) bor B,
+ *     Acc = (Acc1 bsl 16) bor C,
+ *     <<Acc:64/big>>
+ *
+ * Translated to native code, this is faster because the accumulating
+ * is done in a CPU register, and then the result is written to memory.
+ * For big-endian segments, this rewrite works even if sizes are not
+ * byte-sized. For example:
+ *
+ *     <<A:6, B:6, C:6, D:6>>
+ *
+ * Little-endian segments can be optimized in a similar way. Consider:
+ *
+ *     <<A:16/little, B:32/little, C:16/little>>
+ *
+ * This can be rewritten like so:
+ *
+ *     Acc0 = C,
+ *     Acc1 = (Acc0 bsl 32) bor B,
+ *     Acc = (Acc1 bsl 16) bor A,
+ *     <<Acc:64/little>>
+ *
+ * However, for little-endian segments, this rewriting will only work
+ * if all segment sizes but the last one are byte-sized.
+ */
+
 std::vector<BscSegment> beam_jit_bsc_combine_segments(
         const std::vector<BscSegment> segments) {
     std::vector<BscSegment> segs;
 
-    for (auto seg : segments) {
+    auto group = segments.cend();
+    Sint combinedSize = 0;
+
+    for (auto it = segments.cbegin(); it != segments.cend(); it++) {
+        auto &seg = *it;
+
         switch (seg.type) {
         case am_integer: {
             if (!(0 < seg.effectiveSize && seg.effectiveSize <= 64)) {
                 /* Unknown or too large size. Handle using the default
                  * DIRECT action. */
+                push_group(segs, group, it);
+                group = segments.cend();
+
                 segs.push_back(seg);
                 continue;
             }
 
-            if (seg.flags & BSF_LITTLE || segs.size() == 0 ||
-                segs.back().action == BscSegment::action::DIRECT) {
-                /* There are no previous compatible ACCUMULATE / STORE
-                 * actions. Create the first ones. */
-                seg.action = BscSegment::action::ACCUMULATE_FIRST;
-                segs.push_back(seg);
-                seg.action = BscSegment::action::STORE;
-                segs.push_back(seg);
+            /* The current segment has a known size not exceeding 64
+             * bits. Try to add it to the current group. */
+            if (group == segments.cend()) {
+                group = it;
+
+                combinedSize = seg.effectiveSize;
                 continue;
             }
 
-            auto prev = segs.back();
-            if (prev.flags & BSF_LITTLE) {
-                /* Little-endian segments cannot be combined with other
-                 * segments. Create new ACCUMULATE_FIRST / STORE actions. */
-                seg.action = BscSegment::action::ACCUMULATE_FIRST;
-                segs.push_back(seg);
-                seg.action = BscSegment::action::STORE;
-                segs.push_back(seg);
+            /* There is already at least one segment in the group.
+             * Append the current segment to the group only if it is
+             * compatible and will fit. */
+
+            bool sameEndian =
+                    (seg.flags & BSF_LITTLE) == (group->flags & BSF_LITTLE);
+
+            /* Big-endian segments can always be grouped (if the size
+             * does not exceed 64 bits). Little-endian segments can
+             * only be grouped if all but the last segment are
+             * byte-sized. */
+            bool suitableSizes =
+                    ((seg.flags & BSF_LITTLE) == 0 || combinedSize % 8 == 0);
+
+            if (sameEndian && combinedSize + seg.effectiveSize <= 64 &&
+                suitableSizes) {
+                combinedSize += seg.effectiveSize;
                 continue;
             }
 
-            /* The current segment is compatible with the previous
-             * segment. Try combining them. */
-            if (prev.effectiveSize + seg.effectiveSize <= 64) {
-                /* The combined values of the segments fit in the
-                 * accumulator. Insert an ACCUMULATE action for the
-                 * current segment before the pre-existing STORE
-                 * action. */
-                segs.pop_back();
-                prev.effectiveSize += seg.effectiveSize;
-                seg.action = BscSegment::action::ACCUMULATE;
-                segs.push_back(seg);
-                segs.push_back(prev);
-            } else {
-                /* The size exceeds 64 bits. Can't combine. */
-                seg.action = BscSegment::action::ACCUMULATE_FIRST;
-                segs.push_back(seg);
-                seg.action = BscSegment::action::STORE;
-                segs.push_back(seg);
-            }
+            /*
+             * Not possible to fit anything more into the group.
+             * Flush the group and start a new group.
+             */
+            push_group(segs, group, it);
+            group = it;
+
+            combinedSize = seg.effectiveSize;
             break;
         }
         default:
+            push_group(segs, group, it);
+            group = segments.cend();
+
             segs.push_back(seg);
             break;
         }
     }
 
-    /* Calculate bit offsets for each ACCUMULATE segment. */
+    push_group(segs, group, segments.cend());
+
+    /* Calculate bit offsets for ACCUMULATE and STORE segments. */
 
     Uint offset = 0;
     for (int i = segs.size() - 1; i >= 0; i--) {
diff --git a/erts/emulator/test/bs_construct_SUITE.erl b/erts/emulator/test/bs_construct_SUITE.erl
index aceb28b800..7339b82e02 100644
--- a/erts/emulator/test/bs_construct_SUITE.erl
+++ b/erts/emulator/test/bs_construct_SUITE.erl
@@ -212,7 +212,27 @@ l(I_13, I_big1) ->
      %% Test non-byte sizes and also that the value does not bleed
      %% into the previous segment.
      ?T(<<17, I_big1:33>>, <<17, 197,49,128,73,1:1>>),
-     ?T(<<19, I_big1:39>>, <<19, 11,20,198,1,19:7>>)
+     ?T(<<19, I_big1:39>>, <<19, 11,20,198,1,19:7>>),
+
+     %% Test multiple little-endian segments.
+     ?T(<<I_big1:16/little, I_13:24/little>>,
+        [147,0,13,0,0]),
+     ?T(<<I_big1:13/little, I_13:3/little, I_big1:16/little>>,
+        [147,5,147,0]),
+     ?T(<<I_big1:16/little, I_13:24/little, I_big1:80/little>>,
+        [147,0,13,0,0,147,0,99,138,5,229,249,42,184,98]),
+     ?T(<<I_big1:48/little, (I_big1 bsr 17):16/little>>,
+        [147,0,99,138,5,229,49,197]),
+     ?T(<<I_big1:16/little, (I_big1 bsr 13):16/little,
+          (I_big1 bsr 15):16/little, (I_big1 bsr 23):16/little>>,
+        [147,0,24,83,198,20,20,11]),
+     ?T(<<I_big1:24/little, (I_big1 bsr 11):16/little,
+          (I_big1 bsr 18):16/little, (I_big1 bsr 26):32/little>>,
+        [147,0,99,96,76,152,98,98,65,121,190]),
+     ?T(<<0:5,I_big1:16/little, I_13:3/little>>,
+        [4,152,5]),
+     ?T(<<0:5,I_big1:16/little, (I_big1 bsr 15):19/little>>,
+        [4,152,6,48,163])
     ].
 
 native_3798() ->
@@ -842,6 +862,7 @@ dynamic_little(Bef, N, Int, Lpad, Rpad) ->
     Bin = <<Lpad:Bef/little,Int:N/little,Rpad:(128-Bef-N)/little>>,
 
     if
+        %% Test unusual units.
         Bef rem 8 =:= 0 ->
             Bin = <<Lpad:(Bef div 8)/little-unit:8,
                     Int:N/little,Rpad:(128-Bef-N)/little>>;
@@ -851,6 +872,16 @@ dynamic_little(Bef, N, Int, Lpad, Rpad) ->
         (128-Bef-N) rem 17 =:= 0 ->
             Aft = (128 - Bef - N) div 17,
             Bin = <<Lpad:Bef/little,Int:N/little,Rpad:Aft/little-unit:17>>;
+
+        %% Test combinations of little-integer segments of fixed size.
+        Bef =:= 33, N =:= 45 ->
+            Bin = <<Lpad:Bef/little,Int:N/little,Rpad:50/little>>;
+        Bef =:= 16, N =:= 40 ->
+            Bin = <<Lpad:Bef/little,Int:N/little,Rpad:72/little>>;
+        Bef =:= 16, N =:= 48 ->
+            Bin = <<Lpad:Bef/little,Int:N/little,Rpad:64/little>>;
+        Bef =:= 65, N =:= 32 ->
+            Bin = <<Lpad:Bef/little,Int:N/little,Rpad:31/little>>;
         true ->
             ok
     end,
-- 
2.51.0
Places

File 1222-Optimize-construction-of-little-endian-segments.patch of Package erlang

Places