File 4351-Use-SWAR-to-optimize-JSON-string-scanning.patch of Package erlang

From 61c3df61ec1c662c34f9107577184aeed6d1a705 Mon Sep 17 00:00:00 2001
From: Nelson Vides <videsnelson@gmail.com>
Date: Sun, 29 Mar 2026 17:25:14 +0000
Subject: [PATCH] Use SWAR to optimize JSON string scanning

Replace the per-byte ASCII check in the JSON encoder and decoder hot
loops with a SWAR (SIMD Within A Register) approach that validates 7
bytes at once using bitwise arithmetic on a single 56-bit integer.

The previous implementation matched 8 individual bytes and checked
each with `is_ascii_plain/1`, which the JIT compiled to 8 separate
jump table lookups with indirect branches. The new approach matches
a single `<<W:56>>` and uses 4 bitwise guard checks to validate all
7 bytes in parallel: range check `[32, 127]` via bit masking, and
detection of `"` and `\` via Mycroft's zero-byte trick.

56 bits (7 whole bytes) is chosen as the widest byte-aligned integer that
fits in a BEAM small (59-bit on 64-bit), ensuring all guard operations
(band, bxor, +, -) compile to bare native instructions with no type
checks or bignum fallbacks.

The JIT-generated assembly for `escape_binary_ascii/5` shrinks from
~1750 lines to ~100 lines. The original emitted 8 indirect jumps
through 96-entry jump tables (768 table entries in .rodata), while
the SWAR version uses ~20 sequential ALU instructions (and, or, xor,
add, sub, cmp) with only direct conditional branches. This reduces
code size, eliminates icache/dcache pressure from the jump tables,
and is friendlier to branch prediction.

Benchmarks from `Jason` show consistent improvements of 3-55% over the
original, with the largest gains on string-heavy payloads:

Issue 90 (long strings): 55% faster (7.66ms -> 4.94ms)
UTF-8 unescaped:         16% faster
GitHub (API response):   13% faster
Giphy (URL-heavy):       10% faster

4/6/7-byte SWAR variants were benchmarked; 7 bytes wins decisively
on long-string inputs and is never meaningfully slower on short-string
workloads, where the fallback byte-by-byte path handles strings
shorter than 7 bytes.
---
 lib/stdlib/src/json.erl | 20 ++++++------
 lib/stdlib/src/json.hrl | 68 +++++++++++++++++++++++++++++++++++------
 2 files changed, 69 insertions(+), 19 deletions(-)

diff --git a/lib/stdlib/src/json.erl b/lib/stdlib/src/json.erl
index 354b71097c..f2996790ef 100644
--- a/lib/stdlib/src/json.erl
+++ b/lib/stdlib/src/json.erl
@@ -361,15 +361,15 @@ escape_binary(Bin) -> escape_binary_ascii(Bin, [$"], Bin, 0, 0).
 
 escape_binary_ascii(Binary, Acc, Orig, Skip, Len) ->
     case Binary of
-        <<B1, B2, B3, B4, B5, B6, B7, B8, Rest/binary>> when ?are_all_ascii_plain(B1, B2, B3, B4, B5, B6, B7, B8) ->
-            escape_binary_ascii(Rest, Acc, Orig, Skip, Len + 8);
+        <<W:56, Rest/binary>> when ?are_all_ascii_plain_swar(W) ->
+            escape_binary_ascii(Rest, Acc, Orig, Skip, Len + 7);
         Other ->
             escape_binary(Other, Acc, Orig, Skip, Len)
     end.
 
 escape_binary(<<Byte, Rest/binary>>, Acc, Orig, Skip, Len) when ?is_ascii_plain(Byte) ->
-    %% we got here because there were either less than 8 bytes left
-    %% or we have an escape in the next 8 bytes,
+    %% we got here because there were either fewer than 7 bytes left
+    %% or there is a non-plain byte (escape or non-ASCII) in the next 7,
     %% escape_binary_ascii would fail and dispatch here anyway
     escape_binary(Rest, Acc, Orig, Skip, Len + 1);
 escape_binary(<<Byte, Rest/binary>>, Acc, Orig, Skip0, Len) when ?is_ascii_escape(Byte) ->
@@ -410,8 +410,8 @@ escape_all(Bin) -> escape_all_ascii(Bin, [$"], Bin, 0, 0).
 
 escape_all_ascii(Binary, Acc, Orig, Skip, Len) ->
     case Binary of
-        <<B1, B2, B3, B4, B5, B6, B7, B8, Rest/binary>> when ?are_all_ascii_plain(B1, B2, B3, B4, B5, B6, B7, B8) ->
-            escape_all_ascii(Rest, Acc, Orig, Skip, Len + 8);
+        <<W:56, Rest/binary>> when ?are_all_ascii_plain_swar(W) ->
+            escape_all_ascii(Rest, Acc, Orig, Skip, Len + 7);
         Other ->
             escape_all(Other, Acc, Orig, Skip, Len)
     end.
@@ -1175,8 +1175,8 @@ string(Binary, Original, Skip, Acc, Stack, Decode) ->
 
 string_ascii(Binary, Original, Skip, Acc, Stack, Decode, Len) ->
     case Binary of
-        <<B1, B2, B3, B4, B5, B6, B7, B8, Rest/binary>> when ?are_all_ascii_plain(B1, B2, B3, B4, B5, B6, B7, B8) ->
-            string_ascii(Rest, Original, Skip, Acc, Stack, Decode, Len + 8);
+        <<W:56, Rest/binary>> when ?are_all_ascii_plain_swar(W) ->
+            string_ascii(Rest, Original, Skip, Acc, Stack, Decode, Len + 7);
         Other ->
             string(Other, Original, Skip, Acc, Stack, Decode, Len)
     end.
@@ -1218,8 +1218,8 @@ string_utf8(_, Orig, Skip, Acc, Stack, Decode, Len, _State0) ->
 
 string_ascii(Binary, Original, Skip, Acc, Stack, Decode, Start, Len, SAcc) ->
     case Binary of
-        <<B1, B2, B3, B4, B5, B6, B7, B8, Rest/binary>> when ?are_all_ascii_plain(B1, B2, B3, B4, B5, B6, B7, B8) ->
-            string_ascii(Rest, Original, Skip, Acc, Stack, Decode, Start, Len + 8, SAcc);
+        <<W:56, Rest/binary>> when ?are_all_ascii_plain_swar(W) ->
+            string_ascii(Rest, Original, Skip, Acc, Stack, Decode, Start, Len + 7, SAcc);
         Other ->
             string(Other, Original, Skip, Acc, Stack, Decode, Start, Len, SAcc)
     end.
diff --git a/lib/stdlib/src/json.hrl b/lib/stdlib/src/json.hrl
index bf16b2e0d7..aa0f80a1c3 100644
--- a/lib/stdlib/src/json.hrl
+++ b/lib/stdlib/src/json.hrl
@@ -179,13 +179,63 @@
     Byte =:= 127
 ).
 
--define(are_all_ascii_plain(B1, B2, B3, B4, B5, B6, B7, B8),
-    (?is_ascii_plain(B1)) andalso
-    (?is_ascii_plain(B2)) andalso
-    (?is_ascii_plain(B3)) andalso
-    (?is_ascii_plain(B4)) andalso
-    (?is_ascii_plain(B5)) andalso
-    (?is_ascii_plain(B6)) andalso
-    (?is_ascii_plain(B7)) andalso
-    (?is_ascii_plain(B8))
+%% SWAR (SIMD Within A Register) check for 7 bytes of plain ASCII at once.
+%%
+%% Instead of matching 8 individual bytes and checking each against
+%% is_ascii_plain/1 (which the JIT compiles to 8 jump table lookups
+%% with indirect branches), we match a single 56-bit integer and use
+%% bitwise arithmetic to validate all 7 bytes in parallel.
+%%
+%% We use 56 bits (7 bytes) because it is the widest whole-byte value
+%% that fits in a BEAM small integer (59-bit on 64-bit). This ensures all
+%% bitwise and arithmetic guard operations (band, bxor, +, -)
+%% compile to single native instructions with no type checks or
+%% bignum fallback calls. Benchmarks showed 4/6/7-byte variants all
+%% outperform the original, with 7 bytes winning on string-heavy
+%% inputs (up to 55% faster) due to its larger stride.
+%%
+%% The byte-by-byte is_ascii_plain fallback path handles any remaining
+%% bytes (< 7) and is always entered when the SWAR check fails, so a
+%% failed SWAR check costs only speed; it must never pass a non-plain byte.
+
+-define(SWAR_MASK80, 16#80808080808080).
+-define(SWAR_MASK01, 16#01010101010101).
+
+%% True when no byte in a 56-bit word is zero (Mycroft's trick).
+%%
+%% This is a simplified variant that omits the standard (bnot V) term.
+%% The full formula is: ((V - 0x01..01) band (bnot V) band 0x80..80).
+%% The (bnot V) term filters out false positives from bytes >= 0x80,
+%% where subtracting 0x01 does not clear the high bit. This term is
+%% unnecessary here: check 1 in are_all_ascii_plain_swar/1 proves all
+%% bytes of W are < 128 before no_zero_byte is reached (thanks to
+%% andalso short-circuit evaluation), and XOR of two 7-bit values is
+%% still 7-bit, so no byte in V can have bit 7 set.
+%%
+%% We also avoid bnot because the JIT lacks an always_small fast path
+%% for it, emitting runtime type checks and bignum fallback calls even
+%% when the result provably fits in a small.
+%%
+%% Borrow propagation may flag a 0x01 byte just above a zero byte as
+%% zero too, but a borrow only ever starts at a genuine zero byte, so
+%% the whole-word answer is unchanged: the word is rejected and we
+%% fall through to the byte-by-byte path, which is always correct.
+-define(no_zero_byte(V),
+    ((V) - ?SWAR_MASK01) band ?SWAR_MASK80 =:= 0
+).
+
+%% SWAR check: all 7 bytes (in one 56-bit word) are "plain ASCII"
+%% i.e., in [32, 127] and not $" (0x22) or $\\ (0x5C).
+%%
+%% Four checks, each operating on all 7 bytes simultaneously:
+%%   1. band with 0x80..80 detects bytes >= 128
+%%   2. add 0x60..60 then band 0x80..80 detects bytes < 32
+%%      (byte + 0x60 sets the high bit iff byte >= 0x20, given byte < 0x80)
+%%   3. XOR with 0x22..22 maps $" to 0, then no_zero_byte detects it
+%%   4. XOR with 0x5C..5C maps $\\ to 0, then no_zero_byte detects it
+-define(are_all_ascii_plain_swar(W),
+    (W) band ?SWAR_MASK80 =:= 0 andalso
+    ((W) + 16#60606060606060) band ?SWAR_MASK80 =:= ?SWAR_MASK80 andalso
+    ?no_zero_byte((W) bxor 16#22222222222222) andalso
+    ?no_zero_byte((W) bxor 16#5C5C5C5C5C5C5C)
 ).
-- 
2.51.0
Places

File 4351-Use-SWAR-to-optimize-JSON-string-scanning.patch of Package erlang

Places