File 2662-Optimize-deallocate_return-instructions.patch of Package erlang

From 8041849a8e55281b4d954e63e9415995607e1870 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Fri, 16 Aug 2019 16:04:02 +0200
Subject: [PATCH 2/3] Optimize deallocate_return instructions

Eliminating the CP register and putting continuation
pointers directly on the stack made the deallocate_return
instruction slower.

Try to mitigate this slowdown by specializing deallocate_return
for small stack frames (0 to 4 slots). For the
move_deallocate_return instruction, reorder the operations so
that the reads are done first and can be executed in parallel.
---
 erts/emulator/beam/instrs.tab | 71 ++++++++++++++++++++++++++++++++++++++++---
 erts/emulator/beam/ops.tab    | 12 ++++++++
 2 files changed, 79 insertions(+), 4 deletions(-)

diff --git a/erts/emulator/beam/instrs.tab b/erts/emulator/beam/instrs.tab
index efdba73057..807f4512d1 100644
--- a/erts/emulator/beam/instrs.tab
+++ b/erts/emulator/beam/instrs.tab
@@ -66,18 +66,81 @@ deallocate(Deallocate) {
     E = ADD_BYTE_OFFSET(E, $Deallocate);
 }
 
+deallocate_return0 := dealloc_ret.n0.execute;
+deallocate_return1 := dealloc_ret.n1.execute;
+deallocate_return2 := dealloc_ret.n2.execute;
+deallocate_return3 := dealloc_ret.n3.execute;
+deallocate_return4 := dealloc_ret.n4.execute;
+
+dealloc_ret.head() {
+    Uint num_bytes;
+}
+
+dealloc_ret.n0() {
+    num_bytes = (0+1) * sizeof(Eterm);
+}
+
+dealloc_ret.n1() {
+    num_bytes = (1+1) * sizeof(Eterm);
+}
+
+dealloc_ret.n2() {
+    num_bytes = (2+1) * sizeof(Eterm);
+}
+
+dealloc_ret.n3() {
+    num_bytes = (3+1) * sizeof(Eterm);
+}
+
+dealloc_ret.n4() {
+    num_bytes = (4+1) * sizeof(Eterm);
+}
+
+dealloc_ret.execute() {
+    //| -no_next
+
+    /*
+     * Micro-benchmarks showed that the deallocate_return instruction
+     * became slower when the continuation pointer was moved from
+     * the process struct to the stack. The reason seems to be read
+     * dependencies, i.e. that the CPU cannot figure out beforehand
+     * from which position on the stack the continuation pointer
+     * should be fetched.
+     *
+     * Making sure that num_bytes is always initialized with a
+     * constant value seems to restore the lost speed.
+     */
+
+    E = ADD_BYTE_OFFSET(E, num_bytes);
+    $RETURN();
+    CHECK_TERM(x(0));
+    DispatchReturn;
+}
+
 deallocate_return(Deallocate) {
     //| -no_next
-    int words_to_pop = $Deallocate;
-    E = ADD_BYTE_OFFSET(E, words_to_pop);
+    Uint bytes_to_pop = $Deallocate;
+    E = ADD_BYTE_OFFSET(E, bytes_to_pop);
     $RETURN();
     CHECK_TERM(x(0));
     DispatchReturn;
 }
 
 move_deallocate_return(Src, Deallocate) {
-    x(0) = $Src;
-    $deallocate_return($Deallocate);
+    //| -no_next
+
+    /*
+     * Explicitly do reads first to mitigate the impact of read
+     * dependencies.
+     */
+
+    Uint bytes_to_pop = $Deallocate;
+    Eterm src = $Src;
+    E = ADD_BYTE_OFFSET(E, bytes_to_pop);
+    x(0) = src;
+    $RETURN();
+    CHECK_TERM(x(0));
+    DispatchReturn;
 }
 
 // Call instructions
diff --git a/erts/emulator/beam/ops.tab b/erts/emulator/beam/ops.tab
index f525d126e7..c0ca9260a0 100644
--- a/erts/emulator/beam/ops.tab
+++ b/erts/emulator/beam/ops.tab
@@ -596,8 +596,20 @@ move S x==0 | deallocate D | return => move_deallocate_return S D
 
 move_deallocate_return xycn Q
 
+deallocate u==0 | return => deallocate_return0
+deallocate u==1 | return => deallocate_return1
+deallocate u==2 | return => deallocate_return2
+deallocate u==3 | return => deallocate_return3
+deallocate u==4 | return => deallocate_return4
+
 deallocate D | return => deallocate_return D
 
+deallocate_return0
+deallocate_return1
+deallocate_return2
+deallocate_return3
+deallocate_return4
+
 deallocate_return Q
 
 test_heap Need u==1 | put_list Y=y x==0 x==0 => test_heap_1_put_list Need Y
-- 
2.16.4

openSUSE Build Service is sponsored by