File 2941-Reduce-register-shuffling.patch of Package erlang

From 3643376f3f789f2f4920af43978d13d400d746cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Tue, 16 Feb 2021 03:53:22 +0100
Subject: [PATCH] Reduce register shuffling

Consider this code:

    send_deadline_cont(SockRef, Bin, Cont, Deadline, HasWritten) ->
        SelectHandle = make_ref(),
        Sent = prim_socket:send(SockRef, Bin, Cont, SelectHandle),
        {SockRef, Bin, SelectHandle, Deadline, HasWritten, Sent}.

Looking closer at the BEAM code between the two calls we see four
`move` instructions followed by three `swap` instructions:

    {call_ext,0,{extfunc,erlang,make_ref,0}}.
    {move,{y,2},{x,3}}.
    {move,{y,3},{x,2}}.
    {move,{x,0},{y,2}}.
    {move,{y,4},{x,1}}.
    {swap,{x,2},{x,1}}.
    {swap,{x,3},{x,2}}.
    {swap,{x,0},{x,3}}.
    {call_ext,4,{extfunc,prim_socket,send,4}}.

It turns out that the sequence of `move` and `swap` instructions is
longer than it needs to be, because the initial two `move`
instructions move the terms in two of the Y registers to the wrong X
registers. Two of the `swap` instructions are needed merely to move
the terms to the right registers.

This commit updates optimizations in `beam_ssa_pre_codegen` and
`beam_block` to eliminate the extra instructions:

    {call_ext,0,{extfunc,erlang,make_ref,0}}.
    {move,{y,2},{x,2}}.
    {move,{y,3},{x,1}}.
    {move,{y,4},{x,3}}.
    {move,{x,0},{y,2}}.
    {swap,{x,0},{x,3}}.
    {call_ext,4,{extfunc,prim_socket,send,4}}.

By counting the number of inserted and deleted lines in the diff between
a run of the `diffable` before this change and a run after this change,
we can see that this commit results in 1099 fewer instructions in total.
---
 lib/compiler/src/beam_block.erl           | 51 ++++++++++--
 lib/compiler/src/beam_ssa_pre_codegen.erl | 95 ++++++++++++++++-------
 2 files changed, 112 insertions(+), 34 deletions(-)

diff --git a/lib/compiler/src/beam_block.erl b/lib/compiler/src/beam_block.erl
index 6bc334963f..e8d024b20b 100644
--- a/lib/compiler/src/beam_block.erl
+++ b/lib/compiler/src/beam_block.erl
@@ -22,7 +22,7 @@
 -module(beam_block).
 
 -export([module/2]).
--import(lists, [keysort/2,reverse/1,splitwith/2]).
+-import(lists, [keysort/2,reverse/1,reverse/2,splitwith/2]).
 
 -spec module(beam_utils:module_code(), [compile:option()]) ->
                     {'ok',beam_utils:module_code()}.
@@ -51,20 +51,55 @@ function({function,Name,Arity,CLabel,Is0}) ->
 %%% for the moves generated by a sequence of SSA instructions.
 %%% Therefore, this optimization is needed.
 %%%
+%%% We'll need to handle non-consecutive sequences of moves, such
+%%% as the following instruction sequence:
+%%%
+%%%     move y2, x2
+%%%     move x0, y2
+%%%     move y1, x1
+%%%     init_yregs [y1]
+%%%     move x2, x0
+%%%
+%%% The first two `move` instructions and the last `move` instruction
+%%% should be combined to a `swap` instruction:
+%%%
+%%%     swap y2, x0
+%%%     move y1, x1
+%%%     init_yregs [y1]
+%%%
+%%% (Provided that x2 is killed in the code that follows.)
+%%%
 
-swap_opt([{move,Reg1,{x,X}=Temp}=Move1,
-          {move,Reg2,Reg1}=Move2,
-          {move,Temp,Reg2}=Move3|Is]) when Reg1 =/= Temp ->
-    case is_unused(X, Is) of
-        true ->
+swap_opt([{move,Reg1,{x,_}=Temp}=Move1,
+          {move,Reg2,Reg1}=Move2|Is0]) when Reg1 =/= Temp ->
+    case swap_opt_end(Is0, Temp, Reg2, []) of
+        {yes,Is} ->
             [{swap,Reg1,Reg2}|swap_opt(Is)];
-        false ->
-            [Move1|swap_opt([Move2,Move3|Is])]
+        no ->
+            [Move1|swap_opt([Move2|Is0])]
     end;
 swap_opt([I|Is]) ->
     [I|swap_opt(Is)];
 swap_opt([]) -> [].
 
+swap_opt_end([{move,S,D}=I|Is], Temp, Dst, Acc) ->
+    case {S,D} of
+        {Temp,Dst} ->
+            {x,X} = Temp,
+            case is_unused(X, Is) of
+                true -> {yes,reverse(Acc, Is)};
+                false -> no
+            end;
+        {Temp,_} -> no;
+        {Dst,_} -> no;
+        {_,Temp} -> no;
+        {_,Dst} -> no;
+        {_,_} -> swap_opt_end(Is, Temp, Dst, [I|Acc])
+    end;
+swap_opt_end([{init_yregs,_}=I|Is], Temp, Dst, Acc) ->
+    swap_opt_end(Is, Temp, Dst, [I|Acc]);
+swap_opt_end(_, _, _, _) -> no.
+
 is_unused(X, [{call,A,_}|_]) when A =< X -> true;
 is_unused(X, [{call_ext,A,_}|_]) when A =< X -> true;
 is_unused(X, [{make_fun2,_,_,_,A}|_]) when A =< X -> true;
diff --git a/lib/compiler/src/beam_ssa_pre_codegen.erl b/lib/compiler/src/beam_ssa_pre_codegen.erl
index 2337ec2b19..6ff4e3001f 100644
--- a/lib/compiler/src/beam_ssa_pre_codegen.erl
+++ b/lib/compiler/src/beam_ssa_pre_codegen.erl
@@ -2064,46 +2064,89 @@ copy_retval_is([], RC, _, Copy, Count, Acc) ->
 %%
 %% Consider this code:
 %%
-%%   Var = ...
+%%   P = ...
+%%   Q = ...
 %%   ...
-%%   A1 = call foo/0
-%%   A = copy A1
-%%   B = call bar/1, Var
+%%   A = call foo/0
+%%   A1 = copy A
+%%   B = call bar/2, P, Q
 %%
-%% If the Var variable is no longer used after this code, its Y register
-%% can't be reused for A. To allow the Y register to be reused
-%% we will need to insert 'copy' instructions for arguments that are
-%% in Y registers:
+%% If the P or Q variables are no longer used after this code, one of
+%% their Y registers can't be reused for A. To allow one of the Y registers to
+%% be reused we will need to insert 'copy' instructions for arguments
+%% that are in Y registers:
 %%
-%%   Var = ...
+%%   P = ...
+%%   Q = ...
 %%   ...
 %%   A1 = call foo/0
-%%   Var1 = copy Var
+%%   Q1 = copy Q
+%%   P1 = copy P
+%%   A = copy A1
+%%   B = call bar/2, P1, Q1
+%%
+%% Note that copies of the arguments are done in reverse order to help the
+%% reserve_xregs/3 function place the copies into the X registers they will
+%% need to be in.
+%%
+%% For this example, P1 needs to be in x0 and Q1 needs to be in x1. If we
+%% would copy the arguments in order the registers would be assigned like
+%% this:
+%%
+%%   x0/A1 = call foo/0
+%%   x1/P1 = copy P
+%%   x2/Q1 = copy Q
+%%   A = copy A1
+%%   B = call bar/2, P1, Q1
+%%
+%% That is, both P1 and Q1 would be misplaced and would have to be
+%% moved to their correct registers before the call. However, with the
+%% copies in reverse order and with a little help from
+%% reserve_xregs/3, at least the Q1 variable can be can be placed in
+%% the correct register:
+%%
+%%   x0/A1 = call foo/0
+%%   x1/Q1 = copy Q
+%%   x2/P1 = copy P
 %%   A = copy A1
-%%   B = call bar/1, Var1
+%%   B = call bar/2, P1, Q1
+%%
+%% In general, all but the first argument can be placed in their correct registers.
 %%
 
 place_retval_copy(I, _Yregs, none, Count, Acc) ->
+    %% There is no copy of a previous return value, so there is nothing
+    %% to gain by copying the function arguments.
     {I,Count,Acc};
-place_retval_copy(#b_set{args=[F|Args0]}=I, Yregs, Copy, Count0, Acc0) ->
-    #b_set{dst=Avoid} = Copy,
-    {Args,Acc1,Count} = copy_func_args(Args0, Yregs, Avoid, Acc0, [], Count0),
-    Acc = [Copy|Acc1],
-    {I#b_set{args=[F|Args]},Count,Acc}.
-
-copy_func_args([#b_var{name=AName}=A|As], Yregs, Avoid, CopyAcc, Acc, Count0) ->
+place_retval_copy(#b_set{args=[F|Args0]}=I0, Yregs0, RetCopy, Count0, Acc0) ->
+    %% Copy function arguments, but make sure that we don't make an extra
+    %% copy of the previous return value.
+    #b_set{dst=Avoid} = RetCopy,
+    Yregs = sets:del_element(Avoid, Yregs0),
+    {Args,Acc1,Count} = copy_func_args(Args0, Yregs, Acc0, Count0),
+    I = I0#b_set{args=[F|Args]},
+
+    %% Place the copy instruction for the previous return value after the
+    %% copy instruction for the arguments.
+    Acc = [RetCopy|Acc1],
+    {I,Count,Acc}.
+
+copy_func_args(Args, Yregs, Acc, Count) ->
+    copy_func_args_1(reverse(Args), Yregs, Acc, [], Count).
+
+copy_func_args_1([#b_var{name=AName}=A|As], Yregs, InstrAcc, ArgAcc, Count0) ->
     case sets:is_element(A, Yregs) of
-        true when A =/= Avoid ->
+        true ->
             {NewVar,Count} = new_var(AName, Count0),
             Copy = #b_set{op=copy,dst=NewVar,args=[A]},
-            copy_func_args(As, Yregs, Avoid, [Copy|CopyAcc], [NewVar|Acc], Count);
-        _ ->
-            copy_func_args(As, Yregs, Avoid, CopyAcc, [A|Acc], Count0)
+            copy_func_args_1(As, Yregs, [Copy|InstrAcc], [NewVar|ArgAcc], Count);
+        false ->
+            copy_func_args_1(As, Yregs, InstrAcc, [A|ArgAcc], Count0)
     end;
-copy_func_args([A|As], Yregs, Avoid, CopyAcc, Acc, Count) ->
-    copy_func_args(As, Yregs, Avoid, CopyAcc, [A|Acc], Count);
-copy_func_args([], _Yregs, _Avoid, CopyAcc, Acc, Count) ->
-    {reverse(Acc),CopyAcc,Count}.
+copy_func_args_1([A|As], Yregs, InstrAcc, ArgAcc, Count) ->
+    copy_func_args_1(As, Yregs, InstrAcc, [A|ArgAcc], Count);
+copy_func_args_1([], _Yregs, InstrAcc, ArgAcc, Count) ->
+    {ArgAcc,InstrAcc,Count}.
 
 acc_copy(Acc, none) -> Acc;
 acc_copy(Acc, #b_set{}=Copy) -> [Copy|Acc].
-- 
2.26.2

openSUSE Build Service is sponsored by