File 1861-Improve-register-allocation-for-try-.-catch-construc.patch of Package erlang
From 8b814a7419db1f710f2a853f25a51269ce16133d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Mon, 27 Jan 2025 15:09:04 +0100
Subject: [PATCH] Improve register allocation for try ... catch constructs
Consider this function:
do_try(F) ->
try
F()
catch
C:R:Stk ->
{'EXIT',C,R,Stk}
end.
The compiler in Erlang/OTP 27 generates the following code for the
body of the function:
{allocate,3,1}. % Allocate three Y registers.
{init_yregs,{list,[{y,0},{y,1}]}}.
{'try',{y,2},{f,3}}.
{call_fun,0}.
{try_end,{y,2}}.
{deallocate,3}.
return.
{label,3}.
{try_case,{y,2}}.
%% {y,2} is free, but is not being reused.
{move,{x,1},{y,0}}.
{move,{x,0},{y,1}}.
{move,{x,2},{x,0}}.
build_stacktrace.
{test_heap,5,1}.
{put_tuple2,{x,0},{list,[{atom,'EXIT'},{y,1},{y,0},{x,0}]}}.
{deallocate,3}.
return.
Note that a stackframe with three Y registers will be allocated. It
would be sufficient to allocate two Y registers if `{y,2}` could be
reused following the `{try_case,{y,2}}` instruction.
This commit extends an existing optimization to ensure that `{y,2}`
can be reused, resulting in the following code:
{allocate,2,1}. % Allocate two Y registers.
{init_yregs,{list,[{y,0}]}}.
{'try',{y,1},{f,3}}.
{line,[{location,"t.erl",7}]}.
{call_fun,0}.
{try_end,{y,1}}.
{deallocate,2}.
return.
{label,3}.
{try_case,{y,1}}.
{move,{x,0},{y,0}}.
{move,{x,1},{y,1}}. % Reusing {y,1}.
{move,{x,2},{x,0}}.
build_stacktrace.
{test_heap,5,1}.
{put_tuple2,{x,0},{list,[{atom,'EXIT'},{y,0},{y,1},{x,0}]}}.
{deallocate,2}.
return.
This optimization improves the code in more than 200 modules out of
the sample of about 1000 modules compiled by `./scripts/diffable`.
---
lib/compiler/src/beam_ssa_pre_codegen.erl | 65 +++++++++++++++++++++--
1 file changed, 62 insertions(+), 3 deletions(-)
diff --git a/lib/compiler/src/beam_ssa_pre_codegen.erl b/lib/compiler/src/beam_ssa_pre_codegen.erl
index a985b7c8c3..43aea9f949 100644
--- a/lib/compiler/src/beam_ssa_pre_codegen.erl
+++ b/lib/compiler/src/beam_ssa_pre_codegen.erl
@@ -1990,6 +1990,11 @@ copy_retval_is([#b_set{op=call,dst=#b_var{}=Dst}=I0|Is], RC, Yregs,
false ->
copy_retval_is(Is, RC, Yregs, none, Count1, [I1|Acc])
end;
+copy_retval_is([#b_set{op=landingpad,args=[#b_literal{val='try'}|_]=Args0}=I0|Is],
+ _RC, Yregs, Copy, Count, Acc0) ->
+ I = I0#b_set{args=copy_sub_args(Args0, Copy)},
+ Acc = [I|acc_copy(Acc0, Copy)],
+ copy_landingpad(Is, Yregs, Count, Acc, []);
copy_retval_is([#b_set{args=Args0}=I0|Is], RC, Yregs, Copy, Count, Acc) ->
I = I0#b_set{args=copy_sub_args(Args0, Copy)},
case beam_ssa:clobbers_xregs(I) of
@@ -2008,6 +2013,53 @@ copy_retval_is([], RC, _, Copy, Count, Acc) ->
{reverse(Acc, [Copy]),Count}
end.
+%% Consider this function:
+%%
+%% do_try(F) ->
+%% try F()
+%% catch
+%% C:R:Stk ->
+%% {'EXIT',C,R,Stk}
+%% end.
+%%
+%% That would result in the following SSA code for the `catch` clause:
+%%
+%% z0/_16 = landingpad `'try'`, y2/_14
+%% y1/_4 = extract z0/_16, `0`
+%% y0/_3 = extract z0/_16, `1`
+%% x0/_2 = extract z0/_16, `2`
+%% z0/_17 = kill_try_tag y2/_14
+%% x0/Stk = build_stacktrace x0/_2
+%%
+%% Note that three Y registers are required. That can be reduced to
+%% two Y registers if we rewrite the code like so:
+%%
+%% x0/_37 = extract z0/_16, `0`
+%% x1/_38 = extract z0/_16, `1`
+%% x2/_2 = extract z0/_16, `2`
+%% z0/_17 = kill_try_tag y1/_14
+%% y1/_3 = copy x1/_38
+%% y0/_4 = copy x0/_37
+%%
+
+copy_landingpad([I0|Is], Yregs, Count0, Acc0, Copies0) ->
+ case I0 of
+ #b_set{dst=Dst,op=extract} ->
+ case sets:is_element(Dst, Yregs) of
+ true ->
+ {NewDst,Count} = new_var(Count0),
+ Copies = [#b_set{op=copy,dst=Dst,args=[NewDst]}|Copies0],
+ I = I0#b_set{dst=NewDst},
+ Acc = [I|Acc0],
+ copy_landingpad(Is, Yregs, Count, Acc, Copies);
+ false ->
+ Acc = [I0|Acc0],
+ copy_landingpad(Is, Yregs, Count0, Acc, Copies0)
+ end;
+ #b_set{op=kill_try_tag} ->
+ {reverse(Acc0, [I0|Copies0 ++ Is]),Count0}
+ end.
+
%%
%% Consider this code:
%%
@@ -2770,15 +2822,22 @@ reserve_xregs_is([gc|Is], Res, Xs0, Used) ->
Xs = res_xregs_prune(Xs0, Used, Res),
reserve_xregs_is(Is, Res, Xs, Used);
reserve_xregs_is([#b_set{op=Op,dst=Dst,args=Args}=I|Is], Res0, Xs0, Used0) ->
- Res = reserve_xreg(Dst, Xs0, Res0),
+ Res1 = reserve_xreg(Dst, Xs0, Res0),
Used1 = ordsets:union(Used0, beam_ssa:used(I)),
Used = ordsets:del_element(Dst, Used1),
case Op of
call ->
Xs = reserve_call_args(tl(Args)),
- reserve_xregs_is(Is, Res, Xs, Used);
+ reserve_xregs_is(Is, Res1, Xs, Used);
+ extract ->
+ %% Avoid potential register shuffling by pinning the
+ %% destination variable to the X register where the
+ %% runtime system will place it.
+ [_,#b_literal{val=Reg}] = Args,
+ Res = Res1#{Dst => {x,Reg}},
+ reserve_xregs_is(Is, Res, Xs0, Used);
_ ->
- reserve_xregs_is(Is, Res, Xs0, Used)
+ reserve_xregs_is(Is, Res1, Xs0, Used)
end;
reserve_xregs_is([], Res, Xs, _Used) ->
{Res,Xs}.
--
2.43.0