File 1131-x86_64-Optimize-creation-of-fun-environment.patch of Package erlang
From ae127203ac2423d057e1ef151d4ca8b114740b84 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Thu, 7 Sep 2023 17:04:03 +0200
Subject: [PATCH 11/25] x86_64: Optimize creation of fun environment
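When two adjacent free variables are held in consecutive registers,
use SSE/AVX to copy both of them to the fun environment at once. If
the registers are in reverse order, the pair is swapped while being
loaded, which requires AVX; without AVX, or when the registers are
unrelated, each register is copied separately as before.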
---
erts/emulator/beam/jit/x86/instr_fun.cpp | 38 ++++++++++++++++++++++--
1 file changed, 35 insertions(+), 3 deletions(-)
diff --git a/erts/emulator/beam/jit/x86/instr_fun.cpp b/erts/emulator/beam/jit/x86/instr_fun.cpp
index 2d7fe8c168..e621d2529f 100644
--- a/erts/emulator/beam/jit/x86/instr_fun.cpp
+++ b/erts/emulator/beam/jit/x86/instr_fun.cpp
@@ -206,9 +206,41 @@ void BeamModuleAssembler::emit_i_make_fun3(const ArgLambda &Lambda,
comment("Move fun environment");
for (unsigned i = 0; i < num_free; i++) {
- mov_arg(x86::qword_ptr(RET,
- offsetof(ErlFunThing, env) + i * sizeof(Eterm)),
- env[i]);
+ const ArgVal &next = i + 1 < num_free ? env[i + 1] : ArgNil();
+ switch (ArgVal::memory_relation(env[i], next)) {
+ case ArgVal::Relation::consecutive: {
+ x86::Mem src_ptr = getArgRef(env[i].as<ArgRegister>(), 16);
+ x86::Mem dst_ptr = x86::xmmword_ptr(RET,
+ offsetof(ErlFunThing, env) +
+ i * sizeof(Eterm));
+ comment("(moving two items)");
+ vmovups(x86::xmm0, src_ptr);
+ vmovups(dst_ptr, x86::xmm0);
+ i++;
+ break;
+ }
+ case ArgVal::Relation::reverse_consecutive: {
+ if (!hasCpuFeature(CpuFeatures::X86::kAVX)) {
+ goto fallback;
+ }
+ x86::Mem src_ptr = getArgRef(env[i + 1].as<ArgRegister>(), 16);
+ x86::Mem dst_ptr = x86::xmmword_ptr(RET,
+ offsetof(ErlFunThing, env) +
+ i * sizeof(Eterm));
+ comment("(moving and swapping two items)");
+ a.vpermilpd(x86::xmm0, src_ptr, 1); /* Load and swap */
+ a.vmovups(dst_ptr, x86::xmm0);
+ i++;
+ break;
+ }
+ case ArgVal::Relation::none:
+ fallback:
+ mov_arg(x86::qword_ptr(RET,
+ offsetof(ErlFunThing, env) +
+ i * sizeof(Eterm)),
+ env[i]);
+ break;
+ }
}
comment("Create boxed ptr");
--
2.35.3