File 0128-jit-Be-more-paranoid-about-ARM-instruction-caches.patch of Package erlang

From a7a2db8c03ea8ad7012d55fff02fc76d207457cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?John=20H=C3=B6gberg?= <john@erlang.org>
Date: Fri, 9 Jun 2023 18:32:13 +0200
Subject: [PATCH] jit: Be more paranoid about ARM instruction caches

Not every processor has a minimum cache line size of 64 bytes, so the
flush loops now step by the minimum data and instruction cache line
sizes reported by CTR_EL0 instead of assuming ERTS_CACHE_LINE_SIZE.
The barriers were also laxer than they should have been: a dsb ish is
required both between cleaning the dcache and invalidating the icache,
and after the invalidation has completed.
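
For reference, a minimal sketch of how the line sizes are decoded from
CTR_EL0 (not part of this patch; probe_cache_lines and the variable
names are illustrative):

    #include <stdint.h>

    static void probe_cache_lines(unsigned *dline, unsigned *iline) {
        uint64_t ctr;

        /* CTR_EL0 holds IminLine in bits [3:0] and DminLine in bits
         * [19:16], each the log2 of the line size in 4-byte words,
         * which is where the "4 << field" decoding comes from. */
        __asm__ __volatile__("mrs %0, ctr_el0" : "=r"(ctr));

        *dline = 4u << ((ctr >> 16) & 0xF); /* dcache line in bytes */
        *iline = 4u << (ctr & 0xF);         /* icache line in bytes */
    }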
---
 erts/emulator/beam/jit/beam_jit_main.cpp | 70 +++++++++++++++++++-----
 1 file changed, 55 insertions(+), 15 deletions(-)

diff --git a/erts/emulator/beam/jit/beam_jit_main.cpp b/erts/emulator/beam/jit/beam_jit_main.cpp
index 3862663877..7692e4b2d9 100644
--- a/erts/emulator/beam/jit/beam_jit_main.cpp
+++ b/erts/emulator/beam/jit/beam_jit_main.cpp
@@ -76,6 +76,32 @@ static BeamGlobalAssembler *bga;
 static BeamModuleAssembler *bma;
 static CpuInfo cpuinfo;
 
+#if defined(__aarch64__) && !(defined(WIN32) || defined(__APPLE__)) &&         \
+        defined(__GNUC__) && defined(ERTS_THR_INSTRUCTION_BARRIER) &&          \
+        ETHR_HAVE_GCC_ASM_ARM_IC_IVAU_INSTRUCTION &&                           \
+        ETHR_HAVE_GCC_ASM_ARM_DC_CVAU_INSTRUCTION
+#    define BEAMASM_MANUAL_ICACHE_FLUSHING
+#endif
+
+#ifdef BEAMASM_MANUAL_ICACHE_FLUSHING
+static UWord min_icache_line_size;
+static UWord min_dcache_line_size;
+#endif
+
+static void init_cache_info() {
+#if defined(__aarch64__) && defined(BEAMASM_MANUAL_ICACHE_FLUSHING)
+    UWord ctr_el0;
+
+    /* DC/IC operate on a cache line basis, so we need to step according to the
+     * _smallest_ data and instruction cache line size.
+     *
+     * Query the "Cache Type Register" (CTR_EL0) to find out what they are. */
+    __asm__ __volatile__("mrs %0, ctr_el0\n" : "=r"(ctr_el0));
+    min_dcache_line_size = (4 << ((ctr_el0 >> 16) & 0xF));
+    min_icache_line_size = (4 << (ctr_el0 & 0xF));
+#endif
+}
+
 /*
  * Enter all BIFs into the export table.
  *
@@ -257,6 +283,7 @@ void beamasm_init() {
 #endif
 
     beamasm_metadata_early_init();
+    init_cache_info();
 
     /*
      * Ensure that commonly used fields in the PCB can be accessed with
@@ -421,25 +448,38 @@ extern "C"
 #elif defined(__aarch64__) && defined(__APPLE__)
         /* Issues full memory/instruction barriers on all threads for us. */
         sys_icache_invalidate((char *)address, size);
-#elif defined(__aarch64__) && defined(__GNUC__) &&                             \
-        defined(ERTS_THR_INSTRUCTION_BARRIER) &&                               \
-        ETHR_HAVE_GCC_ASM_ARM_IC_IVAU_INSTRUCTION &&                           \
-        ETHR_HAVE_GCC_ASM_ARM_DC_CVAU_INSTRUCTION
-        /* Note that we do not issue any barriers here, whether instruction or
-         * memory. This is on purpose as we must issue those on all schedulers
+#elif defined(__aarch64__) && defined(BEAMASM_MANUAL_ICACHE_FLUSHING)
+        /* Note that we do not issue an instruction synchronization barrier
+         * here. This is on purpose as we must issue those on all schedulers
          * and not just the calling thread, and the chances of us forgetting to
-         * do that is much higher if we issue them here. */
-        UWord start = reinterpret_cast<UWord>(address);
-        UWord end = start + size;
+         * do that are much higher if we issue one here. */
+        UWord start, end, stride;
 
-        ETHR_COMPILER_BARRIER;
+        start = reinterpret_cast<UWord>(address);
+        end = start + size;
 
-        for (UWord i = start & ~ERTS_CACHE_LINE_MASK; i < end;
-             i += ERTS_CACHE_LINE_SIZE) {
-            __asm__ __volatile__("dc cvau, %0\n"
-                                 "ic ivau, %0\n" ::"r"(i)
-                                 :);
+        stride = min_dcache_line_size;
+        for (UWord i = start & ~(stride - 1); i < end; i += stride) {
+            __asm__ __volatile__("dc cvau, %0\n" ::"r"(i) :);
         }
+
+        /* We need a DSB between cleaning the dcache and invalidating the
+         * icache; otherwise another core's icache could be invalidated and
+         * then refilled with stale data before our dcache cleans complete. */
+        __asm__ __volatile__("dsb ish\n" ::: "memory");
+
+        stride = min_icache_line_size;
+        for (UWord i = start & ~(stride - 1); i < end; i += stride) {
+            __asm__ __volatile__("ic ivau, %0\n" ::"r"(i) :);
+        }
+
+        /* Ensures that all cores clear their instruction cache before moving
+         * on. The usual full memory barrier (`dmb sy`) executed by the thread
+         * progress mechanism is not sufficient for this.
+         *
+         * Note that this barrier need not be executed on other cores; it is
+         * enough for them to issue an instruction synchronization barrier. */
+        __asm__ __volatile__("dsb ish\n" ::: "memory");
 #elif (defined(__x86_64__) || defined(_M_X64)) &&                              \
         defined(ERTS_THR_INSTRUCTION_BARRIER)
         /* We don't need to invalidate cache on this platform, but since we
-- 
2.35.3
