File 0128-jit-Be-more-paranoid-about-ARM-instruction-caches.patch of Package erlang
From a7a2db8c03ea8ad7012d55fff02fc76d207457cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?John=20H=C3=B6gberg?= <john@erlang.org>
Date: Fri, 9 Jun 2023 18:32:13 +0200
Subject: [PATCH] jit: Be more paranoid about ARM instruction caches

Not every processor has a minimum cache line size of 64 bytes, and
the barriers were a bit more lax than they should have been.
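
For reference, the minimum line sizes come from CTR_EL0; a minimal
standalone sketch (not part of this change, assuming an AArch64 machine
and a GCC-compatible compiler) that prints them:

    /* ctr_el0.c: print the minimum D/I cache line sizes.
     *
     * CTR_EL0 encodes IminLine in bits [3:0] and DminLine in bits
     * [19:16], both as log2 of the line size in 4-byte words. */
    #include <stdio.h>

    int main(void) {
        unsigned long ctr_el0;

        __asm__ __volatile__("mrs %0, ctr_el0" : "=r"(ctr_el0));

        printf("min dcache line: %lu bytes\n", 4UL << ((ctr_el0 >> 16) & 0xF));
        printf("min icache line: %lu bytes\n", 4UL << (ctr_el0 & 0xF));

        return 0;
    }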
---
erts/emulator/beam/jit/beam_jit_main.cpp | 70 +++++++++++++++++++-----
 1 file changed, 55 insertions(+), 15 deletions(-)

diff --git a/erts/emulator/beam/jit/beam_jit_main.cpp b/erts/emulator/beam/jit/beam_jit_main.cpp
index 3862663877..7692e4b2d9 100644
--- a/erts/emulator/beam/jit/beam_jit_main.cpp
+++ b/erts/emulator/beam/jit/beam_jit_main.cpp
@@ -76,6 +76,32 @@ static BeamGlobalAssembler *bga;
static BeamModuleAssembler *bma;
static CpuInfo cpuinfo;

+#if defined(__aarch64__) && !(defined(WIN32) || defined(__APPLE__)) && \
+ defined(__GNUC__) && defined(ERTS_THR_INSTRUCTION_BARRIER) && \
+ ETHR_HAVE_GCC_ASM_ARM_IC_IVAU_INSTRUCTION && \
+ ETHR_HAVE_GCC_ASM_ARM_DC_CVAU_INSTRUCTION
+# define BEAMASM_MANUAL_ICACHE_FLUSHING
+#endif
+
+#ifdef BEAMASM_MANUAL_ICACHE_FLUSHING
+static UWord min_icache_line_size;
+static UWord min_dcache_line_size;
+#endif
+
+static void init_cache_info() {
+#if defined(__aarch64__) && defined(BEAMASM_MANUAL_ICACHE_FLUSHING)
+ UWord ctr_el0;
+
+ /* DC/IC operate on a cache line basis, so we need to step according to the
+ * _smallest_ data and instruction cache line size.
+ *
+ * Query the Cache Type Register (CTR_EL0) to find out what they are. */
+ __asm__ __volatile__("mrs %0, ctr_el0\n" : "=r"(ctr_el0));
+ min_dcache_line_size = (4 << ((ctr_el0 >> 16) & 0xF));
+ min_icache_line_size = (4 << (ctr_el0 & 0xF));
+#endif
+}
+
/*
* Enter all BIFs into the export table.
*
@@ -257,6 +283,7 @@ void beamasm_init() {
#endif
beamasm_metadata_early_init();
+ init_cache_info();
/*
* Ensure that commonly used fields in the PCB can be accessed with
@@ -421,25 +448,38 @@ extern "C"
#elif defined(__aarch64__) && defined(__APPLE__)
/* Issues full memory/instruction barriers on all threads for us. */
sys_icache_invalidate((char *)address, size);
-#elif defined(__aarch64__) && defined(__GNUC__) && \
- defined(ERTS_THR_INSTRUCTION_BARRIER) && \
- ETHR_HAVE_GCC_ASM_ARM_IC_IVAU_INSTRUCTION && \
- ETHR_HAVE_GCC_ASM_ARM_DC_CVAU_INSTRUCTION
- /* Note that we do not issue any barriers here, whether instruction or
- * memory. This is on purpose as we must issue those on all schedulers
+#elif defined(__aarch64__) && defined(BEAMASM_MANUAL_ICACHE_FLUSHING)
+ /* Note that we do not issue an instruction synchronization barrier
+ * here. This is on purpose as we must issue those on all schedulers
* and not just the calling thread, and the chances of us forgetting to
- * do that is much higher if we issue them here. */
- UWord start = reinterpret_cast<UWord>(address);
- UWord end = start + size;
+ * do that is much higher if we issue one here. */
+ UWord start, end, stride;

- ETHR_COMPILER_BARRIER;
+ start = reinterpret_cast<UWord>(address);
+ end = start + size;

- for (UWord i = start & ~ERTS_CACHE_LINE_MASK; i < end;
- i += ERTS_CACHE_LINE_SIZE) {
- __asm__ __volatile__("dc cvau, %0\n"
- "ic ivau, %0\n" ::"r"(i)
- :);
+ stride = min_dcache_line_size;
+ for (UWord i = start & ~(stride - 1); i < end; i += stride) {
+ __asm__ __volatile__("dc cvau, %0\n" ::"r"(i) :);
}
+
+ /* We need a special memory barrier between clearing dcache and icache,
+ * or there's a chance that the icache on another core is invalidated
+ * before the dcache, which can then be repopulated with stale data. */
+ __asm__ __volatile__("dsb ish\n" ::: "memory");
+
+ stride = min_icache_line_size;
+ for (UWord i = start & ~(stride - 1); i < end; i += stride) {
+ __asm__ __volatile__("ic ivau, %0\n" ::"r"(i) :);
+ }
+
+ /* Ensures that all cores clear their instruction cache before moving
+ * on. The usual full memory barrier (`dmb sy`) executed by the thread
+ * progress mechanism is not sufficient for this.
+ *
+ * Note that this barrier need not be executed on other cores, it's
+ * enough for them to issue an instruction synchronization barrier. */
+ __asm__ __volatile__("dsb ish\n" ::: "memory");
#elif (defined(__x86_64__) || defined(_M_X64)) && \
defined(ERTS_THR_INSTRUCTION_BARRIER)
/* We don't need to invalidate cache on this platform, but since we
--
2.35.3