File 5602-Add-an-option-to-map-mseg-pages-with-THP.patch of Package erlang
From aebc8f756b946baa92980af953efb7f1c7878547 Mon Sep 17 00:00:00 2001
From: lexprfuncall <5360361+lexprfuncall@users.noreply.github.com>
Date: Wed, 20 Dec 2023 15:02:55 -0800
Subject: [PATCH 2/2] Add an option to map mseg pages with THP
In order for mseg pages to be reliably mapped with pages lager than
the default page size, the mapping must start and end at a multiple of
the larger page size.
To do this, this change adds an abstraction for performing a
memory-mapping with a specified alignment. On an operating systems
like SunOS 5.9 and later, this is done by passing some extra flags to
mmap(2). On operating systems without such a capability, we must do
this manually by over-allocating and freeing the excess.
The logic in this change only affects super carrier allocations but it
can be generalized to other mseg allocations.
---
erts/doc/references/erts_alloc.md | 6 ++
erts/emulator/beam/erl_alloc.c | 17 ++++
erts/emulator/sys/common/erl_mmap.c | 140 +++++++++++++++++++++-------
erts/emulator/sys/common/erl_mmap.h | 5 +-
erts/etc/common/erlexec.c | 1 +
5 files changed, 132 insertions(+), 37 deletions(-)
diff --git a/erts/doc/src/erts_alloc.xml b/erts/doc/src/erts_alloc.xml
index e7199daa4f..2cdc48fa45 100644
--- a/erts/doc/src/erts_alloc.xml
+++ b/erts/doc/src/erts_alloc.xml
@@ -338,6 +338,14 @@
requested size with more than relative maximum cache bad fit
percent of the requested size. Defaults to <c>20</c>.</p>
</item>
+ <tag><marker id="MMlp"/><c><![CDATA[+MMlp on|off]]></c></tag>
+ <item>
+ <p>Enables the use of large pages, sometimes known
+ as huge pages or super pages, for mapping memory segment allocations. Large
+ pages improve performance by reducing TLB pressure but they can sometimes be
+ costly to allocate or can only be allocated on a best-effort basis. Currently
+ only affects memory segments allocated in a super carrier. Defaults to <c>off</c>.</p>
+ </item>
<tag><marker id="MMsco"/><c><![CDATA[+MMsco true|false]]></c></tag>
<item>
<p>Sets <seecref marker="#MMscs">super carrier</seecref> only flag.
diff --git a/erts/emulator/beam/erl_alloc.c b/erts/emulator/beam/erl_alloc.c
index 703d7bdc28..9fdba3e9a7 100644
--- a/erts/emulator/beam/erl_alloc.c
+++ b/erts/emulator/beam/erl_alloc.c
@@ -1650,6 +1650,23 @@ handle_args(int *argc, char **argv, erts_alc_hndl_args_init_t *init)
#endif
get_amount_value(argv[i]+9, argv, &i);
}
+ else if (has_prefix("lp", argv[i]+3)) {
+ char *param_end = argv[i]+5;
+ char *value = get_value(param_end, argv, &i);
+ if (sys_strcmp(value, "on") == 0) {
+#if HAVE_ERTS_MSEG
+ init->mseg.dflt_mmap.lp = 1;
+ init->mseg.literal_mmap.lp = 1;
+#endif
+ } else if (sys_strcmp(value, "off") == 0) {
+#if HAVE_ERTS_MSEG
+ init->mseg.dflt_mmap.lp = 0;
+ init->mseg.literal_mmap.lp = 0;
+#endif
+ } else {
+ bad_value(param, param_end, value);
+ }
+ }
else {
bad_param(param, param+2);
}
diff --git a/erts/emulator/sys/common/erl_mmap.c b/erts/emulator/sys/common/erl_mmap.c
index b45146a783..7a4919c65b 100644
--- a/erts/emulator/sys/common/erl_mmap.c
+++ b/erts/emulator/sys/common/erl_mmap.c
@@ -1272,18 +1272,13 @@ Eterm build_free_seg_list(Process* p, ErtsFreeSegMap* map)
#endif
static ERTS_INLINE void *
-os_mmap(void *hint_ptr, UWord size, int try_superalign)
+os_mmap(void *hint_ptr, UWord size)
{
#if HAVE_MMAP
void *res;
-#ifdef MAP_ALIGN
- if (try_superalign)
- res = mmap((void *) ERTS_SUPERALIGNED_SIZE, size, ERTS_MMAP_PROT,
- ERTS_MMAP_FLAGS|MAP_ALIGN, ERTS_MMAP_FD, 0);
- else
-#endif
- res = mmap((void *) hint_ptr, size, ERTS_MMAP_PROT,
- ERTS_MMAP_FLAGS, ERTS_MMAP_FD, 0);
+
+ res = mmap((void *) hint_ptr, size, ERTS_MMAP_PROT,
+ ERTS_MMAP_FLAGS, ERTS_MMAP_FD, 0);
if (res == MAP_FAILED)
return NULL;
return res;
@@ -1315,6 +1310,84 @@ os_munmap(void *ptr, UWord size)
#endif
}
+#define ALIGN_UP(x, a) ((void*)((((UWord)(x)) + ((a) - 1)) & ~((a) - 1)))
+#define IS_ALIGNED(x, a) ((((UWord)(x)) & ((a) - 1)) == 0)
+
+/*
+ * Just like os_mmap, but ensures that mapping is a multiple of the
+ * specified alignment. Alignment must be a power-of-2 multiple of
+ * the page size in bytes.
+ */
+static ERTS_INLINE void *
+os_mmap_aligned(UWord size, UWord alignment)
+{
+ char *result;
+#ifdef MAP_ALIGN
+
+ /*
+ * On an operating systems that support MAP_ALIGN (SunOS >=5.9) we
+ * can directly ask mmap(2) to align the virtual memory mapping.
+ */
+ result = mmap((void *) alignment, size, ERTS_MMAP_PROT,
+ ERTS_MMAP_FLAGS|MAP_ALIGN, ERTS_MMAP_FD, 0);
+ if (result == MAP_FAILED) {
+ return NULL;
+ }
+#else
+ UWord diff;
+
+ ASSERT((size % sys_page_size) == 0);
+ ASSERT((alignment % sys_page_size) == 0);
+
+ /*
+ * Allocate and test for alignment. It is possible 1) the
+ * operating aligned the allocation based its length or 2) the
+ * previous allocation aligned the next available address.
+ */
+ if ((result = os_mmap(NULL, size)) == NULL) {
+ return NULL;
+ }
+
+ if (IS_ALIGNED(result, alignment)) {
+ return result;
+ }
+
+ /*
+ * The virtual memory allocation was not aligned, clean-up the
+ * mapping so we can try a different strategy.
+ */
+ os_munmap(result, size);
+
+ /*
+ * Retry the virtual memory allocation adding padding to ensure
+ * the requested alignment.
+ */
+ if ((result = os_mmap(NULL, size + alignment)) == NULL) {
+ return NULL;
+ }
+
+ diff = (char *)ALIGN_UP(result, alignment) - result;
+
+ /*
+ * Unmap any extra pages at the beginning of the allocation. If
+ * the allocation ended up being aligned, there will be nothing to
+ * unmap.
+ */
+ if (diff != 0) {
+ os_munmap(result, diff);
+ result += diff;
+ }
+
+ /*
+ * Unmap extra pages at the end of the allocation. There must
+ * always be at least one.
+ */
+ os_munmap(result + size, alignment - diff);
+#endif
+
+ return result;
+}
+
#ifdef ERTS_HAVE_OS_MREMAP
# if HAVE_MREMAP
# if defined(__NetBSD__)
@@ -1324,7 +1397,7 @@ os_munmap(void *ptr, UWord size)
# endif
# endif
static ERTS_INLINE void *
-os_mremap(void *ptr, UWord old_size, UWord new_size, int try_superalign)
+os_mremap(void *ptr, UWord old_size, UWord new_size)
{
void *new_seg;
#if HAVE_MREMAP
@@ -1442,7 +1515,7 @@ alloc_desc_insert_free_seg(ErtsMemMapper* mm,
#if ERTS_HAVE_OS_MMAP
if (!mm->no_os_mmap) {
- ptr = os_mmap(mm->desc.new_area_hint, ERTS_PAGEALIGNED_SIZE, 0);
+ ptr = os_mmap(mm->desc.new_area_hint, ERTS_PAGEALIGNED_SIZE);
if (ptr) {
mm->desc.new_area_hint = ptr+ERTS_PAGEALIGNED_SIZE;
ERTS_MMAP_SIZE_OS_INC(ERTS_PAGEALIGNED_SIZE);
@@ -1650,35 +1723,15 @@ erts_mmap(ErtsMemMapper* mm, Uint32 flags, UWord *sizep)
/* Map using OS primitives */
if (!(ERTS_MMAPFLG_SUPERCARRIER_ONLY & flags) && !mm->no_os_mmap) {
if (!(ERTS_MMAPFLG_SUPERALIGNED & flags)) {
- seg = os_mmap(NULL, asize, 0);
+ seg = os_mmap(NULL, asize);
if (!seg)
goto failure;
}
else {
asize = ERTS_SUPERALIGNED_CEILING(*sizep);
- seg = os_mmap(NULL, asize, 1);
+ seg = os_mmap_aligned(asize, ERTS_SUPERALIGNED_SIZE);
if (!seg)
goto failure;
-
- if (!ERTS_IS_SUPERALIGNED(seg)) {
- char *ptr;
- UWord sz;
-
- os_munmap(seg, asize);
-
- ptr = os_mmap(NULL, asize + ERTS_SUPERALIGNED_SIZE, 1);
- if (!ptr)
- goto failure;
-
- seg = (char *) ERTS_SUPERALIGNED_CEILING(ptr);
- sz = (UWord) (seg - ptr);
- ERTS_MMAP_ASSERT(sz <= ERTS_SUPERALIGNED_SIZE);
- if (sz)
- os_munmap(ptr, sz);
- sz = ERTS_SUPERALIGNED_SIZE - sz;
- if (sz)
- os_munmap(seg+asize, sz);
- }
}
ERTS_MMAP_OP_LCK(seg, *sizep, asize);
@@ -1899,7 +1952,7 @@ erts_mremap(ErtsMemMapper* mm,
if (superaligned) {
return remap_move(mm, flags, ptr, old_size, sizep);
} else {
- new_ptr = os_mremap(ptr, old_size, asize, 0);
+ new_ptr = os_mremap(ptr, old_size, asize);
if (!new_ptr)
return NULL;
if (asize > old_size)
@@ -2232,13 +2285,30 @@ erts_mmap_init(ErtsMemMapper* mm, ErtsMMapInit *init)
* The whole supercarrier will by physically
* reserved all the time.
*/
- start = os_mmap(NULL, sz, 1);
+ UWord alignment;
+
+ if (init->lp)
+ alignment = MAX(sys_large_page_size, ERTS_SUPERALIGNED_SIZE);
+ else
+ alignment = ERTS_SUPERALIGNED_SIZE;
+ start = os_mmap_aligned(sz, alignment);
}
if (!start)
erts_exit(1,
"erts_mmap: Failed to create super carrier of size %bpu MB\n",
init->scs/1024/1024);
end = start + sz;
+#ifdef HAVE_LINUX_THP
+ if (init->lp) {
+ /*
+ * Enable the Transparent Huge Pages for the virtual
+ * memory reservation.
+ */
+ if (madvise(start, sz, MADV_HUGEPAGE) != 0) {
+ erts_exit(1, "erts_mmap: Failed to enable THP for the super carrier: %s\n", strerror(errno));
+ }
+ }
+#endif
#ifdef ERTS_MMAP_DEBUG_FILL_AREAS
if (!virtual_map) {
Uint32 *uip;
diff --git a/erts/emulator/sys/common/erl_mmap.h b/erts/emulator/sys/common/erl_mmap.h
index ecb1ef6a13..4322324ace 100644
--- a/erts/emulator/sys/common/erl_mmap.h
+++ b/erts/emulator/sys/common/erl_mmap.h
@@ -88,15 +88,16 @@ typedef struct {
int sco; /* super carrier only? */
UWord scrfsd; /* super carrier reserved free segment descriptors */
int scrpm; /* super carrier reserve physical memory */
+ int lp; /* try to use large pages? */
}ErtsMMapInit;
#define ERTS_MMAP_INIT_DEFAULT_INITER \
- {{NULL, NULL}, {NULL, NULL}, 0, 1, (1 << 16), 1}
+ {{NULL, NULL}, {NULL, NULL}, 0, 1, (1 << 16), 1, 0}
#define ERTS_LITERAL_VIRTUAL_AREA_SIZE (UWORD_CONSTANT(1)*1024*1024*1024)
#define ERTS_MMAP_INIT_LITERAL_INITER \
- {{NULL, NULL}, {NULL, NULL}, ERTS_LITERAL_VIRTUAL_AREA_SIZE, 1, (1 << 10), 0}
+ {{NULL, NULL}, {NULL, NULL}, ERTS_LITERAL_VIRTUAL_AREA_SIZE, 1, (1 << 10), 0, 0}
#define ERTS_SUPERALIGNED_SIZE \
diff --git a/erts/etc/common/erlexec.c b/erts/etc/common/erlexec.c
index 854e30406b..d05721dba8 100644
--- a/erts/etc/common/erlexec.c
+++ b/erts/etc/common/erlexec.c
@@ -111,6 +111,7 @@ static char *plusM_other_switches[] = {
"Mscrfsd",
"Msco",
"Mscrpm",
+ "Mlp",
"Ye",
"Ym",
"Ytp",
--
2.35.3