Diffstat (limited to 'malloc-hugepage-0004-malloc-Add-Huge-Page-support-for-mmap.patch')
-rw-r--r--  malloc-hugepage-0004-malloc-Add-Huge-Page-support-for-mmap.patch  476
1 file changed, 476 insertions, 0 deletions
diff --git a/malloc-hugepage-0004-malloc-Add-Huge-Page-support-for-mmap.patch b/malloc-hugepage-0004-malloc-Add-Huge-Page-support-for-mmap.patch
new file mode 100644
index 0000000..1969a1f
--- /dev/null
+++ b/malloc-hugepage-0004-malloc-Add-Huge-Page-support-for-mmap.patch
@@ -0,0 +1,476 @@
+From 98d5fcb8d099a1a868e032c89891c395a2f365c5 Mon Sep 17 00:00:00 2001
+From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date: Mon, 16 Aug 2021 15:08:27 -0300
+Subject: [PATCH 4/7] malloc: Add Huge Page support for mmap
+
+With the morecore hook removed, there is no easy way to provide huge
+page support with the glibc allocator without resorting to transparent
+huge pages.  Some users and programs do prefer to use huge pages
+directly instead of THP for multiple reasons: no splitting or re-merging
+by the VM, no TLB shootdowns for running processes, fast allocation
+from the reserve pool, no competition with the rest of the processes
+(unlike THP), no swapping at all, etc.
+
+This patch extends the 'glibc.malloc.hugetlb' tunable: the value
+'2' means to use huge pages directly with the system default size,
+while a larger value means a specific page size that is matched
+against the ones supported by the system.
+
+Currently only memory allocated by sysmalloc() is handled; the arenas
+still use the default system page size.
+
+For testing, a new rule, tests-malloc-hugetlb2, is added; it runs the
+existing tests with the required GLIBC_TUNABLES setting.  On systems
+without a reserved huge page pool, this just stresses the
+mmap(MAP_HUGETLB) allocation failure path.  To improve test coverage it
+is required to create a pool with some preallocated pages.
+
+Checked on x86_64-linux-gnu.
+
+Reviewed-by: DJ Delorie <dj@redhat.com>
+---
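As a rough usage sketch (not taken from the patch; the program, the 32 MiB request size, and the command line are illustrative assumptions), the tunable is set in the environment before the process starts, for example GLIBC_TUNABLES=glibc.malloc.hugetlb=2 ./a.out, and only allocations large enough to be serviced by mmap are affected:

/* Sketch: allocate well above the mmap threshold so malloc services the
   request through mmap, then fault the memory in.  With the tunable set
   to 2 (and a non-empty reserved pool), the mapping is requested with
   MAP_HUGETLB; /proc/self/smaps can be inspected to confirm.  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main (void)
{
  size_t size = 32 * 1024 * 1024;   /* Arbitrary size for illustration.  */
  void *p = malloc (size);
  if (p == NULL)
    {
      perror ("malloc");
      return 1;
    }
  memset (p, 0, size);              /* Fault the pages in.  */
  free (p);
  return 0;
}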
+ NEWS | 8 +-
+ Rules | 17 +++
+ elf/dl-tunables.list | 3 +-
+ elf/tst-rtld-list-tunables.exp | 2 +-
+ malloc/Makefile | 8 +-
+ malloc/arena.c | 4 +-
+ malloc/malloc.c | 31 ++++-
+ manual/tunables.texi | 7 ++
+ sysdeps/generic/malloc-hugepages.c | 8 ++
+ sysdeps/generic/malloc-hugepages.h | 7 ++
+ sysdeps/unix/sysv/linux/malloc-hugepages.c | 127 +++++++++++++++++++++
+ 11 files changed, 207 insertions(+), 15 deletions(-)
+
+diff --git a/NEWS b/NEWS
+index 3b94dd209c..c7200cd4e8 100644
+--- a/NEWS
++++ b/NEWS
+@@ -93,9 +93,11 @@ Major new features:
+ Restartable Sequences.
+
+ * On Linux, a new tunable, glibc.malloc.hugetlb, can be used to
+- make malloc issue madvise plus MADV_HUGEPAGE on mmap and sbrk calls.
+- Setting this might improve performance with Transparent Huge Pages madvise
+- mode depending of the workload.
++ either make malloc issue madvise plus MADV_HUGEPAGE on mmap and sbrk
++ calls, or use huge pages directly in mmap calls with the MAP_HUGETLB
++ flag. The former can improve performance when Transparent Huge Pages
++ is set to 'madvise' mode, while the latter uses the system reserved
++ huge pages.
+
+ Deprecated and removed features, and other changes affecting compatibility:
+
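The two modes described in the NEWS entry differ in when the kernel is involved: value 1 maps with base pages and then calls madvise(MADV_HUGEPAGE), while value 2 requests huge pages from the reserved pool at mmap time. A stand-alone sketch of the two calls (the helper names are hypothetical, and hp_flags stands for whatever MAP_HUGETLB encoding the tunable selected):

/* Sketch only: thp_style and hugetlb_style are illustrative names, not
   glibc functions.  MADV_HUGEPAGE and MAP_HUGETLB require a Linux kernel
   with THP and hugetlbfs support respectively.  */
#include <stddef.h>
#include <sys/mman.h>

static void *
thp_style (size_t size)                    /* glibc.malloc.hugetlb=1 */
{
  void *p = mmap (NULL, size, PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (p != MAP_FAILED)
    madvise (p, size, MADV_HUGEPAGE);      /* A hint, not a hard request.  */
  return p;
}

static void *
hugetlb_style (size_t size, int hp_flags)  /* glibc.malloc.hugetlb=2 */
{
  /* hp_flags is MAP_HUGETLB, optionally with an explicit page size
     encoded via MAP_HUGE_SHIFT.  Fails if the reserved pool is empty.  */
  return mmap (NULL, size, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS | hp_flags, -1, 0);
}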
+diff --git a/Rules b/Rules
+index 5f5d9ba4cc..be34982daa 100644
+--- a/Rules
++++ b/Rules
+@@ -158,6 +158,7 @@ tests: $(tests:%=$(objpfx)%.out) $(tests-internal:%=$(objpfx)%.out) \
+ $(tests-mcheck:%=$(objpfx)%-mcheck.out) \
+ $(tests-malloc-check:%=$(objpfx)%-malloc-check.out) \
+ $(tests-malloc-hugetlb1:%=$(objpfx)%-malloc-hugetlb1.out) \
++ $(tests-malloc-hugetlb2:%=$(objpfx)%-malloc-hugetlb2.out) \
+ $(tests-special) $(tests-printers-out)
+ xtests: tests $(xtests:%=$(objpfx)%.out) $(xtests-special)
+ endif
+@@ -170,6 +171,7 @@ else
+ tests-expected = $(tests) $(tests-internal) $(tests-printers) \
+ $(tests-container) $(tests-malloc-check:%=%-malloc-check) \
+ $(tests-malloc-hugetlb1:%=%-malloc-hugetlb1) \
++ $(tests-malloc-hugetlb2:%=%-malloc-hugetlb2) \
+ $(tests-mcheck:%=%-mcheck)
+ endif
+ tests:
+@@ -199,6 +201,7 @@ endif
+ binaries-mcheck-tests = $(tests-mcheck:%=%-mcheck)
+ binaries-malloc-check-tests = $(tests-malloc-check:%=%-malloc-check)
+ binaries-malloc-hugetlb1-tests = $(tests-malloc-hugetlb1:%=%-malloc-hugetlb1)
++binaries-malloc-hugetlb2-tests = $(tests-malloc-hugetlb2:%=%-malloc-hugetlb2)
+ else
+ binaries-all-notests =
+ binaries-all-tests = $(tests) $(tests-internal) $(xtests) $(test-srcs)
+@@ -211,6 +214,7 @@ binaries-pie-notests =
+ binaries-mcheck-tests =
+ binaries-malloc-check-tests =
+ binaries-malloc-hugetlb1-tests =
++binaries-malloc-hugetlb2-tests =
+ endif
+
+ binaries-pie = $(binaries-pie-tests) $(binaries-pie-notests)
+@@ -259,6 +263,14 @@ $(addprefix $(objpfx),$(binaries-malloc-hugetlb1-tests)): %-malloc-hugetlb1: %.o
+ $(+link-tests)
+ endif
+
++ifneq "$(strip $(binaries-malloc-hugetlb2-tests))" ""
++$(addprefix $(objpfx),$(binaries-malloc-hugetlb2-tests)): %-malloc-hugetlb2: %.o \
++ $(link-extra-libs-tests) \
++ $(sort $(filter $(common-objpfx)lib%,$(link-libc))) \
++ $(addprefix $(csu-objpfx),start.o) $(+preinit) $(+postinit)
++ $(+link-tests)
++endif
++
+ ifneq "$(strip $(binaries-pie-tests))" ""
+ $(addprefix $(objpfx),$(binaries-pie-tests)): %: %.o \
+ $(link-extra-libs-tests) \
+@@ -302,6 +314,11 @@ $(1)-malloc-hugetlb1-ENV += GLIBC_TUNABLES=glibc.malloc.hugetlb=1
+ endef
+ $(foreach t,$(tests-malloc-hugetlb1),$(eval $(call malloc-hugetlb1-ENVS,$(t))))
+
++# All malloc-hugetlb2 tests will be run with GLIBC_TUNABLES=glibc.malloc.hugetlb=2
++define malloc-hugetlb2-ENVS
++$(1)-malloc-hugetlb2-ENV += GLIBC_TUNABLES=glibc.malloc.hugetlb=2
++endef
++$(foreach t,$(tests-malloc-hugetlb2),$(eval $(call malloc-hugetlb2-ENVS,$(t))))
+
+ # mcheck tests need the debug DSO to support -lmcheck.
+ define mcheck-ENVS
+diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
+index d1fd3f3e91..845d521a43 100644
+--- a/elf/dl-tunables.list
++++ b/elf/dl-tunables.list
+@@ -93,9 +93,8 @@ glibc {
+ security_level: SXID_IGNORE
+ }
+ hugetlb {
+- type: INT_32
++ type: SIZE_T
+ minval: 0
+- maxval: 1
+ }
+ }
+ cpu {
+diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp
+index d8e363f2c5..cdfdb56a94 100644
+--- a/elf/tst-rtld-list-tunables.exp
++++ b/elf/tst-rtld-list-tunables.exp
+@@ -1,7 +1,7 @@
+ glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+)
+ glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+)
+ glibc.malloc.check: 0 (min: 0, max: 3)
+-glibc.malloc.hugetlb: 0 (min: 0, max: 1)
++glibc.malloc.hugetlb: 0x0 (min: 0x0, max: 0x[f]+)
+ glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647)
+ glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+)
+ glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+)
+diff --git a/malloc/Makefile b/malloc/Makefile
+index 0137595e17..e9a6666d22 100644
+--- a/malloc/Makefile
++++ b/malloc/Makefile
+@@ -78,9 +78,9 @@ tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
+ tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
+ $(tests-static),$(tests))
+
+-# Run all testes with GLIBC_TUNABLES=glibc.malloc.hugetlb=1 that check the
+-# Transparent Huge Pages support. We need exclude some tests that define
+-# the ENV vars.
++# Run all tests with GLIBC_TUNABLES=glibc.malloc.hugetlb={1,2} to check
++# the Transparent Huge Pages support (1) or the direct huge page support (2).
++# We need to exclude some tests that define the ENV vars.
+ tests-exclude-hugetlb1 = \
+ tst-compathooks-off \
+ tst-compathooks-on \
+@@ -93,6 +93,8 @@ tests-exclude-hugetlb1 = \
+ tst-mallocstate
+ tests-malloc-hugetlb1 = \
+ $(filter-out $(tests-exclude-hugetlb1), $(tests))
++tests-malloc-hugetlb2 = \
++ $(filter-out $(tests-exclude-hugetlb1), $(tests))
+
+ # -lmcheck needs __malloc_initialize_hook, which was deprecated in 2.24.
+ ifeq ($(have-GLIBC_2.23)$(build-shared),yesyes)
+diff --git a/malloc/arena.c b/malloc/arena.c
+index cd00c7bef4..9a6e1af2bd 100644
+--- a/malloc/arena.c
++++ b/malloc/arena.c
+@@ -230,7 +230,7 @@ TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t)
+ TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
+ #endif
+ TUNABLE_CALLBACK_FNDECL (set_mxfast, size_t)
+-TUNABLE_CALLBACK_FNDECL (set_hugetlb, int32_t)
++TUNABLE_CALLBACK_FNDECL (set_hugetlb, size_t)
+ #else
+ /* Initialization routine. */
+ #include <string.h>
+@@ -331,7 +331,7 @@ ptmalloc_init (void)
+ TUNABLE_CALLBACK (set_tcache_unsorted_limit));
+ # endif
+ TUNABLE_GET (mxfast, size_t, TUNABLE_CALLBACK (set_mxfast));
+- TUNABLE_GET (hugetlb, int32_t, TUNABLE_CALLBACK (set_hugetlb));
++ TUNABLE_GET (hugetlb, size_t, TUNABLE_CALLBACK (set_hugetlb));
+ #else
+ if (__glibc_likely (_environ != NULL))
+ {
+diff --git a/malloc/malloc.c b/malloc/malloc.c
+index 6b6ec53db1..75efdc2ee7 100644
+--- a/malloc/malloc.c
++++ b/malloc/malloc.c
+@@ -1883,6 +1883,10 @@ struct malloc_par
+ #if HAVE_TUNABLES
+ /* Transparent Large Page support. */
+ INTERNAL_SIZE_T thp_pagesize;
++ /* A value different from 0 means to align mmap allocations to hp_pagesize
++ and to add hp_flags to the mmap flags. */
++ INTERNAL_SIZE_T hp_pagesize;
++ int hp_flags;
+ #endif
+
+ /* Memory map support */
+@@ -2440,7 +2444,10 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
+ if (mm == MAP_FAILED)
+ return mm;
+
+- madvise_thp (mm, size);
++#ifdef MAP_HUGETLB
++ if (!(extra_flags & MAP_HUGETLB))
++ madvise_thp (mm, size);
++#endif
+
+ /*
+ The offset to the start of the mmapped region is stored in the prev_size
+@@ -2528,7 +2535,18 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av)
+ || ((unsigned long) (nb) >= (unsigned long) (mp_.mmap_threshold)
+ && (mp_.n_mmaps < mp_.n_mmaps_max)))
+ {
+- char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
++ char *mm;
++#if HAVE_TUNABLES
++ if (mp_.hp_pagesize > 0 && nb >= mp_.hp_pagesize)
++ {
++ /* There is no need to issue the THP madvise call if Huge Pages are
++ used directly. */
++ mm = sysmalloc_mmap (nb, mp_.hp_pagesize, mp_.hp_flags, av);
++ if (mm != MAP_FAILED)
++ return mm;
++ }
++#endif
++ mm = sysmalloc_mmap (nb, pagesize, 0, av);
+ if (mm != MAP_FAILED)
+ return mm;
+ tried_mmap = true;
+@@ -2609,7 +2627,9 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av)
+ }
+ else if (!tried_mmap)
+ {
+- /* We can at least try to use to mmap memory. */
++ /* We can at least try to use mmap memory. If new_heap fails
++ it is unlikely that trying to allocate huge pages will
++ succeed. */
+ char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
+ if (mm != MAP_FAILED)
+ return mm;
+@@ -5383,7 +5403,7 @@ do_set_mxfast (size_t value)
+
+ #if HAVE_TUNABLES
+ static __always_inline int
+-do_set_hugetlb (int32_t value)
++do_set_hugetlb (size_t value)
+ {
+ if (value == 1)
+ {
+@@ -5395,6 +5415,9 @@ do_set_hugetlb (int32_t value)
+ if (thp_mode == malloc_thp_mode_madvise)
+ mp_.thp_pagesize = __malloc_default_thp_pagesize ();
+ }
++ else if (value >= 2)
++ __malloc_hugepage_config (value == 2 ? 0 : value, &mp_.hp_pagesize,
++ &mp_.hp_flags);
+ return 0;
+ }
+ #endif
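In the sysmalloc hunk above, the MAP_HUGETLB path is tried only when the request is at least hp_pagesize, and a failure (for instance an empty reserved pool) falls through to the ordinary mmap path. A self-contained sketch of that pattern follows; mmap_with_fallback is a hypothetical helper, not the real sysmalloc_mmap:

/* Sketch: try a huge page mapping first, fall back to base pages.  */
#include <stddef.h>
#include <sys/mman.h>

static void *
mmap_with_fallback (size_t size, size_t hp_pagesize, int hp_flags)
{
  if (hp_pagesize != 0 && size >= hp_pagesize)
    {
      /* Round the request up to a multiple of the huge page size.  */
      size_t hp_size = (size + hp_pagesize - 1) & ~(hp_pagesize - 1);
      void *p = mmap (NULL, hp_size, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS | hp_flags, -1, 0);
      if (p != MAP_FAILED)
        return p;
    }
  /* Huge pages unavailable (or request too small): use base pages.  */
  return mmap (NULL, size, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}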
+diff --git a/manual/tunables.texi b/manual/tunables.texi
+index 9ca6e3f603..58a47b2e9b 100644
+--- a/manual/tunables.texi
++++ b/manual/tunables.texi
+@@ -278,6 +278,13 @@ default value is @code{0}, which disables any additional support on
+ Setting its value to @code{1} enables the use of @code{madvise} with
+ @code{MADV_HUGEPAGE} after memory allocation with @code{mmap}. It is enabled
+ only if the system supports Transparent Huge Page (currently only on Linux).
++
++Setting its value to @code{2} enables the use of Huge Pages directly with
++@code{mmap} and the @code{MAP_HUGETLB} flag.  The huge page size used
++will be the default one provided by the system.  A value larger than
++@code{2} specifies the huge page size, which will be matched against the
++sizes supported by the system.  If the provided value is invalid,
++@code{MAP_HUGETLB} will not be used.
+ @end deftp
+
+ @node Dynamic Linking Tunables
+diff --git a/sysdeps/generic/malloc-hugepages.c b/sysdeps/generic/malloc-hugepages.c
+index 8fb459a263..946284a33c 100644
+--- a/sysdeps/generic/malloc-hugepages.c
++++ b/sysdeps/generic/malloc-hugepages.c
+@@ -29,3 +29,11 @@ __malloc_thp_mode (void)
+ {
+ return malloc_thp_mode_not_supported;
+ }
++
++/* The generic implementation does not support huge pages.  */
++void
++__malloc_hugepage_config (size_t requested, size_t *pagesize, int *flags)
++{
++ *pagesize = 0;
++ *flags = 0;
++}
+diff --git a/sysdeps/generic/malloc-hugepages.h b/sysdeps/generic/malloc-hugepages.h
+index f5a442e328..75cda3796a 100644
+--- a/sysdeps/generic/malloc-hugepages.h
++++ b/sysdeps/generic/malloc-hugepages.h
+@@ -34,4 +34,11 @@ enum malloc_thp_mode_t
+
+ enum malloc_thp_mode_t __malloc_thp_mode (void) attribute_hidden;
+
++/* Return the supported huge page size that matches REQUESTED in *PAGESIZE,
++ along with the required extra mmap flags in *FLAGS. A REQUESTED value
++ of 0 returns the default huge page size; otherwise the value is
++ matched against the sizes supported by the system. */
++void __malloc_hugepage_config (size_t requested, size_t *pagesize, int *flags)
++ attribute_hidden;
++
+ #endif /* _MALLOC_HUGEPAGES_H */
+diff --git a/sysdeps/unix/sysv/linux/malloc-hugepages.c b/sysdeps/unix/sysv/linux/malloc-hugepages.c
+index 7497e07260..0e05291d61 100644
+--- a/sysdeps/unix/sysv/linux/malloc-hugepages.c
++++ b/sysdeps/unix/sysv/linux/malloc-hugepages.c
+@@ -17,8 +17,10 @@
+ not, see <https://www.gnu.org/licenses/>. */
+
+ #include <intprops.h>
++#include <dirent.h>
+ #include <malloc-hugepages.h>
+ #include <not-cancel.h>
++#include <sys/mman.h>
+
+ unsigned long int
+ __malloc_default_thp_pagesize (void)
+@@ -72,3 +74,128 @@ __malloc_thp_mode (void)
+ }
+ return malloc_thp_mode_not_supported;
+ }
++
++static size_t
++malloc_default_hugepage_size (void)
++{
++ int fd = __open64_nocancel ("/proc/meminfo", O_RDONLY);
++ if (fd == -1)
++ return 0;
++
++ size_t hpsize = 0;
++
++ char buf[512];
++ off64_t off = 0;
++ while (1)
++ {
++ ssize_t r = __pread64_nocancel (fd, buf, sizeof (buf) - 1, off);
++ if (r < 0)
++ break;
++ buf[r] = '\0';
++
++ /* If the tag is not found, read the last line again. */
++ const char *s = strstr (buf, "Hugepagesize:");
++ if (s == NULL)
++ {
++ char *nl = strrchr (buf, '\n');
++ if (nl == NULL)
++ break;
++ off += (nl + 1) - buf;
++ continue;
++ }
++
++ /* The default huge page size is in the form:
++ Hugepagesize: NUMBER kB */
++ s += sizeof ("Hugepagesize: ") - 1;
++ for (int i = 0; (s[i] >= '0' && s[i] <= '9') || s[i] == ' '; i++)
++ {
++ if (s[i] == ' ')
++ continue;
++ hpsize *= 10;
++ hpsize += s[i] - '0';
++ }
++ hpsize *= 1024;
++ break;
++ }
++
++ __close_nocancel (fd);
++
++ return hpsize;
++}
++
++static inline int
++hugepage_flags (size_t pagesize)
++{
++ return MAP_HUGETLB | (__builtin_ctzll (pagesize) << MAP_HUGE_SHIFT);
++}
++
++void
++__malloc_hugepage_config (size_t requested, size_t *pagesize, int *flags)
++{
++ *pagesize = 0;
++ *flags = 0;
++
++ if (requested == 0)
++ {
++ *pagesize = malloc_default_hugepage_size ();
++ if (*pagesize != 0)
++ *flags = hugepage_flags (*pagesize);
++ return;
++ }
++
++ /* Each entry represents a supported huge page in the form of:
++ hugepages-<size>kB. */
++ int dirfd = __open64_nocancel ("/sys/kernel/mm/hugepages",
++ O_RDONLY | O_DIRECTORY, 0);
++ if (dirfd == -1)
++ return;
++
++ char buffer[1024];
++ while (true)
++ {
++#if !IS_IN(libc)
++# define __getdents64 getdents64
++#endif
++ ssize_t ret = __getdents64 (dirfd, buffer, sizeof (buffer));
++ if (ret == -1)
++ break;
++ else if (ret == 0)
++ break;
++
++ bool found = false;
++ char *begin = buffer, *end = buffer + ret;
++ while (begin != end)
++ {
++ unsigned short int d_reclen;
++ memcpy (&d_reclen, begin + offsetof (struct dirent64, d_reclen),
++ sizeof (d_reclen));
++ const char *dname = begin + offsetof (struct dirent64, d_name);
++ begin += d_reclen;
++
++ if (dname[0] == '.'
++ || strncmp (dname, "hugepages-", sizeof ("hugepages-") - 1) != 0)
++ continue;
++
++ size_t hpsize = 0;
++ const char *sizestr = dname + sizeof ("hugepages-") - 1;
++ for (int i = 0; sizestr[i] >= '0' && sizestr[i] <= '9'; i++)
++ {
++ hpsize *= 10;
++ hpsize += sizestr[i] - '0';
++ }
++ hpsize *= 1024;
++
++ if (hpsize == requested)
++ {
++ *pagesize = hpsize;
++ *flags = hugepage_flags (*pagesize);
++ found = true;
++ break;
++ }
++ }
++ if (found)
++ break;
++ }
++
++ __close_nocancel (dirfd);
++}
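hugepage_flags above encodes the base-2 logarithm of the page size into the upper mmap flag bits. A small worked example of the same encoding, assuming <sys/mman.h> exposes MAP_HUGETLB and MAP_HUGE_SHIFT, for the two common x86_64 sizes:

/* Sketch: the flag encoding used by hugepage_flags, applied to the two
   common x86_64 sizes.  __builtin_ctzll (2 MiB) is 21 and
   __builtin_ctzll (1 GiB) is 30, matching the kernel's MAP_HUGE_2MB and
   MAP_HUGE_1GB definitions.  */
#include <stdio.h>
#include <sys/mman.h>

static int
encode_hugepage_flags (size_t pagesize)
{
  return MAP_HUGETLB | (__builtin_ctzll (pagesize) << MAP_HUGE_SHIFT);
}

int
main (void)
{
  printf ("2 MiB flags: %#x\n", encode_hugepage_flags (2UL << 20));
  printf ("1 GiB flags: %#x\n", encode_hugepage_flags (1UL << 30));
  return 0;
}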
+--
+2.33.0
+