diff options
Diffstat (limited to 'malloc-hugepage-0004-malloc-Add-Huge-Page-support-for-mmap.patch')
-rw-r--r-- | malloc-hugepage-0004-malloc-Add-Huge-Page-support-for-mmap.patch | 476 |
1 files changed, 476 insertions, 0 deletions
diff --git a/malloc-hugepage-0004-malloc-Add-Huge-Page-support-for-mmap.patch b/malloc-hugepage-0004-malloc-Add-Huge-Page-support-for-mmap.patch new file mode 100644 index 0000000..1969a1f --- /dev/null +++ b/malloc-hugepage-0004-malloc-Add-Huge-Page-support-for-mmap.patch @@ -0,0 +1,476 @@ +From 98d5fcb8d099a1a868e032c89891c395a2f365c5 Mon Sep 17 00:00:00 2001 +From: Adhemerval Zanella <adhemerval.zanella@linaro.org> +Date: Mon, 16 Aug 2021 15:08:27 -0300 +Subject: [PATCH 4/7] malloc: Add Huge Page support for mmap + +With the morecore hook removed, there is no easy way to provide huge +pages support with the glibc allocator without resorting to transparent +huge pages. And some users and programs do prefer to use the huge pages +directly instead of THP for multiple reasons: no splitting, re-merging +by the VM, no TLB shootdowns for running processes, fast allocation +from the reserve pool, no competition with the rest of the processes +unlike THP, no swapping at all, etc. + +This patch extends the 'glibc.malloc.hugetlb' tunable: the value +'2' means to use huge pages directly with the system default size, +while a larger value means a specific page size that is matched +against the ones supported by the system. + +Currently only memory allocated on sysmalloc() is handled, the arenas +still use the default system page size. + +To test it, a new rule tests-malloc-hugetlb2 is added, which runs the +added tests with the required GLIBC_TUNABLES setting. On systems without +a reserved huge pages pool, it just stresses the mmap(MAP_HUGETLB) +allocation failure. To improve test coverage it is required to create +a pool with some allocated pages. + +Checked on x86_64-linux-gnu. 
+ +Reviewed-by: DJ Delorie <dj@redhat.com> +--- + NEWS | 8 +- + Rules | 17 +++ + elf/dl-tunables.list | 3 +- + elf/tst-rtld-list-tunables.exp | 2 +- + malloc/Makefile | 8 +- + malloc/arena.c | 4 +- + malloc/malloc.c | 31 ++++- + manual/tunables.texi | 7 ++ + sysdeps/generic/malloc-hugepages.c | 8 ++ + sysdeps/generic/malloc-hugepages.h | 7 ++ + sysdeps/unix/sysv/linux/malloc-hugepages.c | 127 +++++++++++++++++++++ + 11 files changed, 207 insertions(+), 15 deletions(-) + +diff --git a/NEWS b/NEWS +index 3b94dd209c..c7200cd4e8 100644 +--- a/NEWS ++++ b/NEWS +@@ -93,9 +93,11 @@ Major new features: + Restartable Sequences. + + * On Linux, a new tunable, glibc.malloc.hugetlb, can be used to +- make malloc issue madvise plus MADV_HUGEPAGE on mmap and sbrk calls. +- Setting this might improve performance with Transparent Huge Pages madvise +- mode depending of the workload. ++ either make malloc issue madvise plus MADV_HUGEPAGE on mmap and sbrk ++ or to use huge pages directly with mmap calls with the MAP_HUGETLB ++ flag. The former can improve performance when Transparent Huge Pages ++ is set to 'madvise' mode while the latter uses the system reserved ++ huge pages. 
+ + Deprecated and removed features, and other changes affecting compatibility: + +diff --git a/Rules b/Rules +index 5f5d9ba4cc..be34982daa 100644 +--- a/Rules ++++ b/Rules +@@ -158,6 +158,7 @@ tests: $(tests:%=$(objpfx)%.out) $(tests-internal:%=$(objpfx)%.out) \ + $(tests-mcheck:%=$(objpfx)%-mcheck.out) \ + $(tests-malloc-check:%=$(objpfx)%-malloc-check.out) \ + $(tests-malloc-hugetlb1:%=$(objpfx)%-malloc-hugetlb1.out) \ ++ $(tests-malloc-hugetlb2:%=$(objpfx)%-malloc-hugetlb2.out) \ + $(tests-special) $(tests-printers-out) + xtests: tests $(xtests:%=$(objpfx)%.out) $(xtests-special) + endif +@@ -170,6 +171,7 @@ else + tests-expected = $(tests) $(tests-internal) $(tests-printers) \ + $(tests-container) $(tests-malloc-check:%=%-malloc-check) \ + $(tests-malloc-hugetlb1:%=%-malloc-hugetlb1) \ ++ $(tests-malloc-hugetlb2:%=%-malloc-hugetlb2) \ + $(tests-mcheck:%=%-mcheck) + endif + tests: +@@ -199,6 +201,7 @@ endif + binaries-mcheck-tests = $(tests-mcheck:%=%-mcheck) + binaries-malloc-check-tests = $(tests-malloc-check:%=%-malloc-check) + binaries-malloc-hugetlb1-tests = $(tests-malloc-hugetlb1:%=%-malloc-hugetlb1) ++binaries-malloc-hugetlb2-tests = $(tests-malloc-hugetlb2:%=%-malloc-hugetlb2) + else + binaries-all-notests = + binaries-all-tests = $(tests) $(tests-internal) $(xtests) $(test-srcs) +@@ -211,6 +214,7 @@ binaries-pie-notests = + binaries-mcheck-tests = + binaries-malloc-check-tests = + binaries-malloc-hugetlb1-tests = ++binaries-malloc-hugetlb2-tests = + endif + + binaries-pie = $(binaries-pie-tests) $(binaries-pie-notests) +@@ -259,6 +263,14 @@ $(addprefix $(objpfx),$(binaries-malloc-hugetlb1-tests)): %-malloc-hugetlb1: %.o + $(+link-tests) + endif + ++ifneq "$(strip $(binaries-malloc-hugetlb2-tests))" "" ++$(addprefix $(objpfx),$(binaries-malloc-hugetlb2-tests)): %-malloc-hugetlb2: %.o \ ++ $(link-extra-libs-tests) \ ++ $(sort $(filter $(common-objpfx)lib%,$(link-libc))) \ ++ $(addprefix $(csu-objpfx),start.o) $(+preinit) $(+postinit) ++ $(+link-tests) 
++endif ++ + ifneq "$(strip $(binaries-pie-tests))" "" + $(addprefix $(objpfx),$(binaries-pie-tests)): %: %.o \ + $(link-extra-libs-tests) \ +@@ -302,6 +314,11 @@ $(1)-malloc-hugetlb1-ENV += GLIBC_TUNABLES=glibc.malloc.hugetlb=1 + endef + $(foreach t,$(tests-malloc-hugetlb1),$(eval $(call malloc-hugetlb1-ENVS,$(t)))) + ++# All malloc-hugetlb2 tests will be run with GLIBC_TUNABLE=glibc.malloc.hugetlb=2 ++define malloc-hugetlb2-ENVS ++$(1)-malloc-hugetlb2-ENV += GLIBC_TUNABLES=glibc.malloc.hugetlb=2 ++endef ++$(foreach t,$(tests-malloc-hugetlb2),$(eval $(call malloc-hugetlb2-ENVS,$(t)))) + + # mcheck tests need the debug DSO to support -lmcheck. + define mcheck-ENVS +diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list +index d1fd3f3e91..845d521a43 100644 +--- a/elf/dl-tunables.list ++++ b/elf/dl-tunables.list +@@ -93,9 +93,8 @@ glibc { + security_level: SXID_IGNORE + } + hugetlb { +- type: INT_32 ++ type: SIZE_T + minval: 0 +- maxval: 1 + } + } + cpu { +diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp +index d8e363f2c5..cdfdb56a94 100644 +--- a/elf/tst-rtld-list-tunables.exp ++++ b/elf/tst-rtld-list-tunables.exp +@@ -1,7 +1,7 @@ + glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+) + glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+) + glibc.malloc.check: 0 (min: 0, max: 3) +-glibc.malloc.hugetlb: 0 (min: 0, max: 1) ++glibc.malloc.hugetlb: 0x0 (min: 0x0, max: 0x[f]+) + glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647) + glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+) + glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+) +diff --git a/malloc/Makefile b/malloc/Makefile +index 0137595e17..e9a6666d22 100644 +--- a/malloc/Makefile ++++ b/malloc/Makefile +@@ -78,9 +78,9 @@ tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \ + tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \ + $(tests-static),$(tests)) + +-# Run all testes with GLIBC_TUNABLES=glibc.malloc.hugetlb=1 that check the +-# Transparent Huge 
Pages support. We need exclude some tests that define +-# the ENV vars. ++# Run all tests with GLIBC_TUNABLES=glibc.malloc.hugetlb={1,2} which check ++# the Transparent Huge Pages support (1) or automatic huge page support (2). ++# We need exclude some tests that define the ENV vars. + tests-exclude-hugetlb1 = \ + tst-compathooks-off \ + tst-compathooks-on \ +@@ -93,6 +93,8 @@ tests-exclude-hugetlb1 = \ + tst-mallocstate + tests-malloc-hugetlb1 = \ + $(filter-out $(tests-exclude-hugetlb1), $(tests)) ++tests-malloc-hugetlb2 = \ ++ $(filter-out $(tests-exclude-hugetlb1), $(tests)) + + # -lmcheck needs __malloc_initialize_hook, which was deprecated in 2.24. + ifeq ($(have-GLIBC_2.23)$(build-shared),yesyes) +diff --git a/malloc/arena.c b/malloc/arena.c +index cd00c7bef4..9a6e1af2bd 100644 +--- a/malloc/arena.c ++++ b/malloc/arena.c +@@ -230,7 +230,7 @@ TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t) + TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t) + #endif + TUNABLE_CALLBACK_FNDECL (set_mxfast, size_t) +-TUNABLE_CALLBACK_FNDECL (set_hugetlb, int32_t) ++TUNABLE_CALLBACK_FNDECL (set_hugetlb, size_t) + #else + /* Initialization routine. */ + #include <string.h> +@@ -331,7 +331,7 @@ ptmalloc_init (void) + TUNABLE_CALLBACK (set_tcache_unsorted_limit)); + # endif + TUNABLE_GET (mxfast, size_t, TUNABLE_CALLBACK (set_mxfast)); +- TUNABLE_GET (hugetlb, int32_t, TUNABLE_CALLBACK (set_hugetlb)); ++ TUNABLE_GET (hugetlb, size_t, TUNABLE_CALLBACK (set_hugetlb)); + #else + if (__glibc_likely (_environ != NULL)) + { +diff --git a/malloc/malloc.c b/malloc/malloc.c +index 6b6ec53db1..75efdc2ee7 100644 +--- a/malloc/malloc.c ++++ b/malloc/malloc.c +@@ -1883,6 +1883,10 @@ struct malloc_par + #if HAVE_TUNABLES + /* Transparent Large Page support. */ + INTERNAL_SIZE_T thp_pagesize; ++ /* A value different than 0 means to align mmap allocation to hp_pagesize ++ add hp_flags on flags. 
*/ ++ INTERNAL_SIZE_T hp_pagesize; ++ int hp_flags; + #endif + + /* Memory map support */ +@@ -2440,7 +2444,10 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av) + if (mm == MAP_FAILED) + return mm; + +- madvise_thp (mm, size); ++#ifdef MAP_HUGETLB ++ if (!(extra_flags & MAP_HUGETLB)) ++ madvise_thp (mm, size); ++#endif + + /* + The offset to the start of the mmapped region is stored in the prev_size +@@ -2528,7 +2535,18 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av) + || ((unsigned long) (nb) >= (unsigned long) (mp_.mmap_threshold) + && (mp_.n_mmaps < mp_.n_mmaps_max))) + { +- char *mm = sysmalloc_mmap (nb, pagesize, 0, av); ++ char *mm; ++#if HAVE_TUNABLES ++ if (mp_.hp_pagesize > 0 && nb >= mp_.hp_pagesize) ++ { ++ /* There is no need to isse the THP madvise call if Huge Pages are ++ used directly. */ ++ mm = sysmalloc_mmap (nb, mp_.hp_pagesize, mp_.hp_flags, av); ++ if (mm != MAP_FAILED) ++ return mm; ++ } ++#endif ++ mm = sysmalloc_mmap (nb, pagesize, 0, av); + if (mm != MAP_FAILED) + return mm; + tried_mmap = true; +@@ -2609,7 +2627,9 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av) + } + else if (!tried_mmap) + { +- /* We can at least try to use to mmap memory. */ ++ /* We can at least try to use to mmap memory. If new_heap fails ++ it is unlikely that trying to allocate huge pages will ++ succeed. */ + char *mm = sysmalloc_mmap (nb, pagesize, 0, av); + if (mm != MAP_FAILED) + return mm; +@@ -5383,7 +5403,7 @@ do_set_mxfast (size_t value) + + #if HAVE_TUNABLES + static __always_inline int +-do_set_hugetlb (int32_t value) ++do_set_hugetlb (size_t value) + { + if (value == 1) + { +@@ -5395,6 +5415,9 @@ do_set_hugetlb (int32_t value) + if (thp_mode == malloc_thp_mode_madvise) + mp_.thp_pagesize = __malloc_default_thp_pagesize (); + } ++ else if (value >= 2) ++ __malloc_hugepage_config (value == 2 ? 
0 : value, &mp_.hp_pagesize, ++ &mp_.hp_flags); + return 0; + } + #endif +diff --git a/manual/tunables.texi b/manual/tunables.texi +index 9ca6e3f603..58a47b2e9b 100644 +--- a/manual/tunables.texi ++++ b/manual/tunables.texi +@@ -278,6 +278,13 @@ default value is @code{0}, which disables any additional support on + Setting its value to @code{1} enables the use of @code{madvise} with + @code{MADV_HUGEPAGE} after memory allocation with @code{mmap}. It is enabled + only if the system supports Transparent Huge Page (currently only on Linux). ++ ++Setting its value to @code{2} enables the use of Huge Page directly with ++@code{mmap} with the use of @code{MAP_HUGETLB} flag. The huge page size ++to use will be the default one provided by the system. A value larger than ++@code{2} specifies huge page size, which will be matched against the system ++supported ones. If provided value is invalid, @code{MAP_HUGETLB} will not ++be used. + @end deftp + + @node Dynamic Linking Tunables +diff --git a/sysdeps/generic/malloc-hugepages.c b/sysdeps/generic/malloc-hugepages.c +index 8fb459a263..946284a33c 100644 +--- a/sysdeps/generic/malloc-hugepages.c ++++ b/sysdeps/generic/malloc-hugepages.c +@@ -29,3 +29,11 @@ __malloc_thp_mode (void) + { + return malloc_thp_mode_not_supported; + } ++ ++/* Return the default transparent huge page size. 
*/ ++void ++__malloc_hugepage_config (size_t requested, size_t *pagesize, int *flags) ++{ ++ *pagesize = 0; ++ *flags = 0; ++} +diff --git a/sysdeps/generic/malloc-hugepages.h b/sysdeps/generic/malloc-hugepages.h +index f5a442e328..75cda3796a 100644 +--- a/sysdeps/generic/malloc-hugepages.h ++++ b/sysdeps/generic/malloc-hugepages.h +@@ -34,4 +34,11 @@ enum malloc_thp_mode_t + + enum malloc_thp_mode_t __malloc_thp_mode (void) attribute_hidden; + ++/* Return the supported huge page size from the REQUESTED sizes on PAGESIZE ++ along with the required extra mmap flags on FLAGS, Requesting the value ++ of 0 returns the default huge page size, otherwise the value will be ++ matched against the sizes supported by the system. */ ++void __malloc_hugepage_config (size_t requested, size_t *pagesize, int *flags) ++ attribute_hidden; ++ + #endif /* _MALLOC_HUGEPAGES_H */ +diff --git a/sysdeps/unix/sysv/linux/malloc-hugepages.c b/sysdeps/unix/sysv/linux/malloc-hugepages.c +index 7497e07260..0e05291d61 100644 +--- a/sysdeps/unix/sysv/linux/malloc-hugepages.c ++++ b/sysdeps/unix/sysv/linux/malloc-hugepages.c +@@ -17,8 +17,10 @@ + not, see <https://www.gnu.org/licenses/>. */ + + #include <intprops.h> ++#include <dirent.h> + #include <malloc-hugepages.h> + #include <not-cancel.h> ++#include <sys/mman.h> + + unsigned long int + __malloc_default_thp_pagesize (void) +@@ -72,3 +74,128 @@ __malloc_thp_mode (void) + } + return malloc_thp_mode_not_supported; + } ++ ++static size_t ++malloc_default_hugepage_size (void) ++{ ++ int fd = __open64_nocancel ("/proc/meminfo", O_RDONLY); ++ if (fd == -1) ++ return 0; ++ ++ size_t hpsize = 0; ++ ++ char buf[512]; ++ off64_t off = 0; ++ while (1) ++ { ++ ssize_t r = __pread64_nocancel (fd, buf, sizeof (buf) - 1, off); ++ if (r < 0) ++ break; ++ buf[r] = '\0'; ++ ++ /* If the tag is not found, read the last line again. 
*/ ++ const char *s = strstr (buf, "Hugepagesize:"); ++ if (s == NULL) ++ { ++ char *nl = strrchr (buf, '\n'); ++ if (nl == NULL) ++ break; ++ off += (nl + 1) - buf; ++ continue; ++ } ++ ++ /* The default huge page size is in the form: ++ Hugepagesize: NUMBER kB */ ++ s += sizeof ("Hugepagesize: ") - 1; ++ for (int i = 0; (s[i] >= '0' && s[i] <= '9') || s[i] == ' '; i++) ++ { ++ if (s[i] == ' ') ++ continue; ++ hpsize *= 10; ++ hpsize += s[i] - '0'; ++ } ++ hpsize *= 1024; ++ break; ++ } ++ ++ __close_nocancel (fd); ++ ++ return hpsize; ++} ++ ++static inline int ++hugepage_flags (size_t pagesize) ++{ ++ return MAP_HUGETLB | (__builtin_ctzll (pagesize) << MAP_HUGE_SHIFT); ++} ++ ++void ++__malloc_hugepage_config (size_t requested, size_t *pagesize, int *flags) ++{ ++ *pagesize = 0; ++ *flags = 0; ++ ++ if (requested == 0) ++ { ++ *pagesize = malloc_default_hugepage_size (); ++ if (*pagesize != 0) ++ *flags = hugepage_flags (*pagesize); ++ return; ++ } ++ ++ /* Each entry represents a supported huge page in the form of: ++ hugepages-<size>kB. */ ++ int dirfd = __open64_nocancel ("/sys/kernel/mm/hugepages", ++ O_RDONLY | O_DIRECTORY, 0); ++ if (dirfd == -1) ++ return; ++ ++ char buffer[1024]; ++ while (true) ++ { ++#if !IS_IN(libc) ++# define __getdents64 getdents64 ++#endif ++ ssize_t ret = __getdents64 (dirfd, buffer, sizeof (buffer)); ++ if (ret == -1) ++ break; ++ else if (ret == 0) ++ break; ++ ++ bool found = false; ++ char *begin = buffer, *end = buffer + ret; ++ while (begin != end) ++ { ++ unsigned short int d_reclen; ++ memcpy (&d_reclen, begin + offsetof (struct dirent64, d_reclen), ++ sizeof (d_reclen)); ++ const char *dname = begin + offsetof (struct dirent64, d_name); ++ begin += d_reclen; ++ ++ if (dname[0] == '.' 
++ || strncmp (dname, "hugepages-", sizeof ("hugepages-") - 1) != 0) ++ continue; ++ ++ size_t hpsize = 0; ++ const char *sizestr = dname + sizeof ("hugepages-") - 1; ++ for (int i = 0; sizestr[i] >= '0' && sizestr[i] <= '9'; i++) ++ { ++ hpsize *= 10; ++ hpsize += sizestr[i] - '0'; ++ } ++ hpsize *= 1024; ++ ++ if (hpsize == requested) ++ { ++ *pagesize = hpsize; ++ *flags = hugepage_flags (*pagesize); ++ found = true; ++ break; ++ } ++ } ++ if (found) ++ break; ++ } ++ ++ __close_nocancel (dirfd); ++} +-- +2.33.0 + |