From 98d5fcb8d099a1a868e032c89891c395a2f365c5 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Mon, 16 Aug 2021 15:08:27 -0300
Subject: [PATCH 4/7] malloc: Add Huge Page support for mmap

With the morecore hook removed, there is no easy way to provide huge
page support in the glibc allocator without resorting to transparent
huge pages (THP).  And some users and programs do prefer to use huge
pages directly instead of THP for multiple reasons: no splitting or
re-merging by the VM, no TLB shootdowns for running processes, fast
allocation from the reserve pool, no competition with the rest of the
processes (unlike THP), no swapping at all, etc.

This patch extends the 'glibc.malloc.hugetlb' tunable: the value '2'
means to use huge pages directly with the system default size, while a
value larger than '2' specifies a huge page size that is matched
against the ones supported by the system.
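
For instance (the program name here is only a placeholder), running
with GLIBC_TUNABLES=glibc.malloc.hugetlb=2 ./app selects the system
default huge page size, while GLIBC_TUNABLES=glibc.malloc.hugetlb=2097152 ./app
requests 2 MiB pages, provided the system supports that size.
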
Currently only memory allocated by sysmalloc() is handled; the arenas
still use the default system page size.  The sketch after this
paragraph illustrates the resulting mmap call.
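
For reference, the mmap call that sysmalloc_mmap() ends up issuing when
huge pages are used directly is roughly equivalent to the standalone
sketch below.  This is not glibc code: the 2 MiB size and the error
handling are illustrative assumptions; the patch itself obtains the
real page size from /proc/meminfo or /sys/kernel/mm/hugepages.

  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>

  #ifndef MAP_HUGE_SHIFT
  # define MAP_HUGE_SHIFT 26   /* Matches <linux/mman.h>.  */
  #endif

  int
  main (void)
  {
    /* Assume 2 MiB huge pages; the patch reads the supported sizes
       from /sys/kernel/mm/hugepages (or /proc/meminfo for the
       default) instead of hard-coding one.  */
    size_t hpsize = 2 * 1024 * 1024;

    /* Encode log2 (page size) in the mmap flags, as the patch's
       hugepage_flags () helper does.  */
    int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB
                | (__builtin_ctzll (hpsize) << MAP_HUGE_SHIFT);

    /* The length must be a multiple of the huge page size.  */
    void *p = mmap (NULL, hpsize, PROT_READ | PROT_WRITE, flags, -1, 0);
    if (p == MAP_FAILED)
      {
        /* Typically means the reserved pool is empty; sysmalloc falls
           back to a regular mmap in that case.  */
        perror ("mmap (MAP_HUGETLB)");
        return 1;
      }

    memset (p, 0, hpsize);
    munmap (p, hpsize);
    return 0;
  }
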
For testing, a new rule, tests-malloc-hugetlb2, is added; it runs the
selected tests with the required GLIBC_TUNABLES setting.  On systems
without a reserved huge page pool this mainly stresses the
mmap(MAP_HUGETLB) allocation-failure path.  To improve test coverage it
is required to first create a pool with some reserved pages, as noted
below.
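
For reference, on most Linux systems a default-size pool can be
reserved before running the tests with something like
'echo 64 > /proc/sys/vm/nr_hugepages' as root (the page count here is
arbitrary), or with the equivalent vm.nr_hugepages sysctl.
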
Checked on x86_64-linux-gnu.
Reviewed-by: DJ Delorie <dj@redhat.com>
---
NEWS | 8 +-
Rules | 17 +++
elf/dl-tunables.list | 3 +-
elf/tst-rtld-list-tunables.exp | 2 +-
malloc/Makefile | 8 +-
malloc/arena.c | 4 +-
malloc/malloc.c | 31 ++++-
manual/tunables.texi | 7 ++
sysdeps/generic/malloc-hugepages.c | 8 ++
sysdeps/generic/malloc-hugepages.h | 7 ++
sysdeps/unix/sysv/linux/malloc-hugepages.c | 127 +++++++++++++++++++++
11 files changed, 207 insertions(+), 15 deletions(-)
diff --git a/NEWS b/NEWS
index 3b94dd209c..c7200cd4e8 100644
--- a/NEWS
+++ b/NEWS
@@ -93,9 +93,11 @@ Major new features:
Restartable Sequences.
* On Linux, a new tunable, glibc.malloc.hugetlb, can be used to
- make malloc issue madvise plus MADV_HUGEPAGE on mmap and sbrk calls.
- Setting this might improve performance with Transparent Huge Pages madvise
- mode depending of the workload.
+ either make malloc issue madvise plus MADV_HUGEPAGE on mmap and sbrk
+ calls, or make it use huge pages directly with mmap calls with the
+ MAP_HUGETLB flag. The former can improve performance when Transparent
+ Huge Pages is set to 'madvise' mode, while the latter uses the system
+ reserved huge pages.
Deprecated and removed features, and other changes affecting compatibility:
diff --git a/Rules b/Rules
index 5f5d9ba4cc..be34982daa 100644
--- a/Rules
+++ b/Rules
@@ -158,6 +158,7 @@ tests: $(tests:%=$(objpfx)%.out) $(tests-internal:%=$(objpfx)%.out) \
$(tests-mcheck:%=$(objpfx)%-mcheck.out) \
$(tests-malloc-check:%=$(objpfx)%-malloc-check.out) \
$(tests-malloc-hugetlb1:%=$(objpfx)%-malloc-hugetlb1.out) \
+ $(tests-malloc-hugetlb2:%=$(objpfx)%-malloc-hugetlb2.out) \
$(tests-special) $(tests-printers-out)
xtests: tests $(xtests:%=$(objpfx)%.out) $(xtests-special)
endif
@@ -170,6 +171,7 @@ else
tests-expected = $(tests) $(tests-internal) $(tests-printers) \
$(tests-container) $(tests-malloc-check:%=%-malloc-check) \
$(tests-malloc-hugetlb1:%=%-malloc-hugetlb1) \
+ $(tests-malloc-hugetlb2:%=%-malloc-hugetlb2) \
$(tests-mcheck:%=%-mcheck)
endif
tests:
@@ -199,6 +201,7 @@ endif
binaries-mcheck-tests = $(tests-mcheck:%=%-mcheck)
binaries-malloc-check-tests = $(tests-malloc-check:%=%-malloc-check)
binaries-malloc-hugetlb1-tests = $(tests-malloc-hugetlb1:%=%-malloc-hugetlb1)
+binaries-malloc-hugetlb2-tests = $(tests-malloc-hugetlb2:%=%-malloc-hugetlb2)
else
binaries-all-notests =
binaries-all-tests = $(tests) $(tests-internal) $(xtests) $(test-srcs)
@@ -211,6 +214,7 @@ binaries-pie-notests =
binaries-mcheck-tests =
binaries-malloc-check-tests =
binaries-malloc-hugetlb1-tests =
+binaries-malloc-hugetlb2-tests =
endif
binaries-pie = $(binaries-pie-tests) $(binaries-pie-notests)
@@ -259,6 +263,14 @@ $(addprefix $(objpfx),$(binaries-malloc-hugetlb1-tests)): %-malloc-hugetlb1: %.o
$(+link-tests)
endif
+ifneq "$(strip $(binaries-malloc-hugetlb2-tests))" ""
+$(addprefix $(objpfx),$(binaries-malloc-hugetlb2-tests)): %-malloc-hugetlb2: %.o \
+ $(link-extra-libs-tests) \
+ $(sort $(filter $(common-objpfx)lib%,$(link-libc))) \
+ $(addprefix $(csu-objpfx),start.o) $(+preinit) $(+postinit)
+ $(+link-tests)
+endif
+
ifneq "$(strip $(binaries-pie-tests))" ""
$(addprefix $(objpfx),$(binaries-pie-tests)): %: %.o \
$(link-extra-libs-tests) \
@@ -302,6 +314,11 @@ $(1)-malloc-hugetlb1-ENV += GLIBC_TUNABLES=glibc.malloc.hugetlb=1
endef
$(foreach t,$(tests-malloc-hugetlb1),$(eval $(call malloc-hugetlb1-ENVS,$(t))))
+# All malloc-hugetlb2 tests will be run with GLIBC_TUNABLES=glibc.malloc.hugetlb=2
+define malloc-hugetlb2-ENVS
+$(1)-malloc-hugetlb2-ENV += GLIBC_TUNABLES=glibc.malloc.hugetlb=2
+endef
+$(foreach t,$(tests-malloc-hugetlb2),$(eval $(call malloc-hugetlb2-ENVS,$(t))))
# mcheck tests need the debug DSO to support -lmcheck.
define mcheck-ENVS
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index d1fd3f3e91..845d521a43 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -93,9 +93,8 @@ glibc {
security_level: SXID_IGNORE
}
hugetlb {
- type: INT_32
+ type: SIZE_T
minval: 0
- maxval: 1
}
}
cpu {
diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp
index d8e363f2c5..cdfdb56a94 100644
--- a/elf/tst-rtld-list-tunables.exp
+++ b/elf/tst-rtld-list-tunables.exp
@@ -1,7 +1,7 @@
glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+)
glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+)
glibc.malloc.check: 0 (min: 0, max: 3)
-glibc.malloc.hugetlb: 0 (min: 0, max: 1)
+glibc.malloc.hugetlb: 0x0 (min: 0x0, max: 0x[f]+)
glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647)
glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+)
glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+)
diff --git a/malloc/Makefile b/malloc/Makefile
index 0137595e17..e9a6666d22 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -78,9 +78,9 @@ tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
$(tests-static),$(tests))
-# Run all testes with GLIBC_TUNABLES=glibc.malloc.hugetlb=1 that check the
-# Transparent Huge Pages support. We need exclude some tests that define
-# the ENV vars.
+# Run all tests with GLIBC_TUNABLES=glibc.malloc.hugetlb={1,2} which check
+# the Transparent Huge Pages support (1) or direct huge page support (2).
+# We need to exclude some tests that define the ENV vars.
tests-exclude-hugetlb1 = \
tst-compathooks-off \
tst-compathooks-on \
@@ -93,6 +93,8 @@ tests-exclude-hugetlb1 = \
tst-mallocstate
tests-malloc-hugetlb1 = \
$(filter-out $(tests-exclude-hugetlb1), $(tests))
+tests-malloc-hugetlb2 = \
+ $(filter-out $(tests-exclude-hugetlb1), $(tests))
# -lmcheck needs __malloc_initialize_hook, which was deprecated in 2.24.
ifeq ($(have-GLIBC_2.23)$(build-shared),yesyes)
diff --git a/malloc/arena.c b/malloc/arena.c
index cd00c7bef4..9a6e1af2bd 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -230,7 +230,7 @@ TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t)
TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
#endif
TUNABLE_CALLBACK_FNDECL (set_mxfast, size_t)
-TUNABLE_CALLBACK_FNDECL (set_hugetlb, int32_t)
+TUNABLE_CALLBACK_FNDECL (set_hugetlb, size_t)
#else
/* Initialization routine. */
#include <string.h>
@@ -331,7 +331,7 @@ ptmalloc_init (void)
TUNABLE_CALLBACK (set_tcache_unsorted_limit));
# endif
TUNABLE_GET (mxfast, size_t, TUNABLE_CALLBACK (set_mxfast));
- TUNABLE_GET (hugetlb, int32_t, TUNABLE_CALLBACK (set_hugetlb));
+ TUNABLE_GET (hugetlb, size_t, TUNABLE_CALLBACK (set_hugetlb));
#else
if (__glibc_likely (_environ != NULL))
{
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 6b6ec53db1..75efdc2ee7 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -1883,6 +1883,10 @@ struct malloc_par
#if HAVE_TUNABLES
/* Transparent Large Page support. */
INTERNAL_SIZE_T thp_pagesize;
+ /* A value different than 0 means to align mmap allocations to hp_pagesize
+ and to add hp_flags to the mmap flags. */
+ INTERNAL_SIZE_T hp_pagesize;
+ int hp_flags;
#endif
/* Memory map support */
@@ -2440,7 +2444,10 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av)
if (mm == MAP_FAILED)
return mm;
- madvise_thp (mm, size);
+#ifdef MAP_HUGETLB
+ if (!(extra_flags & MAP_HUGETLB))
+ madvise_thp (mm, size);
+#endif
/*
The offset to the start of the mmapped region is stored in the prev_size
@@ -2528,7 +2535,18 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av)
|| ((unsigned long) (nb) >= (unsigned long) (mp_.mmap_threshold)
&& (mp_.n_mmaps < mp_.n_mmaps_max)))
{
- char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
+ char *mm;
+#if HAVE_TUNABLES
+ if (mp_.hp_pagesize > 0 && nb >= mp_.hp_pagesize)
+ {
+ /* There is no need to issue the THP madvise call if Huge Pages are
+ used directly. */
+ mm = sysmalloc_mmap (nb, mp_.hp_pagesize, mp_.hp_flags, av);
+ if (mm != MAP_FAILED)
+ return mm;
+ }
+#endif
+ mm = sysmalloc_mmap (nb, pagesize, 0, av);
if (mm != MAP_FAILED)
return mm;
tried_mmap = true;
@@ -2609,7 +2627,9 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av)
}
else if (!tried_mmap)
{
- /* We can at least try to use to mmap memory. */
+ /* We can at least try to mmap memory. If new_heap fails
+ it is unlikely that trying to allocate huge pages will
+ succeed. */
char *mm = sysmalloc_mmap (nb, pagesize, 0, av);
if (mm != MAP_FAILED)
return mm;
@@ -5383,7 +5403,7 @@ do_set_mxfast (size_t value)
#if HAVE_TUNABLES
static __always_inline int
-do_set_hugetlb (int32_t value)
+do_set_hugetlb (size_t value)
{
if (value == 1)
{
@@ -5395,6 +5415,9 @@ do_set_hugetlb (int32_t value)
if (thp_mode == malloc_thp_mode_madvise)
mp_.thp_pagesize = __malloc_default_thp_pagesize ();
}
+ else if (value >= 2)
+ __malloc_hugepage_config (value == 2 ? 0 : value, &mp_.hp_pagesize,
+ &mp_.hp_flags);
return 0;
}
#endif
diff --git a/manual/tunables.texi b/manual/tunables.texi
index 9ca6e3f603..58a47b2e9b 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -278,6 +278,13 @@ default value is @code{0}, which disables any additional support on
Setting its value to @code{1} enables the use of @code{madvise} with
@code{MADV_HUGEPAGE} after memory allocation with @code{mmap}. It is enabled
only if the system supports Transparent Huge Page (currently only on Linux).
+
+Setting its value to @code{2} enables the use of huge pages directly with
+@code{mmap} and the @code{MAP_HUGETLB} flag. The huge page size used will
+be the default one provided by the system. A value larger than @code{2}
+specifies the huge page size, which will be matched against the sizes
+supported by the system. If the provided value is invalid,
+@code{MAP_HUGETLB} will not be used.
@end deftp
@node Dynamic Linking Tunables
diff --git a/sysdeps/generic/malloc-hugepages.c b/sysdeps/generic/malloc-hugepages.c
index 8fb459a263..946284a33c 100644
--- a/sysdeps/generic/malloc-hugepages.c
+++ b/sysdeps/generic/malloc-hugepages.c
@@ -29,3 +29,11 @@ __malloc_thp_mode (void)
{
return malloc_thp_mode_not_supported;
}
+
+/* Huge pages are not supported here; report no page size and no flags. */
+void
+__malloc_hugepage_config (size_t requested, size_t *pagesize, int *flags)
+{
+ *pagesize = 0;
+ *flags = 0;
+}
diff --git a/sysdeps/generic/malloc-hugepages.h b/sysdeps/generic/malloc-hugepages.h
index f5a442e328..75cda3796a 100644
--- a/sysdeps/generic/malloc-hugepages.h
+++ b/sysdeps/generic/malloc-hugepages.h
@@ -34,4 +34,11 @@ enum malloc_thp_mode_t
enum malloc_thp_mode_t __malloc_thp_mode (void) attribute_hidden;
+/* Return the huge page size that matches REQUESTED in *PAGESIZE, along
+ with the required extra mmap flags in *FLAGS. A REQUESTED value of 0
+ selects the system default huge page size; otherwise the value is
+ matched against the sizes supported by the system. */
+void __malloc_hugepage_config (size_t requested, size_t *pagesize, int *flags)
+ attribute_hidden;
+
#endif /* _MALLOC_HUGEPAGES_H */
diff --git a/sysdeps/unix/sysv/linux/malloc-hugepages.c b/sysdeps/unix/sysv/linux/malloc-hugepages.c
index 7497e07260..0e05291d61 100644
--- a/sysdeps/unix/sysv/linux/malloc-hugepages.c
+++ b/sysdeps/unix/sysv/linux/malloc-hugepages.c
@@ -17,8 +17,10 @@
not, see <https://www.gnu.org/licenses/>. */
#include <intprops.h>
+#include <dirent.h>
#include <malloc-hugepages.h>
#include <not-cancel.h>
+#include <sys/mman.h>
unsigned long int
__malloc_default_thp_pagesize (void)
@@ -72,3 +74,128 @@ __malloc_thp_mode (void)
}
return malloc_thp_mode_not_supported;
}
+
+static size_t
+malloc_default_hugepage_size (void)
+{
+ int fd = __open64_nocancel ("/proc/meminfo", O_RDONLY);
+ if (fd == -1)
+ return 0;
+
+ size_t hpsize = 0;
+
+ char buf[512];
+ off64_t off = 0;
+ while (1)
+ {
+ ssize_t r = __pread64_nocancel (fd, buf, sizeof (buf) - 1, off);
+ if (r < 0)
+ break;
+ buf[r] = '\0';
+
+ /* If the tag is not found, read the last line again. */
+ const char *s = strstr (buf, "Hugepagesize:");
+ if (s == NULL)
+ {
+ char *nl = strrchr (buf, '\n');
+ if (nl == NULL)
+ break;
+ off += (nl + 1) - buf;
+ continue;
+ }
+
+ /* The default huge page size is in the form:
+ Hugepagesize: NUMBER kB */
+ s += sizeof ("Hugepagesize: ") - 1;
+ for (int i = 0; (s[i] >= '0' && s[i] <= '9') || s[i] == ' '; i++)
+ {
+ if (s[i] == ' ')
+ continue;
+ hpsize *= 10;
+ hpsize += s[i] - '0';
+ }
+ hpsize *= 1024;
+ break;
+ }
+
+ __close_nocancel (fd);
+
+ return hpsize;
+}
+
+static inline int
+hugepage_flags (size_t pagesize)
+{
+ return MAP_HUGETLB | (__builtin_ctzll (pagesize) << MAP_HUGE_SHIFT);
+}
+
+void
+__malloc_hugepage_config (size_t requested, size_t *pagesize, int *flags)
+{
+ *pagesize = 0;
+ *flags = 0;
+
+ if (requested == 0)
+ {
+ *pagesize = malloc_default_hugepage_size ();
+ if (*pagesize != 0)
+ *flags = hugepage_flags (*pagesize);
+ return;
+ }
+
+ /* Each entry represents a supported huge page in the form of:
+ hugepages-<size>kB. */
+ int dirfd = __open64_nocancel ("/sys/kernel/mm/hugepages",
+ O_RDONLY | O_DIRECTORY, 0);
+ if (dirfd == -1)
+ return;
+
+ char buffer[1024];
+ while (true)
+ {
+#if !IS_IN(libc)
+# define __getdents64 getdents64
+#endif
+ ssize_t ret = __getdents64 (dirfd, buffer, sizeof (buffer));
+ if (ret == -1)
+ break;
+ else if (ret == 0)
+ break;
+
+ bool found = false;
+ char *begin = buffer, *end = buffer + ret;
+ while (begin != end)
+ {
+ unsigned short int d_reclen;
+ memcpy (&d_reclen, begin + offsetof (struct dirent64, d_reclen),
+ sizeof (d_reclen));
+ const char *dname = begin + offsetof (struct dirent64, d_name);
+ begin += d_reclen;
+
+ if (dname[0] == '.'
+ || strncmp (dname, "hugepages-", sizeof ("hugepages-") - 1) != 0)
+ continue;
+
+ size_t hpsize = 0;
+ const char *sizestr = dname + sizeof ("hugepages-") - 1;
+ for (int i = 0; sizestr[i] >= '0' && sizestr[i] <= '9'; i++)
+ {
+ hpsize *= 10;
+ hpsize += sizestr[i] - '0';
+ }
+ hpsize *= 1024;
+
+ if (hpsize == requested)
+ {
+ *pagesize = hpsize;
+ *flags = hugepage_flags (*pagesize);
+ found = true;
+ break;
+ }
+ }
+ if (found)
+ break;
+ }
+
+ __close_nocancel (dirfd);
+}
--
2.33.0