1 #define _GNU_SOURCE
2 #include <ctype.h>
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <limits.h>
6 #include <dirent.h>
7 #include <signal.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <stdbool.h>
11 #include <string.h>
12 #include <unistd.h>
13
14 #include <linux/mman.h>
15 #include <sys/mman.h>
16 #include <sys/wait.h>
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <sys/sysmacros.h>
20 #include <sys/vfs.h>
21
22 #include "linux/magic.h"
23
24 #include "vm_util.h"
25 #include "thp_settings.h"
26
/* Fixed virtual address every test mapping is placed at (1 GiB). */
#define BASE_ADDR ((void *)(1UL << 30))
static unsigned long hpage_pmd_size;	/* bytes covered by one PMD hugepage */
static unsigned long page_size;		/* base page size */
static int hpage_pmd_nr;		/* base pages per PMD hugepage */
static int anon_order;			/* mTHP order for anon faults (-s flag) */

#define PID_SMAPS "/proc/self/smaps"
#define TEST_FILE "collapse_test_file"

#define MAX_LINE_LENGTH 500
37
/* Kind of backing memory a test VMA uses. */
enum vma_type {
	VMA_ANON,
	VMA_FILE,
	VMA_SHMEM,
};
43
/*
 * Backend-specific operations: create/destroy a test area, fault pages
 * into a sub-range of it, and verify whether it is PMD-mapped by the
 * expected number of hugepages.
 */
struct mem_ops {
	void *(*setup_area)(int nr_hpages);
	void (*cleanup_area)(void *p, unsigned long size);
	void (*fault)(void *p, unsigned long start, unsigned long end);
	bool (*check_huge)(void *addr, int nr_hpages);
	const char *name;
};
51
/* Backends selected by parse_test_type(); NULL means "not requested". */
static struct mem_ops *file_ops;
static struct mem_ops *anon_ops;
static struct mem_ops *shmem_ops;
55
/*
 * A collapse mechanism under test (khugepaged scanning or MADV_COLLAPSE).
 * enforce_pte_scan_limits is true when the mechanism honors the
 * khugepaged/max_ptes_* sysfs tunables.
 */
struct collapse_context {
	void (*collapse)(const char *msg, char *p, int nr_hpages,
			 struct mem_ops *ops, bool expect);
	bool enforce_pte_scan_limits;
	const char *name;
};
62
/* Contexts selected by parse_test_type(); NULL means "not requested". */
static struct collapse_context *khugepaged_context;
static struct collapse_context *madvise_context;
65
/* State for file-backed tests, populated by get_finfo(). */
struct file_info {
	const char *dir;	/* directory holding the test file */
	char path[PATH_MAX];	/* full path of the test file */
	enum vma_type type;	/* VMA_FILE, or VMA_SHMEM for a tmpfs dir */
	int fd;			/* fd of the file/memfd under test */
	char dev_queue_read_ahead_path[PATH_MAX]; /* disk's read_ahead_kb sysfs file */
};
73
static struct file_info finfo;
/* Set in forked children so only the parent restores THP settings at exit. */
static bool skip_settings_restore;
static int exit_status;	/* accumulated failure count; becomes the exit code */
77
/* Report a passing check in green. */
static void success(const char *msg)
{
	fputs(" \e[32m", stdout);
	fputs(msg, stdout);
	fputs("\e[0m\n", stdout);
}
82
fail(const char * msg)83 static void fail(const char *msg)
84 {
85 printf(" \e[31m%s\e[0m\n", msg);
86 exit_status++;
87 }
88
/* Report a skipped check in yellow; does not affect exit_status. */
static void skip(const char *msg)
{
	fputs(" \e[33m", stdout);
	fputs(msg, stdout);
	fputs("\e[0m\n", stdout);
}
93
restore_settings_atexit(void)94 static void restore_settings_atexit(void)
95 {
96 if (skip_settings_restore)
97 return;
98
99 printf("Restore THP and khugepaged settings...");
100 thp_restore_settings();
101 success("OK");
102
103 skip_settings_restore = true;
104 }
105
restore_settings(int sig)106 static void restore_settings(int sig)
107 {
108 /* exit() will invoke the restore_settings_atexit handler. */
109 exit(sig ? EXIT_FAILURE : exit_status);
110 }
111
save_settings(void)112 static void save_settings(void)
113 {
114 printf("Save THP and khugepaged settings...");
115 if (file_ops && finfo.type == VMA_FILE)
116 thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path);
117 thp_save_settings();
118
119 success("OK");
120
121 atexit(restore_settings_atexit);
122 signal(SIGTERM, restore_settings);
123 signal(SIGINT, restore_settings);
124 signal(SIGHUP, restore_settings);
125 signal(SIGQUIT, restore_settings);
126 }
127
/*
 * Resolve test-file info for directory @dir: build the test file path,
 * classify the filesystem (tmpfs => VMA_SHMEM, anything else => VMA_FILE)
 * and, for disk-backed filesystems, locate the owning device's
 * queue/read_ahead_kb sysfs control so save_settings() can manage it.
 * Exits on any failure.
 */
static void get_finfo(const char *dir)
{
	struct stat path_stat;
	struct statfs fs;
	char buf[1 << 10];
	char path[PATH_MAX];
	char *str, *end;

	finfo.dir = dir;
	stat(finfo.dir, &path_stat);
	if (!S_ISDIR(path_stat.st_mode)) {
		printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
		exit(EXIT_FAILURE);
	}
	if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
		     finfo.dir) >= sizeof(finfo.path)) {
		printf("%s: Pathname is too long\n", __func__);
		exit(EXIT_FAILURE);
	}
	if (statfs(finfo.dir, &fs)) {
		perror("statfs()");
		exit(EXIT_FAILURE);
	}
	finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
	if (finfo.type == VMA_SHMEM)
		return;	/* tmpfs has no backing disk to tune */

	/* Find owning device's queue/read_ahead_kb control */
	if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
		     major(path_stat.st_dev), minor(path_stat.st_dev))
	    >= sizeof(path)) {
		printf("%s: Pathname is too long\n", __func__);
		exit(EXIT_FAILURE);
	}
	if (read_file(path, buf, sizeof(buf)) < 0) {
		perror("read_file(read_num)");
		exit(EXIT_FAILURE);
	}
	if (strstr(buf, "DEVTYPE=disk")) {
		/* Found it: device is a whole disk; use its queue directly. */
		if (snprintf(finfo.dev_queue_read_ahead_path,
			     sizeof(finfo.dev_queue_read_ahead_path),
			     "/sys/dev/block/%d:%d/queue/read_ahead_kb",
			     major(path_stat.st_dev), minor(path_stat.st_dev))
		    >= sizeof(finfo.dev_queue_read_ahead_path)) {
			printf("%s: Pathname is too long\n", __func__);
			exit(EXIT_FAILURE);
		}
		return;
	}
	if (!strstr(buf, "DEVTYPE=partition")) {
		printf("%s: Unknown device type: %s\n", __func__, path);
		exit(EXIT_FAILURE);
	}
	/*
	 * Partition of block device - need to find actual device.
	 * Using naming convention that devnameN is partition of
	 * device devname.
	 */
	str = strstr(buf, "DEVNAME=");
	if (!str) {
		printf("%s: Could not read: %s", __func__, path);
		exit(EXIT_FAILURE);
	}
	str += 8;	/* skip "DEVNAME=" */
	end = str;
	while (*end) {
		if (isdigit(*end)) {
			/* Truncate at first digit: "sda1" -> "sda". */
			*end = '\0';
			if (snprintf(finfo.dev_queue_read_ahead_path,
				     sizeof(finfo.dev_queue_read_ahead_path),
				     "/sys/block/%s/queue/read_ahead_kb",
				     str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
				printf("%s: Pathname is too long\n", __func__);
				exit(EXIT_FAILURE);
			}
			return;
		}
		++end;
	}
	printf("%s: Could not read: %s\n", __func__, path);
	exit(EXIT_FAILURE);
}
211
/*
 * Return true iff the /proc/self/smaps entry for the VMA starting at
 * @addr reports exactly @size bytes swapped out.
 */
static bool check_swap(void *addr, unsigned long size)
{
	bool swap = false;
	int ret;
	FILE *fp;
	char buffer[MAX_LINE_LENGTH];
	char addr_pattern[MAX_LINE_LENGTH];

	/* "<start>-" prefix of the VMA's header line in smaps. */
	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
		       (unsigned long) addr);
	if (ret >= MAX_LINE_LENGTH) {
		printf("%s: Pattern is too long\n", __func__);
		exit(EXIT_FAILURE);
	}


	fp = fopen(PID_SMAPS, "r");
	if (!fp) {
		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
		exit(EXIT_FAILURE);
	}
	/* Seek to this VMA's entry. */
	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
		goto err_out;

	/* Exact line smaps prints for the expected swap amount. */
	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
		       size >> 10);
	if (ret >= MAX_LINE_LENGTH) {
		printf("%s: Pattern is too long\n", __func__);
		exit(EXIT_FAILURE);
	}
	/*
	 * Fetch the Swap: in the same block and check whether it got
	 * the expected number of hugepages next.
	 */
	if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
		goto err_out;

	if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
		goto err_out;

	swap = true;
err_out:
	fclose(fp);
	return swap;
}
257
alloc_mapping(int nr)258 static void *alloc_mapping(int nr)
259 {
260 void *p;
261
262 p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
263 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
264 if (p != BASE_ADDR) {
265 printf("Failed to allocate VMA at %p\n", BASE_ADDR);
266 exit(EXIT_FAILURE);
267 }
268
269 return p;
270 }
271
fill_memory(int * p,unsigned long start,unsigned long end)272 static void fill_memory(int *p, unsigned long start, unsigned long end)
273 {
274 int i;
275
276 for (i = start / page_size; i < end / page_size; i++)
277 p[i * page_size / sizeof(*p)] = i + 0xdead0000;
278 }
279
280 /*
281 * MADV_COLLAPSE is a best-effort request and may fail if an internal
282 * resource is temporarily unavailable, in which case it will set errno to
283 * EAGAIN. In such a case, immediately reattempt the operation one more
284 * time.
285 */
madvise_collapse_retry(void * p,unsigned long size)286 static int madvise_collapse_retry(void *p, unsigned long size)
287 {
288 bool retry = true;
289 int ret;
290
291 retry:
292 ret = madvise(p, size, MADV_COLLAPSE);
293 if (ret && errno == EAGAIN && retry) {
294 retry = false;
295 goto retry;
296 }
297 return ret;
298 }
299
300 /*
301 * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
302 * validate_memory()'able contents.
303 */
alloc_hpage(struct mem_ops * ops)304 static void *alloc_hpage(struct mem_ops *ops)
305 {
306 void *p = ops->setup_area(1);
307
308 ops->fault(p, 0, hpage_pmd_size);
309
310 /*
311 * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
312 * The latter is ineligible for collapse by MADV_COLLAPSE
313 * while the former might cause MADV_COLLAPSE to race with
314 * khugepaged on low-load system (like a test machine), which
315 * would cause MADV_COLLAPSE to fail with EAGAIN.
316 */
317 printf("Allocate huge page...");
318 if (madvise_collapse_retry(p, hpage_pmd_size)) {
319 perror("madvise(MADV_COLLAPSE)");
320 exit(EXIT_FAILURE);
321 }
322 if (!ops->check_huge(p, 1)) {
323 perror("madvise(MADV_COLLAPSE)");
324 exit(EXIT_FAILURE);
325 }
326 if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
327 perror("madvise(MADV_HUGEPAGE)");
328 exit(EXIT_FAILURE);
329 }
330 success("OK");
331 return p;
332 }
333
validate_memory(int * p,unsigned long start,unsigned long end)334 static void validate_memory(int *p, unsigned long start, unsigned long end)
335 {
336 int i;
337
338 for (i = start / page_size; i < end / page_size; i++) {
339 if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
340 printf("Page %d is corrupted: %#x\n",
341 i, p[i * page_size / sizeof(*p)]);
342 exit(EXIT_FAILURE);
343 }
344 }
345 }
346
/* Anonymous backend: private anonymous mapping at BASE_ADDR. */
static void *anon_setup_area(int nr_hpages)
{
	return alloc_mapping(nr_hpages);
}
351
/* Anonymous backend: tear down the mapping. */
static void anon_cleanup_area(void *p, unsigned long size)
{
	munmap(p, size);
}
356
/* Anonymous backend: fault pages in by writing the verification pattern. */
static void anon_fault(void *p, unsigned long start, unsigned long end)
{
	fill_memory(p, start, end);
}
361
/* Anonymous backend: check AnonHugePages accounting via smaps. */
static bool anon_check_huge(void *addr, int nr_hpages)
{
	return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
}
366
file_setup_area(int nr_hpages)367 static void *file_setup_area(int nr_hpages)
368 {
369 int fd;
370 void *p;
371 unsigned long size;
372
373 unlink(finfo.path); /* Cleanup from previous failed tests */
374 printf("Creating %s for collapse%s...", finfo.path,
375 finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
376 fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
377 777);
378 if (fd < 0) {
379 perror("open()");
380 exit(EXIT_FAILURE);
381 }
382
383 size = nr_hpages * hpage_pmd_size;
384 p = alloc_mapping(nr_hpages);
385 fill_memory(p, 0, size);
386 write(fd, p, size);
387 close(fd);
388 munmap(p, size);
389 success("OK");
390
391 printf("Opening %s read only for collapse...", finfo.path);
392 finfo.fd = open(finfo.path, O_RDONLY, 777);
393 if (finfo.fd < 0) {
394 perror("open()");
395 exit(EXIT_FAILURE);
396 }
397 p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
398 MAP_PRIVATE, finfo.fd, 0);
399 if (p == MAP_FAILED || p != BASE_ADDR) {
400 perror("mmap()");
401 exit(EXIT_FAILURE);
402 }
403
404 /* Drop page cache */
405 write_file("/proc/sys/vm/drop_caches", "3", 2);
406 success("OK");
407 return p;
408 }
409
/* File backend: unmap, close and delete the test file. */
static void file_cleanup_area(void *p, unsigned long size)
{
	munmap(p, size);
	close(finfo.fd);
	unlink(finfo.path);
}
416
file_fault(void * p,unsigned long start,unsigned long end)417 static void file_fault(void *p, unsigned long start, unsigned long end)
418 {
419 if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
420 perror("madvise(MADV_POPULATE_READ");
421 exit(EXIT_FAILURE);
422 }
423 }
424
file_check_huge(void * addr,int nr_hpages)425 static bool file_check_huge(void *addr, int nr_hpages)
426 {
427 switch (finfo.type) {
428 case VMA_FILE:
429 return check_huge_file(addr, nr_hpages, hpage_pmd_size);
430 case VMA_SHMEM:
431 return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
432 default:
433 exit(EXIT_FAILURE);
434 return false;
435 }
436 }
437
shmem_setup_area(int nr_hpages)438 static void *shmem_setup_area(int nr_hpages)
439 {
440 void *p;
441 unsigned long size = nr_hpages * hpage_pmd_size;
442
443 finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
444 if (finfo.fd < 0) {
445 perror("memfd_create()");
446 exit(EXIT_FAILURE);
447 }
448 if (ftruncate(finfo.fd, size)) {
449 perror("ftruncate()");
450 exit(EXIT_FAILURE);
451 }
452 p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
453 0);
454 if (p != BASE_ADDR) {
455 perror("mmap()");
456 exit(EXIT_FAILURE);
457 }
458 return p;
459 }
460
/* Shmem backend: unmap and close the memfd (file is then reclaimed). */
static void shmem_cleanup_area(void *p, unsigned long size)
{
	munmap(p, size);
	close(finfo.fd);
}
466
/* Shmem backend: check ShmemPmdMapped accounting via smaps. */
static bool shmem_check_huge(void *addr, int nr_hpages)
{
	return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
}
471
/* Anonymous-memory backend. */
static struct mem_ops __anon_ops = {
	.setup_area = &anon_setup_area,
	.cleanup_area = &anon_cleanup_area,
	.fault = &anon_fault,
	.check_huge = &anon_check_huge,
	.name = "anon",
};
479
/* File-backed (read-only THP) backend; needs [dir] from the command line. */
static struct mem_ops __file_ops = {
	.setup_area = &file_setup_area,
	.cleanup_area = &file_cleanup_area,
	.fault = &file_fault,
	.check_huge = &file_check_huge,
	.name = "file",
};
487
/*
 * Shmem (memfd) backend. Faulting reuses anon_fault: the mapping is
 * MAP_SHARED and writable, so writing the pattern works directly.
 */
static struct mem_ops __shmem_ops = {
	.setup_area = &shmem_setup_area,
	.cleanup_area = &shmem_cleanup_area,
	.fault = &anon_fault,
	.check_huge = &shmem_check_huge,
	.name = "shmem",
};
495
/*
 * Run MADV_COLLAPSE over @nr_hpages PMD ranges at @p and check both the
 * return value and the resulting mapping against @expect. THP is
 * temporarily disabled via sysfs, which both keeps khugepaged away and
 * proves MADV_COLLAPSE ignores transparent_hugepage/enabled.
 */
static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
			       struct mem_ops *ops, bool expect)
{
	int ret;
	struct thp_settings settings = *thp_current_settings();

	printf("%s...", msg);

	/*
	 * Prevent khugepaged interference and tests that MADV_COLLAPSE
	 * ignores /sys/kernel/mm/transparent_hugepage/enabled
	 */
	settings.thp_enabled = THP_NEVER;
	settings.shmem_enabled = SHMEM_NEVER;
	thp_push_settings(&settings);

	/* Clear VM_NOHUGEPAGE */
	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
	ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
	/* madvise() returns 0 on success, so truthiness must differ from expect. */
	if (((bool)ret) == expect)
		fail("Fail: Bad return value");
	else if (!ops->check_huge(p, expect ? nr_hpages : 0))
		fail("Fail: check_huge()");
	else
		success("OK");

	thp_pop_settings();
}
524
madvise_collapse(const char * msg,char * p,int nr_hpages,struct mem_ops * ops,bool expect)525 static void madvise_collapse(const char *msg, char *p, int nr_hpages,
526 struct mem_ops *ops, bool expect)
527 {
528 /* Sanity check */
529 if (!ops->check_huge(p, 0)) {
530 printf("Unexpected huge page\n");
531 exit(EXIT_FAILURE);
532 }
533 __madvise_collapse(msg, p, nr_hpages, ops, expect);
534 }
535
536 #define TICK 500000
/*
 * Mark the region VM_HUGEPAGE and wait for khugepaged to act on it.
 * Returns true on timeout (~3 seconds); false once the region is huge
 * or two further khugepaged full scans have completed, whichever
 * happens first.
 */
static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
			  struct mem_ops *ops)
{
	int full_scans;
	int timeout = 6; /* 3 seconds */

	/* Sanity check */
	if (!ops->check_huge(p, 0)) {
		printf("Unexpected huge page\n");
		exit(EXIT_FAILURE);
	}

	/* Make the region eligible for khugepaged. */
	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);

	/* Wait until the second full_scan completed */
	full_scans = thp_read_num("khugepaged/full_scans") + 2;

	printf("%s...", msg);
	while (timeout--) {
		if (ops->check_huge(p, nr_hpages))
			break;
		if (thp_read_num("khugepaged/full_scans") >= full_scans)
			break;
		printf(".");
		usleep(TICK);
	}

	return timeout == -1;
}
566
/*
 * khugepaged entry point for struct collapse_context: wait for the
 * background scanner and verify the outcome matches @expect (a timeout
 * counts as success when no collapse was expected).
 */
static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
				struct mem_ops *ops, bool expect)
{
	if (wait_for_scan(msg, p, nr_hpages, ops)) {
		if (expect)
			fail("Timeout");
		else
			success("OK");
		return;
	}

	/*
	 * For file and shmem memory, khugepaged only retracts pte entries after
	 * putting the new hugepage in the page cache. The hugepage must be
	 * subsequently refaulted to install the pmd mapping for the mm.
	 */
	if (ops != &__anon_ops)
		ops->fault(p, 0, nr_hpages * hpage_pmd_size);

	if (ops->check_huge(p, expect ? nr_hpages : 0))
		success("OK");
	else
		fail("Fail");
}
591
/* Background khugepaged scanning; honors the max_ptes_* sysfs limits. */
static struct collapse_context __khugepaged_context = {
	.collapse = &khugepaged_collapse,
	.enforce_pte_scan_limits = true,
	.name = "khugepaged",
};
597
/* Synchronous MADV_COLLAPSE; ignores the max_ptes_* sysfs limits. */
static struct collapse_context __madvise_context = {
	.collapse = &madvise_collapse,
	.enforce_pte_scan_limits = false,
	.name = "madvise",
};
603
is_tmpfs(struct mem_ops * ops)604 static bool is_tmpfs(struct mem_ops *ops)
605 {
606 return ops == &__file_ops && finfo.type == VMA_SHMEM;
607 }
608
is_anon(struct mem_ops * ops)609 static bool is_anon(struct mem_ops *ops)
610 {
611 return ops == &__anon_ops;
612 }
613
/*
 * Environment sanity test: with THP set to "always", a write fault should
 * map a hugepage immediately, and MADV_DONTNEED on one base page should
 * split the PMD again.
 */
static void alloc_at_fault(void)
{
	struct thp_settings settings = *thp_current_settings();
	char *p;

	settings.thp_enabled = THP_ALWAYS;
	thp_push_settings(&settings);

	p = alloc_mapping(1);
	*p = 1;	/* single write fault should instantiate a whole hugepage */
	printf("Allocate huge page on fault...");
	if (check_huge_anon(p, 1, hpage_pmd_size))
		success("OK");
	else
		fail("Fail");

	thp_pop_settings();

	madvise(p, page_size, MADV_DONTNEED);
	printf("Split huge PMD on MADV_DONTNEED...");
	if (check_huge_anon(p, 0, hpage_pmd_size))
		success("OK");
	else
		fail("Fail");
	munmap(p, hpage_pmd_size);
}
640
collapse_full(struct collapse_context * c,struct mem_ops * ops)641 static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
642 {
643 void *p;
644 int nr_hpages = 4;
645 unsigned long size = nr_hpages * hpage_pmd_size;
646
647 p = ops->setup_area(nr_hpages);
648 ops->fault(p, 0, size);
649 c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
650 ops, true);
651 validate_memory(p, 0, size);
652 ops->cleanup_area(p, size);
653 }
654
collapse_empty(struct collapse_context * c,struct mem_ops * ops)655 static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
656 {
657 void *p;
658
659 p = ops->setup_area(1);
660 c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
661 ops->cleanup_area(p, hpage_pmd_size);
662 }
663
collapse_single_pte_entry(struct collapse_context * c,struct mem_ops * ops)664 static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
665 {
666 void *p;
667
668 p = ops->setup_area(1);
669 ops->fault(p, 0, page_size);
670 c->collapse("Collapse PTE table with single PTE entry present", p,
671 1, ops, true);
672 ops->cleanup_area(p, hpage_pmd_size);
673 }
674
/*
 * Leave more than max_ptes_none PTEs empty: contexts that enforce the
 * limit must refuse, others must collapse anyway. Then, for enforcing
 * contexts, leave exactly max_ptes_none empty and expect success.
 */
static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
{
	int max_ptes_none = hpage_pmd_nr / 2;
	struct thp_settings settings = *thp_current_settings();
	void *p;
	/* anon may fault in mTHP chunks of 2^anon_order pages at a time */
	int fault_nr_pages = is_anon(ops) ? 1 << anon_order : 1;

	settings.khugepaged.max_ptes_none = max_ptes_none;
	thp_push_settings(&settings);

	p = ops->setup_area(1);

	if (is_tmpfs(ops)) {
		/* shmem pages always in the page cache */
		printf("tmpfs...");
		skip("Skip");
		goto skip;
	}

	/* Leaves max_ptes_none + fault_nr_pages PTEs empty: exceeds the limit. */
	ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);
	c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
		    ops, !c->enforce_pte_scan_limits);
	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);

	if (c->enforce_pte_scan_limits) {
		/* Exactly max_ptes_none empty PTEs: collapse must succeed. */
		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
		c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
			    true);
		validate_memory(p, 0,
				(hpage_pmd_nr - max_ptes_none) * page_size);
	}
skip:
	ops->cleanup_area(p, hpage_pmd_size);
	thp_pop_settings();
}
710
/*
 * Swap out one page of a fully faulted region, then verify the collapse
 * mechanism swaps it back in and still collapses the PTE table.
 */
static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;

	p = ops->setup_area(1);
	ops->fault(p, 0, hpage_pmd_size);

	printf("Swapout one page...");
	if (madvise(p, page_size, MADV_PAGEOUT)) {
		perror("madvise(MADV_PAGEOUT)");
		exit(EXIT_FAILURE);
	}
	/* Confirm via smaps that the page really went to swap. */
	if (check_swap(p, page_size)) {
		success("OK");
	} else {
		fail("Fail");
		goto out;
	}

	c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
		    true);
	validate_memory(p, 0, hpage_pmd_size);
out:
	ops->cleanup_area(p, hpage_pmd_size);
}
736
/*
 * Swap out max_ptes_swap + 1 pages: enforcing contexts must refuse to
 * collapse. Then, with exactly max_ptes_swap pages swapped out, the
 * collapse must succeed.
 */
static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
{
	int max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap");
	void *p;

	p = ops->setup_area(1);
	ops->fault(p, 0, hpage_pmd_size);

	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
		perror("madvise(MADV_PAGEOUT)");
		exit(EXIT_FAILURE);
	}
	if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
		success("OK");
	} else {
		fail("Fail");
		goto out;
	}

	c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
		    !c->enforce_pte_scan_limits);
	validate_memory(p, 0, hpage_pmd_size);

	if (c->enforce_pte_scan_limits) {
		/* Refault everything, then swap out exactly the limit. */
		ops->fault(p, 0, hpage_pmd_size);
		printf("Swapout %d of %d pages...", max_ptes_swap,
		       hpage_pmd_nr);
		if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
			perror("madvise(MADV_PAGEOUT)");
			exit(EXIT_FAILURE);
		}
		if (check_swap(p, max_ptes_swap * page_size)) {
			success("OK");
		} else {
			fail("Fail");
			goto out;
		}

		c->collapse("Collapse with max_ptes_swap pages swapped out", p,
			    1, ops, true);
		validate_memory(p, 0, hpage_pmd_size);
	}
out:
	ops->cleanup_area(p, hpage_pmd_size);
}
783
/*
 * Split a hugepage and discard all but its first base page, leaving a
 * single PTE that still maps part of a compound page; verify it can be
 * re-collapsed. Skipped on tmpfs since MADV_DONTNEED won't evict the
 * page-cache copies.
 */
static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;

	p = alloc_hpage(ops);

	if (is_tmpfs(ops)) {
		/* MADV_DONTNEED won't evict tmpfs pages */
		printf("tmpfs...");
		skip("Skip");
		goto skip;
	}

	/* MADV_NOHUGEPAGE forces the PMD to be split rather than migrated. */
	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
	printf("Split huge page leaving single PTE mapping compound page...");
	madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
	if (ops->check_huge(p, 0))
		success("OK");
	else
		fail("Fail");

	c->collapse("Collapse PTE table with single PTE mapping compound page",
		    p, 1, ops, true);
	validate_memory(p, 0, page_size);
skip:
	ops->cleanup_area(p, hpage_pmd_size);
}
811
/*
 * Split a hugepage without discarding anything, leaving the PTE table
 * fully mapping tail pages of a single compound page, and verify it
 * collapses again.
 */
static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;

	p = alloc_hpage(ops);
	printf("Split huge page leaving single PTE page table full of compound pages...");
	/*
	 * NOTE(review): the one-page MADV_NOHUGEPAGE before the full-range
	 * one appears intended to force a VMA split first — confirm.
	 */
	madvise(p, page_size, MADV_NOHUGEPAGE);
	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
	if (ops->check_huge(p, 0))
		success("OK");
	else
		fail("Fail");

	c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
		    true);
	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}
830
/*
 * Build a PTE table in which every entry maps a page from a *different*
 * compound page, then verify it can still be collapsed. Each iteration
 * allocates a fresh hugepage at BASE_ADDR, splits it, and uses a pair of
 * mremap() calls to append only its first base page to the region being
 * accumulated just below BASE_ADDR.
 */
static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;
	int i;

	p = ops->setup_area(1);
	for (i = 0; i < hpage_pmd_nr; i++) {
		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
		       i + 1, hpage_pmd_nr);

		/* Fresh hugepage at BASE_ADDR for this iteration. */
		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
		ops->fault(BASE_ADDR, 0, hpage_pmd_size);
		if (!ops->check_huge(BASE_ADDR, 1)) {
			printf("Failed to allocate huge page\n");
			exit(EXIT_FAILURE);
		}
		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);

		/*
		 * Shrink-and-move: keep the i accumulated pages plus the first
		 * page of the new (split) hugepage, parked at a scratch address.
		 */
		p = mremap(BASE_ADDR - i * page_size,
			   i * page_size + hpage_pmd_size,
			   (i + 1) * page_size,
			   MREMAP_MAYMOVE | MREMAP_FIXED,
			   BASE_ADDR + 2 * hpage_pmd_size);
		if (p == MAP_FAILED) {
			perror("mremap+unmap");
			exit(EXIT_FAILURE);
		}

		/*
		 * Move back below BASE_ADDR, re-growing the VMA so the next
		 * iteration can fault a new hugepage at BASE_ADDR itself.
		 */
		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
			   (i + 1) * page_size,
			   (i + 1) * page_size + hpage_pmd_size,
			   MREMAP_MAYMOVE | MREMAP_FIXED,
			   BASE_ADDR - (i + 1) * page_size);
		if (p == MAP_FAILED) {
			perror("mremap+alloc");
			exit(EXIT_FAILURE);
		}
	}

	ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
	ops->fault(p, 0, hpage_pmd_size);
	/* Region must NOT be PMD-mapped: it is a patchwork of tail pages. */
	if (!ops->check_huge(p, 1))
		success("OK");
	else
		fail("Fail");

	c->collapse("Collapse PTE table full of different compound pages", p, 1,
		    ops, true);

	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}
883
/*
 * Share one small page with a child over fork(); the child must still be
 * able to collapse, while the parent's mapping must stay small.
 */
static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
{
	int wstatus;
	void *p;

	p = ops->setup_area(1);

	printf("Allocate small page...");
	ops->fault(p, 0, page_size);
	if (ops->check_huge(p, 0))
		success("OK");
	else
		fail("Fail");

	printf("Share small page over fork()...");
	if (!fork()) {
		/* Do not touch settings on child exit */
		skip_settings_restore = true;
		exit_status = 0;

		if (ops->check_huge(p, 0))
			success("OK");
		else
			fail("Fail");

		/* Fault a second page so the child has a private PTE too. */
		ops->fault(p, page_size, 2 * page_size);
		c->collapse("Collapse PTE table with single page shared with parent process",
			    p, 1, ops, true);

		validate_memory(p, 0, page_size);
		ops->cleanup_area(p, hpage_pmd_size);
		exit(exit_status);
	}

	/* Parent: fold the child's failures into our exit status. */
	wait(&wstatus);
	exit_status += WEXITSTATUS(wstatus);

	printf("Check if parent still has small page...");
	if (ops->check_huge(p, 0))
		success("OK");
	else
		fail("Fail");
	validate_memory(p, 0, page_size);
	ops->cleanup_area(p, hpage_pmd_size);
}
929
/*
 * Fork while holding a hugepage; the child splits it and, with
 * max_ptes_shared temporarily relaxed, must be able to re-collapse a PTE
 * table full of compound pages shared with the parent. The parent's
 * hugepage must survive.
 */
static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
{
	int wstatus;
	void *p;

	p = alloc_hpage(ops);
	printf("Share huge page over fork()...");
	if (!fork()) {
		/* Do not touch settings on child exit */
		skip_settings_restore = true;
		exit_status = 0;

		if (ops->check_huge(p, 1))
			success("OK");
		else
			fail("Fail");

		printf("Split huge page PMD in child process...");
		madvise(p, page_size, MADV_NOHUGEPAGE);
		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
		if (ops->check_huge(p, 0))
			success("OK");
		else
			fail("Fail");
		/* CoW one page so at least one PTE is private to the child. */
		ops->fault(p, 0, page_size);

		/* Nearly all PTEs are shared: relax the limit for this test. */
		thp_write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
		c->collapse("Collapse PTE table full of compound pages in child",
			    p, 1, ops, true);
		thp_write_num("khugepaged/max_ptes_shared",
			      thp_current_settings()->khugepaged.max_ptes_shared);

		validate_memory(p, 0, hpage_pmd_size);
		ops->cleanup_area(p, hpage_pmd_size);
		exit(exit_status);
	}

	/* Parent: fold the child's failures into our exit status. */
	wait(&wstatus);
	exit_status += WEXITSTATUS(wstatus);

	printf("Check if parent still has huge page...");
	if (ops->check_huge(p, 1))
		success("OK");
	else
		fail("Fail");
	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}
978
/*
 * Fork a hugepage; in the child, CoW just few enough pages that more than
 * max_ptes_shared PTEs remain shared with the parent — enforcing contexts
 * must refuse to collapse. After CoWing one more page (exactly
 * max_ptes_shared shared), the collapse must succeed.
 */
static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
{
	int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared");
	int wstatus;
	void *p;

	p = alloc_hpage(ops);
	printf("Share huge page over fork()...");
	if (!fork()) {
		/* Do not touch settings on child exit */
		skip_settings_restore = true;
		exit_status = 0;

		if (ops->check_huge(p, 1))
			success("OK");
		else
			fail("Fail");

		/* Leaves max_ptes_shared + 1 PTEs shared: exceeds the limit. */
		printf("Trigger CoW on page %d of %d...",
		       hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
		if (ops->check_huge(p, 0))
			success("OK");
		else
			fail("Fail");

		c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
			    1, ops, !c->enforce_pte_scan_limits);

		if (c->enforce_pte_scan_limits) {
			/* Exactly max_ptes_shared PTEs shared: must succeed. */
			printf("Trigger CoW on page %d of %d...",
			       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
			ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
				   page_size);
			if (ops->check_huge(p, 0))
				success("OK");
			else
				fail("Fail");

			c->collapse("Collapse with max_ptes_shared PTEs shared",
				    p, 1, ops, true);
		}

		validate_memory(p, 0, hpage_pmd_size);
		ops->cleanup_area(p, hpage_pmd_size);
		exit(exit_status);
	}

	/* Parent: fold the child's failures into our exit status. */
	wait(&wstatus);
	exit_status += WEXITSTATUS(wstatus);

	printf("Check if parent still has huge page...");
	if (ops->check_huge(p, 1))
		success("OK");
	else
		fail("Fail");
	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}
1038
madvise_collapse_existing_thps(struct collapse_context * c,struct mem_ops * ops)1039 static void madvise_collapse_existing_thps(struct collapse_context *c,
1040 struct mem_ops *ops)
1041 {
1042 void *p;
1043
1044 p = ops->setup_area(1);
1045 ops->fault(p, 0, hpage_pmd_size);
1046 c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
1047 validate_memory(p, 0, hpage_pmd_size);
1048
1049 /* c->collapse() will find a hugepage and complain - call directly. */
1050 __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
1051 validate_memory(p, 0, hpage_pmd_size);
1052 ops->cleanup_area(p, hpage_pmd_size);
1053 }
1054
1055 /*
1056 * Test race with khugepaged where page tables have been retracted and
1057 * pmd cleared.
1058 */
madvise_retracted_page_tables(struct collapse_context * c,struct mem_ops * ops)1059 static void madvise_retracted_page_tables(struct collapse_context *c,
1060 struct mem_ops *ops)
1061 {
1062 void *p;
1063 int nr_hpages = 1;
1064 unsigned long size = nr_hpages * hpage_pmd_size;
1065
1066 p = ops->setup_area(nr_hpages);
1067 ops->fault(p, 0, size);
1068
1069 /* Let khugepaged collapse and leave pmd cleared */
1070 if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
1071 ops)) {
1072 fail("Timeout");
1073 return;
1074 }
1075 success("OK");
1076 c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
1077 true);
1078 validate_memory(p, 0, size);
1079 ops->cleanup_area(p, size);
1080 }
1081
/* Print command-line help to stderr and exit with failure. */
static void usage(void)
{
	static const char * const help[] = {
		"\nUsage: ./khugepaged [OPTIONS] <test type> [dir]\n\n",
		"\t<test type>\t: <context>:<mem_type>\n",
		"\t<context>\t: [all|khugepaged|madvise]\n",
		"\t<mem_type>\t: [all|anon|file|shmem]\n",
		"\n\t\"file,all\" mem_type requires [dir] argument\n",
		"\n\t\"file,all\" mem_type requires kernel built with\n",
		"\tCONFIG_READ_ONLY_THP_FOR_FS=y\n",
		"\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n",
		"\tmounted with huge=advise option for khugepaged tests to work\n",
		"\n\tSupported Options:\n",
		"\t\t-h: This help message.\n",
		"\t\t-s: mTHP size, expressed as page order.\n",
		"\t\t Defaults to 0. Use this size for anon or shmem allocations.\n",
	};
	size_t i;

	for (i = 0; i < sizeof(help) / sizeof(help[0]); i++)
		fputs(help[i], stderr);
	exit(1);
}
1099
parse_test_type(int argc,char ** argv)1100 static void parse_test_type(int argc, char **argv)
1101 {
1102 int opt;
1103 char *buf;
1104 const char *token;
1105
1106 while ((opt = getopt(argc, argv, "s:h")) != -1) {
1107 switch (opt) {
1108 case 's':
1109 anon_order = atoi(optarg);
1110 break;
1111 case 'h':
1112 default:
1113 usage();
1114 }
1115 }
1116
1117 argv += optind;
1118 argc -= optind;
1119
1120 if (argc == 0) {
1121 /* Backwards compatibility */
1122 khugepaged_context = &__khugepaged_context;
1123 madvise_context = &__madvise_context;
1124 anon_ops = &__anon_ops;
1125 return;
1126 }
1127
1128 buf = strdup(argv[0]);
1129 token = strsep(&buf, ":");
1130
1131 if (!strcmp(token, "all")) {
1132 khugepaged_context = &__khugepaged_context;
1133 madvise_context = &__madvise_context;
1134 } else if (!strcmp(token, "khugepaged")) {
1135 khugepaged_context = &__khugepaged_context;
1136 } else if (!strcmp(token, "madvise")) {
1137 madvise_context = &__madvise_context;
1138 } else {
1139 usage();
1140 }
1141
1142 if (!buf)
1143 usage();
1144
1145 if (!strcmp(buf, "all")) {
1146 file_ops = &__file_ops;
1147 anon_ops = &__anon_ops;
1148 shmem_ops = &__shmem_ops;
1149 } else if (!strcmp(buf, "anon")) {
1150 anon_ops = &__anon_ops;
1151 } else if (!strcmp(buf, "file")) {
1152 file_ops = &__file_ops;
1153 } else if (!strcmp(buf, "shmem")) {
1154 shmem_ops = &__shmem_ops;
1155 } else {
1156 usage();
1157 }
1158
1159 if (!file_ops)
1160 return;
1161
1162 if (argc != 2)
1163 usage();
1164
1165 get_finfo(argv[1]);
1166 }
1167
main(int argc,char ** argv)1168 int main(int argc, char **argv)
1169 {
1170 int hpage_pmd_order;
1171 struct thp_settings default_settings = {
1172 .thp_enabled = THP_MADVISE,
1173 .thp_defrag = THP_DEFRAG_ALWAYS,
1174 .shmem_enabled = SHMEM_ADVISE,
1175 .use_zero_page = 0,
1176 .khugepaged = {
1177 .defrag = 1,
1178 .alloc_sleep_millisecs = 10,
1179 .scan_sleep_millisecs = 10,
1180 },
1181 /*
1182 * When testing file-backed memory, the collapse path
1183 * looks at how many pages are found in the page cache, not
1184 * what pages are mapped. Disable read ahead optimization so
1185 * pages don't find their way into the page cache unless
1186 * we mem_ops->fault() them in.
1187 */
1188 .read_ahead_kb = 0,
1189 };
1190
1191 if (!thp_is_enabled()) {
1192 printf("Transparent Hugepages not available\n");
1193 return KSFT_SKIP;
1194 }
1195
1196 parse_test_type(argc, argv);
1197
1198 setbuf(stdout, NULL);
1199
1200 page_size = getpagesize();
1201 hpage_pmd_size = read_pmd_pagesize();
1202 if (!hpage_pmd_size) {
1203 printf("Reading PMD pagesize failed");
1204 exit(EXIT_FAILURE);
1205 }
1206 hpage_pmd_nr = hpage_pmd_size / page_size;
1207 hpage_pmd_order = __builtin_ctz(hpage_pmd_nr);
1208
1209 default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
1210 default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
1211 default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
1212 default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
1213 default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT;
1214 default_settings.hugepages[anon_order].enabled = THP_ALWAYS;
1215 default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT;
1216 default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS;
1217
1218 save_settings();
1219 thp_push_settings(&default_settings);
1220
1221 alloc_at_fault();
1222
1223 #define TEST(t, c, o) do { \
1224 if (c && o) { \
1225 printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
1226 t(c, o); \
1227 } \
1228 } while (0)
1229
1230 TEST(collapse_full, khugepaged_context, anon_ops);
1231 TEST(collapse_full, khugepaged_context, file_ops);
1232 TEST(collapse_full, khugepaged_context, shmem_ops);
1233 TEST(collapse_full, madvise_context, anon_ops);
1234 TEST(collapse_full, madvise_context, file_ops);
1235 TEST(collapse_full, madvise_context, shmem_ops);
1236
1237 TEST(collapse_empty, khugepaged_context, anon_ops);
1238 TEST(collapse_empty, madvise_context, anon_ops);
1239
1240 TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
1241 TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
1242 TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
1243 TEST(collapse_single_pte_entry, madvise_context, anon_ops);
1244 TEST(collapse_single_pte_entry, madvise_context, file_ops);
1245 TEST(collapse_single_pte_entry, madvise_context, shmem_ops);
1246
1247 TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
1248 TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
1249 TEST(collapse_max_ptes_none, madvise_context, anon_ops);
1250 TEST(collapse_max_ptes_none, madvise_context, file_ops);
1251
1252 TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
1253 TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
1254 TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
1255 TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);
1256
1257 TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
1258 TEST(collapse_full_of_compound, khugepaged_context, file_ops);
1259 TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
1260 TEST(collapse_full_of_compound, madvise_context, anon_ops);
1261 TEST(collapse_full_of_compound, madvise_context, file_ops);
1262 TEST(collapse_full_of_compound, madvise_context, shmem_ops);
1263
1264 TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
1265 TEST(collapse_compound_extreme, madvise_context, anon_ops);
1266
1267 TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
1268 TEST(collapse_swapin_single_pte, madvise_context, anon_ops);
1269
1270 TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
1271 TEST(collapse_max_ptes_swap, madvise_context, anon_ops);
1272
1273 TEST(collapse_fork, khugepaged_context, anon_ops);
1274 TEST(collapse_fork, madvise_context, anon_ops);
1275
1276 TEST(collapse_fork_compound, khugepaged_context, anon_ops);
1277 TEST(collapse_fork_compound, madvise_context, anon_ops);
1278
1279 TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
1280 TEST(collapse_max_ptes_shared, madvise_context, anon_ops);
1281
1282 TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
1283 TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
1284 TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);
1285
1286 TEST(madvise_retracted_page_tables, madvise_context, file_ops);
1287 TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
1288
1289 restore_settings(0);
1290 }
1291