// SPDX-License-Identifier: GPL-2.0-only
/*
 * COW (Copy On Write) tests.
 *
 * Copyright 2022, Red Hat, Inc.
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */
#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/memfd.h>

#include "local_config.h"
#ifdef LOCAL_CONFIG_HAVE_LIBURING
#include <liburing.h>
#endif /* LOCAL_CONFIG_HAVE_LIBURING */

#include "../../../../mm/gup_test.h"
#include "../kselftest.h"
#include "vm_util.h"
#include "thp_settings.h"

static size_t pagesize;
static int pagemap_fd;
static size_t pmdsize;
static int nr_thpsizes;
static size_t thpsizes[20];
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;

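/*
 * Convert a size to its page order relative to the base page size; e.g.,
 * with a 4 KiB base page, sz2ord(2 MiB) == 9 because 2 MiB / 4 KiB = 512
 * = 2^9. Assumes size is a power-of-two multiple of pagesize.
 */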
static int sz2ord(size_t size)
{
	return __builtin_ctzll(size / pagesize);
}

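/*
 * Build the list of THP sizes to test from the PMD size plus all orders
 * the kernel reports as supported, printing each detected size.
 */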
static int detect_thp_sizes(size_t sizes[], int max)
{
	int count = 0;
	unsigned long orders;
	size_t kb;
	int i;

	/* thp not supported at all. */
	if (!pmdsize)
		return 0;

	orders = 1UL << sz2ord(pmdsize);
	orders |= thp_supported_orders();

	for (i = 0; orders && count < max; i++) {
		if (!(orders & (1UL << i)))
			continue;
		orders &= ~(1UL << i);
		kb = (pagesize >> 10) << i;
		sizes[count++] = kb * 1024;
		ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
	}

	return count;
}

static bool range_is_swapped(void *addr, size_t size)
{
	for (; size; addr += pagesize, size -= pagesize)
		if (!pagemap_is_swapped(pagemap_fd, addr))
			return false;
	return true;
}

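/*
 * Plain pipe pair used as a parent/child handshake: each side writes a
 * single byte to signal readiness and blocks reading until the other side
 * has signaled.
 */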
struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};

static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	if (pipe(comm_pipes->child_ready) < 0) {
		ksft_perror("pipe() failed");
		return -errno;
	}
	if (pipe(comm_pipes->parent_ready) < 0) {
		ksft_perror("pipe() failed");
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -errno;
	}

	return 0;
}

static void close_comm_pipes(struct comm_pipes *comm_pipes)
{
	close(comm_pipes->child_ready[0]);
	close(comm_pipes->child_ready[1]);
	close(comm_pipes->parent_ready[0]);
	close(comm_pipes->parent_ready[1]);
}

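/*
 * Child: snapshot the memory, signal the parent, wait for the parent to
 * modify the pages, then report whether we still see the old (pre-COW)
 * content. A non-zero exit status means the modification leaked to us.
 */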
static int child_memcmp_fn(char *mem, size_t size,
			   struct comm_pipes *comm_pipes)
{
	char *old = malloc(size);
	char buf;

	/* Back up the original content. */
	memcpy(old, mem, size);

	/* Wait until the parent modified the page. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values. */
	return memcmp(old, mem, size);
}

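/*
 * Child: take a R/O GUP pin on the pages via vmsplice(), unmap them, let
 * the parent modify them, and then check whether the pipe still yields the
 * old content. If COW was broken incorrectly, the parent's writes become
 * visible through the stale pin.
 */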
static int child_vmsplice_memcmp_fn(char *mem, size_t size,
				    struct comm_pipes *comm_pipes)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	char *old, *new;
	int fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	/* Back up the original content. */
	memcpy(old, mem, size);

	if (pipe(fds) < 0)
		return -errno;

	/* Trigger a read-only pin. */
	transferred = vmsplice(fds[1], &iov, 1, 0);
	if (transferred < 0)
		return -errno;
	if (transferred == 0)
		return -EINVAL;

	/* Unmap it from our page tables. */
	if (munmap(mem, size) < 0)
		return -errno;

	/* Wait until the parent modified it. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values via the pipe. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0)
			return -errno;
	}

	return memcmp(old, new, transferred);
}

typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);

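/*
 * Fork a child running @fn, modify the pages in the parent, and let the
 * child verify that it cannot observe the parent's modifications.
 * @do_mprotect additionally cycles PROT_READ -> PROT_READ|PROT_WRITE first
 * to exercise mprotect() write-fault avoidance; @xfail flags known-broken
 * hugetlb vmsplice() cases as expected failures.
 */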
static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
				  child_fn fn, bool xfail)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_perror("fork() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	} else if (!ret) {
		exit(fn(mem, size, &comm_pipes));
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}

		ret = mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	if (!ret) {
		log_test_result(KSFT_PASS);
	} else if (xfail) {
		/*
		 * With hugetlb, some vmsplice() tests are currently expected
		 * to fail because (a) they are harder to fix and (b) nobody
		 * really cares. Flag them as expected failures for now.
		 */
		ksft_print_msg("Leak from parent into child\n");
		log_test_result(KSFT_XFAIL);
	} else {
		ksft_print_msg("Leak from parent into child\n");
		log_test_result(KSFT_FAIL);
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false);
}

static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false);
}

static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn,
			      is_hugetlb);
}

static void test_vmsplice_in_child_mprotect(char *mem, size_t size,
					    bool is_hugetlb)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn,
			      is_hugetlb);
}

static void do_test_vmsplice_in_parent(char *mem, size_t size,
				       bool before_fork, bool xfail)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred = 0;
	struct comm_pipes comm_pipes;
	char *old, *new;
	int ret, fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	memcpy(old, mem, size);

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		goto free;
	}

	if (pipe(fds) < 0) {
		ksft_perror("pipe() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}

	if (before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_perror("vmsplice() failed");
			log_test_result(KSFT_FAIL);
			goto close_pipe;
		}
	}

	ret = fork();
	if (ret < 0) {
		ksft_perror("fork() failed");
		log_test_result(KSFT_FAIL);
		goto close_pipe;
	} else if (!ret) {
		write(comm_pipes.child_ready[1], "0", 1);
		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
			;
		/* Modify the page content in the child. */
		memset(mem, 0xff, size);
		exit(0);
	}

	if (!before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_perror("vmsplice() failed");
			log_test_result(KSFT_FAIL);
			wait(&ret);
			goto close_pipe;
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;
	if (munmap(mem, size) < 0) {
		ksft_perror("munmap() failed");
		log_test_result(KSFT_FAIL);
		goto close_pipe;
	}
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Wait until the child is done writing. */
	wait(&ret);
	if (!WIFEXITED(ret)) {
		ksft_perror("wait() failed");
		log_test_result(KSFT_FAIL);
		goto close_pipe;
	}

	/* See if we still read the old values. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0) {
			ksft_perror("read() failed");
			log_test_result(KSFT_FAIL);
			goto close_pipe;
		}
	}

	if (!memcmp(old, new, transferred)) {
		log_test_result(KSFT_PASS);
	} else if (xfail) {
		/*
		 * With hugetlb, some vmsplice() tests are currently expected
		 * to fail because (a) they are harder to fix and (b) nobody
		 * really cares. Flag them as expected failures for now.
		 */
		ksft_print_msg("Leak from child into parent\n");
		log_test_result(KSFT_XFAIL);
	} else {
		ksft_print_msg("Leak from child into parent\n");
		log_test_result(KSFT_FAIL);
	}
close_pipe:
	close(fds[0]);
	close(fds[1]);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free:
	free(old);
	free(new);
}

static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_vmsplice_in_parent(mem, size, true, is_hugetlb);
}

static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_vmsplice_in_parent(mem, size, false, is_hugetlb);
}

#ifdef LOCAL_CONFIG_HAVE_LIBURING
static void do_test_iouring(char *mem, size_t size, bool use_fork)
{
	struct comm_pipes comm_pipes;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	ssize_t cur, total;
	struct iovec iov;
	char *tmp;
	int ret, fd;
	FILE *file;
	char buf;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		return;
	}

	file = tmpfile();
	if (!file) {
		ksft_perror("tmpfile() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}
	fd = fileno(file);
	assert(fd >= 0);

	tmp = malloc(size);
	if (!tmp) {
		ksft_print_msg("malloc() failed\n");
		log_test_result(KSFT_FAIL);
		goto close_file;
	}

	/* Skip on errors, as we might just lack kernel support. */
	ret = io_uring_queue_init(1, &ring, 0);
	if (ret < 0) {
		ksft_print_msg("io_uring_queue_init() failed\n");
		log_test_result(KSFT_SKIP);
		goto free_tmp;
	}

	/*
	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
	 * | FOLL_LONGTERM the range.
	 *
	 * Skip on errors, as we might just lack kernel support or might not
	 * have sufficient MEMLOCK permissions.
	 */
	iov.iov_base = mem;
	iov.iov_len = size;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret) {
		ksft_print_msg("io_uring_register_buffers() failed\n");
		log_test_result(KSFT_SKIP);
		goto queue_exit;
	}

	if (use_fork) {
		/*
		 * fork() and keep the child alive until we're done. Note that
		 * we expect the pinned page to not get shared with the child.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_perror("fork() failed");
			log_test_result(KSFT_FAIL);
			goto unregister_buffers;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;
	} else {
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto unregister_buffers;
		}

		clear_softdirty();
		ret = mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto unregister_buffers;
		}
	}

	/*
	 * Modify the page and write the page content as observed by the fixed
	 * buffer pin to the file so we can verify it.
	 */
	memset(mem, 0xff, size);
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		ksft_print_msg("io_uring_get_sqe() failed\n");
		log_test_result(KSFT_FAIL);
		goto quit_child;
	}
	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		ksft_print_msg("io_uring_submit() failed\n");
		log_test_result(KSFT_FAIL);
		goto quit_child;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		ksft_print_msg("io_uring_wait_cqe() failed\n");
		log_test_result(KSFT_FAIL);
		goto quit_child;
	}

	if (cqe->res != size) {
		ksft_print_msg("write_fixed failed\n");
		log_test_result(KSFT_FAIL);
		goto quit_child;
	}
	io_uring_cqe_seen(&ring, cqe);

	/* Read back the file content to the temporary buffer. */
	total = 0;
	while (total < size) {
		cur = pread(fd, tmp + total, size - total, total);
		if (cur < 0) {
			ksft_perror("pread() failed");
			log_test_result(KSFT_FAIL);
			goto quit_child;
		}
		total += cur;
	}

	/* Finally, check if we read what we expected. */
	if (!memcmp(mem, tmp, size)) {
		log_test_result(KSFT_PASS);
	} else {
		ksft_print_msg("Longterm R/W pin is not reliable\n");
		log_test_result(KSFT_FAIL);
	}

quit_child:
	if (use_fork) {
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
	}
unregister_buffers:
	io_uring_unregister_buffers(&ring);
queue_exit:
	io_uring_queue_exit(&ring);
free_tmp:
	free(tmp);
close_file:
	fclose(file);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb)
{
	do_test_iouring(mem, size, false);
}

static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb)
{
	do_test_iouring(mem, size, true);
}

#endif /* LOCAL_CONFIG_HAVE_LIBURING */

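/*
 * Variants for R/O pin tests: pin an ordinary page, a page currently
 * COW-shared with a child, a page that was COW-shared but is exclusive
 * again, or an exclusive page forced to be mapped R/O.
 */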
enum ro_pin_test {
	RO_PIN_TEST,
	RO_PIN_TEST_SHARED,
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	RO_PIN_TEST_RO_EXCLUSIVE,
};

static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_print_msg("gup_test not available\n");
		log_test_result(KSFT_SKIP);
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_print_msg("malloc() failed\n");
		log_test_result(KSFT_FAIL);
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_perror("fork() failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it has
			 * quit. The pages should now be mapped R/O into our
			 * page tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		if (errno == EINVAL)
			ret = KSFT_SKIP;
		else
			ret = KSFT_FAIL;
		ksft_perror("PIN_LONGTERM_TEST_START failed");
		log_test_result(ret);
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret) {
		ksft_perror("PIN_LONGTERM_TEST_READ failed");
		log_test_result(KSFT_FAIL);
	} else {
		if (!memcmp(mem, tmp, size)) {
			log_test_result(KSFT_PASS);
		} else {
			ksft_print_msg("Longterm R/O pin is not reliable\n");
			log_test_result(KSFT_FAIL);
		}
	}

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_perror("PIN_LONGTERM_TEST_STOP failed");
wait:
	switch (test) {
	case RO_PIN_TEST_SHARED:
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_perror("wait() failed");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}

static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}

static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}

static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size,
						bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}

static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size,
						     bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}

static void test_ro_pin_on_ro_exclusive(char *mem, size_t size,
					bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}

static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size,
					     bool is_hugetlb)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}

typedef void (*test_fn)(char *mem, size_t size, bool hugetlb);

static void do_run_with_base_page(test_fn fn, bool swapout)
{
	char *mem;
	int ret;

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
	/* Ignore if MADV_NOHUGEPAGE is not around on this kernel. */
	if (ret && errno != EINVAL) {
		ksft_perror("MADV_NOHUGEPAGE failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Populate a base page. */
	memset(mem, 1, pagesize);

	if (swapout) {
		madvise(mem, pagesize, MADV_PAGEOUT);
		if (!pagemap_is_swapped(pagemap_fd, mem)) {
			ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
			log_test_result(KSFT_SKIP);
			goto munmap;
		}
	}

	fn(mem, pagesize, false);
munmap:
	munmap(mem, pagesize);
}

static void run_with_base_page(test_fn fn, const char *desc)
{
	log_test_start("%s ... with base page", desc);
	do_run_with_base_page(fn, false);
}

static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	log_test_start("%s ... with swapped out base page", desc);
	do_run_with_base_page(fn, true);
}

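/*
 * THP test variants: keep the PMD mapping, remap it via PTEs, reduce it to
 * a single PTE, swap it out, partially mremap() it, or COW-share only a
 * part of it with a child.
 */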
enum thp_run {
	THP_RUN_PMD,
	THP_RUN_PMD_SWAPOUT,
	THP_RUN_PTE,
	THP_RUN_PTE_SWAPOUT,
	THP_RUN_SINGLE_PTE,
	THP_RUN_SINGLE_PTE_SWAPOUT,
	THP_RUN_PARTIAL_MREMAP,
	THP_RUN_PARTIAL_SHARED,
};

static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size, mremap_size;
	int ret;

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

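	/*
	 * We need a THP-aligned memory area: round the start up to the next
	 * thpsize boundary. Mapping 2 * thpsize guarantees the aligned area
	 * fits; e.g., with thpsize = 2 MiB, mmap_mem = 0x7f1234501000 yields
	 * mem = 0x7f1234600000.
	 */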
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_perror("MADV_HUGEPAGE failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if
	 * we get the last sub-page populated automatically.
	 */
	mem[0] = 1;
	if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
		ksft_print_msg("Did not get a THP populated\n");
		log_test_result(KSFT_SKIP);
		goto munmap;
	}
	memset(mem, 1, thpsize);

	size = thpsize;
	switch (thp_run) {
	case THP_RUN_PMD:
	case THP_RUN_PMD_SWAPOUT:
		assert(thpsize == pmdsize);
		break;
	case THP_RUN_PTE:
	case THP_RUN_PTE_SWAPOUT:
		/*
		 * Trigger PTE-mapping the THP by temporarily mapping a single
		 * subpage R/O. This is a noop if the THP is not pmdsize (and
		 * therefore already PTE-mapped).
		 */
		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_perror("mprotect() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		break;
	case THP_RUN_SINGLE_PTE:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		/*
		 * Discard all but a single subpage of that PTE-mapped THP. What
		 * remains is a single PTE mapping a single subpage.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
		if (ret) {
			ksft_perror("MADV_DONTNEED failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		size = pagesize;
		break;
	case THP_RUN_PARTIAL_MREMAP:
		/*
		 * Remap half of the THP. We need some new memory location
		 * for that.
		 */
		mremap_size = thpsize / 2;
		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mremap_mem == MAP_FAILED) {
			ksft_perror("mmap() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
		if (tmp != mremap_mem) {
			ksft_perror("mremap() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		size = mremap_size;
		break;
	case THP_RUN_PARTIAL_SHARED:
		/*
		 * Share the first page of the THP with a child and quit the
		 * child. This will result in some parts of the THP never
		 * having been shared.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
		if (ret) {
			ksft_perror("MADV_DONTFORK failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		ret = fork();
		if (ret < 0) {
			ksft_perror("fork() failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		} else if (!ret) {
			exit(0);
		}
		wait(&ret);
		/* Allow for sharing all pages again. */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
		if (ret) {
			ksft_perror("MADV_DOFORK failed");
			log_test_result(KSFT_FAIL);
			goto munmap;
		}
		break;
	default:
		assert(false);
	}

	switch (thp_run) {
	case THP_RUN_PMD_SWAPOUT:
	case THP_RUN_PTE_SWAPOUT:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		madvise(mem, size, MADV_PAGEOUT);
		if (!range_is_swapped(mem, size)) {
			ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
			log_test_result(KSFT_SKIP);
			goto munmap;
		}
		break;
	default:
		break;
	}

	fn(mem, size, false);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mremap_mem != MAP_FAILED)
		munmap(mremap_mem, mremap_size);
}

static void run_with_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD, size);
}

static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with swapped-out THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
}

static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with PTE-mapped THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE, size);
}

static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with swapped-out, PTE-mapped THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
}

static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with single PTE of THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
}

static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with single PTE of swapped-out THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
}

static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with partially mremap()'ed THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
}

static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
{
	log_test_start("%s ... with partially shared THP (%zu kB)",
		       desc, size / 1024);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
}

static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	log_test_start("%s ... with hugetlb (%zu kB)", desc,
		       hugetlbsize / 1024);

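	/*
	 * Encode the huge page size in the mmap() flags: MAP_HUGE_SHIFT
	 * expects log2 of the size; e.g., 21 (== ctz(2 MiB)) selects 2 MiB
	 * huge pages.
	 */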
	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		return;
	}

	/* Populate a huge page. */
	memset(mem, 1, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise we might get zapped by a SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize, true);
munmap:
	munmap(mem, hugetlbsize);
}

struct test_case {
	const char *desc;
	test_fn fn;
};

/*
 * Test cases that are specific to anonymous pages: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we fail to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we fail to break COW, the child observes modifications by the
	 * parent. This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child,
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we fail to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we fail to break COW, the parent observes modifications by
	 * the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},
#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};

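/*
 * Run one test case across all variants: base page (plain and swapped out),
 * each detected THP size (temporarily configured as the only enabled THP
 * size), and each hugetlb size. PMD-level runs only apply to pmdsize.
 */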
static void run_anon_test_case(struct test_case const *test_case)
{
	int i;

	run_with_base_page(test_case->fn, test_case->desc);
	run_with_base_page_swap(test_case->fn, test_case->desc);
	for (i = 0; i < nr_thpsizes; i++) {
		size_t size = thpsizes[i];
		struct thp_settings settings = *thp_current_settings();

		settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER;
		settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
		thp_push_settings(&settings);

		if (size == pmdsize) {
			run_with_thp(test_case->fn, test_case->desc, size);
			run_with_thp_swap(test_case->fn, test_case->desc, size);
		}

		run_with_pte_mapped_thp(test_case->fn, test_case->desc, size);
		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp(test_case->fn, test_case->desc, size);
		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size);
		run_with_partial_mremap_thp(test_case->fn, test_case->desc, size);
		run_with_partial_shared_thp(test_case->fn, test_case->desc, size);

		thp_pop_settings();
	}
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_hugetlb(test_case->fn, test_case->desc,
				 hugetlbsizes[i]);
}

static void run_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
		run_anon_test_case(&anon_test_cases[i]);
}

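/*
 * Number of tests run per case: 2 base-page runs, one per hugetlb size,
 * 6 THP variants per THP size, plus 2 PMD-only variants if THP is
 * supported.
 */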
static int tests_per_anon_test_case(void)
{
	int tests = 2 + nr_hugetlbsizes;

	tests += 6 * nr_thpsizes;
	if (pmdsize)
		tests += 2;
	return tests;
}

enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,
	ANON_THP_COLLAPSE_FULLY_SHARED,
	ANON_THP_COLLAPSE_LOWER_SHARED,
	ANON_THP_COLLAPSE_UPPER_SHARED,
};

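/*
 * PTE-map a THP, optionally COW-share (parts of) it with a child, collapse
 * it back into a PMD-mapped THP via MADV_COLLAPSE (before or after fork(),
 * depending on the variant), and verify that writes in the parent do not
 * leak into the child.
 */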
static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		log_test_result(KSFT_FAIL);
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_perror("mprotect() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_perror("mprotect() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	}

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_perror("MADV_COLLAPSE failed");
			log_test_result(KSFT_SKIP);
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_perror("MADV_DONTFORK failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_perror("MADV_DONTFORK failed");
			log_test_result(KSFT_FAIL);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_perror("fork() failed");
		log_test_result(KSFT_FAIL);
		goto close_comm_pipes;
	} else if (!ret) {
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_perror("MADV_DOFORK failed");
			log_test_result(KSFT_FAIL);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_perror("MADV_COLLAPSE failed");
			log_test_result(KSFT_SKIP);
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	if (!ret) {
		log_test_result(KSFT_PASS);
	} else {
		ksft_print_msg("Leak from parent into child\n");
		log_test_result(KSFT_FAIL);
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_anon_thp_collapse_unshared(char *mem, size_t size,
					    bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}

static void test_anon_thp_collapse_fully_shared(char *mem, size_t size,
						bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}

static void test_anon_thp_collapse_lower_shared(char *mem, size_t size,
						bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}

static void test_anon_thp_collapse_upper_shared(char *mem, size_t size,
						bool is_hugetlb)
{
	assert(!is_hugetlb);
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}

/*
 * Test cases that are specific to anonymous THP: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_thp_test_cases[] = {
	/*
	 * Basic COW test for fork() without any GUP when collapsing a THP
	 * before fork().
	 *
	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
	 * collapse") might easily get COW handling wrong when not collapsing
	 * exclusivity information properly.
	 */
	{
		"Basic COW after fork() when collapsing before fork()",
		test_anon_thp_collapse_unshared,
	},
	/* Basic COW test, but collapse after COW-sharing a full THP. */
	{
		"Basic COW after fork() when collapsing after fork() (fully shared)",
		test_anon_thp_collapse_fully_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the lower half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (lower shared)",
		test_anon_thp_collapse_lower_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the upper half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (upper shared)",
		test_anon_thp_collapse_upper_shared,
	},
};

static void run_anon_thp_test_cases(void)
{
	int i;

	if (!pmdsize)
		return;

	ksft_print_msg("[INFO] Anonymous THP tests\n");

	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
		struct test_case const *test_case = &anon_thp_test_cases[i];

		log_test_start("%s", test_case->desc);
		do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
	}
}

static int tests_per_anon_thp_test_case(void)
{
	return pmdsize ? 1 : 0;
}

typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);

static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Back up the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	if (!memcmp(smem, old, size)) {
		log_test_result(KSFT_PASS);
	} else {
		ksft_print_msg("Other mapping modified\n");
		log_test_result(KSFT_FAIL);
	}
	free(old);
}

static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}

static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}

static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem;

	log_test_start("%s ... with shared zeropage", desc);

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Read from the pages to populate the shared zeropage. */
	FORCE_READ(mem);
	FORCE_READ(smem);

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
}

static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, *mmap_mem, *mmap_smem;
	size_t mmap_size;
	int ret;

	log_test_start("%s ... with huge zeropage", desc);

	if (!has_huge_zeropage) {
		ksft_print_msg("Huge zeropage not enabled\n");
		log_test_result(KSFT_SKIP);
		return;
	}

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * pmdsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}
	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
	smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));

	ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_perror("madvise()");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}
	ret = madvise(smem, pmdsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_perror("madvise()");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/*
	 * Read from the memory to populate the huge shared zeropage. Read from
	 * the first sub-page and test if we get another sub-page populated
	 * automatically.
	 */
	FORCE_READ(mem);
	FORCE_READ(smem);
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
		ksft_print_msg("Did not get THPs populated\n");
		log_test_result(KSFT_SKIP);
		goto munmap;
	}

	fn(mem, smem, pmdsize);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mmap_smem != MAP_FAILED)
		munmap(mmap_smem, mmap_size);
}

static void run_with_memfd(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem;
	int fd;

	log_test_start("%s ... with memfd", desc);

	fd = memfd_create("test", 0);
	if (fd < 0) {
		ksft_perror("memfd_create() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_perror("fallocate() failed");
		log_test_result(KSFT_FAIL);
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Fault the page in. */
	FORCE_READ(mem);
	FORCE_READ(smem);

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	close(fd);
}

static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem;
	FILE *file;
	int fd;

	log_test_start("%s ... with tmpfile", desc);

	file = tmpfile();
	if (!file) {
		ksft_perror("tmpfile() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	fd = fileno(file);
	if (fd < 0) {
		ksft_perror("fileno() failed");
		log_test_result(KSFT_SKIP);
		goto close;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_perror("fallocate() failed");
		log_test_result(KSFT_FAIL);
		goto close;
	}

	/* Create a private mapping of the tmpfile. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Fault the page in. */
	FORCE_READ(mem);
	FORCE_READ(smem);

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	fclose(file);
}

static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
				   size_t hugetlbsize)
{
	int flags = MFD_HUGETLB;
	char *mem, *smem;
	int fd;

	log_test_start("%s ... with memfd hugetlb (%zu kB)", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;

	fd = memfd_create("test", flags);
	if (fd < 0) {
		ksft_perror("memfd_create() failed");
		log_test_result(KSFT_SKIP);
		return;
	}

	/* File consists of a single huge page filled with zeroes. */
	if (fallocate(fd, 0, 0, hugetlbsize)) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
		   0);
	if (mem == MAP_FAILED) {
		ksft_perror("need more free huge pages");
		log_test_result(KSFT_SKIP);
		goto close;
	}
	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		goto munmap;
	}

	/* Fault the page in. */
	FORCE_READ(mem);
	FORCE_READ(smem);

	fn(mem, smem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
	if (smem != MAP_FAILED)
		munmap(smem, hugetlbsize);
close:
	close(fd);
}

struct non_anon_test_case {
	const char *desc;
	non_anon_test_fn fn;
};

/*
 * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
 * the shared zeropage(s), pagecache pages, ...
 */
static const struct non_anon_test_case non_anon_test_cases[] = {
	/*
	 * Basic COW test without any GUP. If we fail to break COW, changes are
	 * visible via other private/shared mappings.
	 */
	{
		"Basic COW",
		test_cow,
	},
	/*
	 * Take a R/O longterm pin. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O longterm GUP pin",
		test_ro_pin,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O longterm GUP-fast pin",
		test_ro_fast_pin,
	},
};

static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
{
	int i;

	run_with_zeropage(test_case->fn, test_case->desc);
	run_with_memfd(test_case->fn, test_case->desc);
	run_with_tmpfile(test_case->fn, test_case->desc);
	if (pmdsize)
		run_with_huge_zeropage(test_case->fn, test_case->desc);
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
				       hugetlbsizes[i]);
}

static void run_non_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Non-anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
		run_non_anon_test_case(&non_anon_test_cases[i]);
}

static int tests_per_non_anon_test_case(void)
{
	int tests = 3 + nr_hugetlbsizes;

	if (pmdsize)
		tests += 1;
	return tests;
}

int main(int argc, char **argv)
{
	struct thp_settings default_settings;

	ksft_print_header();

	pagesize = getpagesize();
	pmdsize = read_pmd_pagesize();
	if (pmdsize) {
		/* Only if THP is supported. */
		thp_read_settings(&default_settings);
		default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT;
		thp_save_settings();
		thp_push_settings(&default_settings);

		ksft_print_msg("[INFO] detected PMD size: %zu KiB\n",
			       pmdsize / 1024);
		nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
	}
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	has_huge_zeropage = detect_huge_zeropage();

	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	if (pmdsize) {
		/* Only if THP is supported. */
		thp_restore_settings();
	}

	ksft_finished();
}