Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Stress userfaultfd syscall.
4 *
5 * Copyright (C) 2015 Red Hat, Inc.
6 *
7 * This test allocates two virtual areas and bounces the physical
8 * memory across the two virtual areas (from area_src to area_dst)
9 * using userfaultfd.
10 *
11 * There are three threads running per CPU:
12 *
13 * 1) one per-CPU thread takes a per-page pthread_mutex in a random
14 * page of the area_dst (while the physical page may still be in
15 * area_src), and increments a per-page counter in the same page,
16 * and checks its value against a verification region.
17 *
18 * 2) another per-CPU thread handles the userfaults generated by
19 * thread 1 above. userfaultfd blocking reads or poll() modes are
20 * exercised interleaved.
21 *
22 * 3) one last per-CPU thread transfers the memory in the background
23 * at maximum bandwidth (if not already transferred by thread
24 * 2). Each cpu thread takes cares of transferring a portion of the
25 * area.
26 *
27 * When all threads of type 3 completed the transfer, one bounce is
28 * complete. area_src and area_dst are then swapped. All threads are
29 * respawned and so the bounce is immediately restarted in the
30 * opposite direction.
31 *
32 * per-CPU threads 1 by triggering userfaults inside
33 * pthread_mutex_lock will also verify the atomicity of the memory
34 * transfer (UFFDIO_COPY).
35 */
36
37#define _GNU_SOURCE
38#include <stdio.h>
39#include <errno.h>
40#include <unistd.h>
41#include <stdlib.h>
42#include <sys/types.h>
43#include <sys/stat.h>
44#include <fcntl.h>
45#include <time.h>
46#include <signal.h>
47#include <poll.h>
48#include <string.h>
49#include <sys/mman.h>
50#include <sys/syscall.h>
51#include <sys/ioctl.h>
52#include <sys/wait.h>
53#include <pthread.h>
54#include <linux/userfaultfd.h>
55#include <setjmp.h>
56#include <stdbool.h>
57#include <assert.h>
58#include <inttypes.h>
59#include <stdint.h>
60#include <sys/random.h>
61
62#include "../kselftest.h"
63
64#ifdef __NR_userfaultfd
65
66static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
67
68#define BOUNCE_RANDOM (1<<0)
69#define BOUNCE_RACINGFAULTS (1<<1)
70#define BOUNCE_VERIFY (1<<2)
71#define BOUNCE_POLL (1<<3)
72static int bounces;
73
74#define TEST_ANON 1
75#define TEST_HUGETLB 2
76#define TEST_SHMEM 3
77static int test_type;
78
79/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
80#define ALARM_INTERVAL_SECS 10
81static volatile bool test_uffdio_copy_eexist = true;
82static volatile bool test_uffdio_zeropage_eexist = true;
83/* Whether to test uffd write-protection */
84static bool test_uffdio_wp = false;
85/* Whether to test uffd minor faults */
86static bool test_uffdio_minor = false;
87
88static bool map_shared;
89static int shm_fd;
90static int huge_fd;
91static char *huge_fd_off0;
92static unsigned long long *count_verify;
93static int uffd = -1;
94static int uffd_flags, finished, *pipefd;
95static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
96static char *zeropage;
97pthread_attr_t attr;
98
99/* Userfaultfd test statistics */
100struct uffd_stats {
101 int cpu;
102 unsigned long missing_faults;
103 unsigned long wp_faults;
104 unsigned long minor_faults;
105};
106
107/* pthread_mutex_t starts at page offset 0 */
108#define area_mutex(___area, ___nr) \
109 ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
110/*
111 * count is placed in the page after pthread_mutex_t naturally aligned
112 * to avoid non alignment faults on non-x86 archs.
113 */
114#define area_count(___area, ___nr) \
115 ((volatile unsigned long long *) ((unsigned long) \
116 ((___area) + (___nr)*page_size + \
117 sizeof(pthread_mutex_t) + \
118 sizeof(unsigned long long) - 1) & \
119 ~(unsigned long)(sizeof(unsigned long long) \
120 - 1)))
121
122const char *examples =
123 "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
124 "./userfaultfd anon 100 99999\n\n"
125 "# Run share memory test on 1GiB region with 99 bounces:\n"
126 "./userfaultfd shmem 1000 99\n\n"
127 "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
128 "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
129 "# Run the same hugetlb test but using shmem:\n"
130 "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
131 "# 10MiB-~6GiB 999 bounces anonymous test, "
132 "continue forever unless an error triggers\n"
133 "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
134
135static void usage(void)
136{
137 fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
138 "[hugetlbfs_file]\n\n");
139 fprintf(stderr, "Supported <test type>: anon, hugetlb, "
140 "hugetlb_shared, shmem\n\n");
141 fprintf(stderr, "Examples:\n\n");
142 fprintf(stderr, "%s", examples);
143 exit(1);
144}
145
146#define _err(fmt, ...) \
147 do { \
148 int ret = errno; \
149 fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \
150 fprintf(stderr, " (errno=%d, line=%d)\n", \
151 ret, __LINE__); \
152 } while (0)
153
154#define err(fmt, ...) \
155 do { \
156 _err(fmt, ##__VA_ARGS__); \
157 exit(1); \
158 } while (0)
159
160static void uffd_stats_reset(struct uffd_stats *uffd_stats,
161 unsigned long n_cpus)
162{
163 int i;
164
165 for (i = 0; i < n_cpus; i++) {
166 uffd_stats[i].cpu = i;
167 uffd_stats[i].missing_faults = 0;
168 uffd_stats[i].wp_faults = 0;
169 uffd_stats[i].minor_faults = 0;
170 }
171}
172
/*
 * Print the accumulated userfault counters: a total per fault type
 * followed by a per-CPU breakdown.  Each breakdown loop emits "N+" per
 * CPU; the trailing "\b" backspaces over the final '+' so the closing
 * parenthesis lines up.
 */
static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
{
	int i;
	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;

	for (i = 0; i < n_cpus; i++) {
		miss_total += stats[i].missing_faults;
		wp_total += stats[i].wp_faults;
		minor_total += stats[i].minor_faults;
	}

	printf("userfaults: ");
	if (miss_total) {
		printf("%llu missing (", miss_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", stats[i].missing_faults);
		printf("\b) ");
	}
	if (wp_total) {
		printf("%llu wp (", wp_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", stats[i].wp_faults);
		printf("\b) ");
	}
	if (minor_total) {
		printf("%llu minor (", minor_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", stats[i].minor_faults);
		printf("\b)");
	}
	printf("\n");
}
205
206static void anon_release_pages(char *rel_area)
207{
208 if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
209 err("madvise(MADV_DONTNEED) failed");
210}
211
212static void anon_allocate_area(void **alloc_area)
213{
214 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
215 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
216 if (*alloc_area == MAP_FAILED)
217 err("mmap of anonymous memory failed");
218}
219
/* Anonymous memory has no alias mapping, so there is nothing to remap. */
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
}
223
224static void hugetlb_release_pages(char *rel_area)
225{
226 if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
227 rel_area == huge_fd_off0 ? 0 : nr_pages * page_size,
228 nr_pages * page_size))
229 err("fallocate() failed");
230}
231
/*
 * Map one half of the hugetlbfs file: area_src maps file offset 0,
 * area_dst the second half.  For shared mappings, also create an alias
 * mapping of the same file range so -EEXIST retry paths can later be
 * driven through untouched pagetables.
 *
 * NOTE: the "*alloc_area == area_src" test is intentionally subtle.
 * Before the first mmap both pointers are NULL when allocating src
 * (areas are munmap'd/NULLed by uffd_test_ctx_clear), so it selects
 * offset 0; when allocating dst, area_src is already set so the second
 * half is chosen.  After the assignment, *alloc_area aliases area_src
 * itself when src is being allocated.
 */
static void hugetlb_allocate_area(void **alloc_area)
{
	void *area_alias = NULL;
	char **alloc_area_alias;

	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
			   MAP_HUGETLB,
			   huge_fd, *alloc_area == area_src ? 0 :
			   nr_pages * page_size);
	if (*alloc_area == MAP_FAILED)
		err("mmap of hugetlbfs file failed");

	if (map_shared) {
		/* Second mapping of the same file range as the alias */
		area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
				  MAP_SHARED | MAP_HUGETLB,
				  huge_fd, *alloc_area == area_src ? 0 :
				  nr_pages * page_size);
		if (area_alias == MAP_FAILED)
			err("mmap of hugetlb file alias failed");
	}

	if (*alloc_area == area_src) {
		/* Remember where file offset 0 is mapped for release_pages() */
		huge_fd_off0 = *alloc_area;
		alloc_area_alias = &area_src_alias;
	} else {
		alloc_area_alias = &area_dst_alias;
	}
	if (area_alias)
		*alloc_area_alias = area_alias;
}
263
/* Redirect *start into the dst alias mapping (shared hugetlb only). */
static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	if (!map_shared)
		return;
	/*
	 * We can't zap just the pagetable with hugetlbfs because
	 * MADV_DONTNEED won't work. So exercise -EEXIST on an alias
	 * mapping where the pagetables are not established initially,
	 * this way we'll exercise the -EEXIST at the fs level.
	 */
	*start = (unsigned long) area_dst_alias + offset;
}
276
277static void shmem_release_pages(char *rel_area)
278{
279 if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
280 err("madvise(MADV_REMOVE) failed");
281}
282
283static void shmem_allocate_area(void **alloc_area)
284{
285 void *area_alias = NULL;
286 bool is_src = alloc_area == (void **)&area_src;
287 unsigned long offset = is_src ? 0 : nr_pages * page_size;
288
289 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
290 MAP_SHARED, shm_fd, offset);
291 if (*alloc_area == MAP_FAILED)
292 err("mmap of memfd failed");
293
294 area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
295 MAP_SHARED, shm_fd, offset);
296 if (area_alias == MAP_FAILED)
297 err("mmap of memfd alias failed");
298
299 if (is_src)
300 area_src_alias = area_alias;
301 else
302 area_dst_alias = area_alias;
303}
304
305static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
306{
307 *start = (unsigned long)area_dst_alias + offset;
308}
309
/*
 * Per-memory-type operations: how to allocate a test area, release its
 * pages, and redirect an address into the alias mapping (if any).
 */
struct uffd_test_ops {
	void (*allocate_area)(void **alloc_area);
	void (*release_pages)(char *rel_area);
	void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
};

static struct uffd_test_ops anon_uffd_test_ops = {
	.allocate_area	= anon_allocate_area,
	.release_pages	= anon_release_pages,
	.alias_mapping = noop_alias_mapping,
};

static struct uffd_test_ops shmem_uffd_test_ops = {
	.allocate_area	= shmem_allocate_area,
	.release_pages	= shmem_release_pages,
	.alias_mapping = shmem_alias_mapping,
};

static struct uffd_test_ops hugetlb_uffd_test_ops = {
	.allocate_area	= hugetlb_allocate_area,
	.release_pages	= hugetlb_release_pages,
	.alias_mapping = hugetlb_alias_mapping,
};

/* Selected at startup based on the <test type> CLI argument */
static struct uffd_test_ops *uffd_test_ops;
335
336static inline uint64_t uffd_minor_feature(void)
337{
338 if (test_type == TEST_HUGETLB && map_shared)
339 return UFFD_FEATURE_MINOR_HUGETLBFS;
340 else if (test_type == TEST_SHMEM)
341 return UFFD_FEATURE_MINOR_SHMEM;
342 else
343 return 0;
344}
345
346static uint64_t get_expected_ioctls(uint64_t mode)
347{
348 uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
349
350 if (test_type == TEST_HUGETLB)
351 ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
352
353 if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
354 ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
355
356 if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
357 ioctls &= ~(1 << _UFFDIO_CONTINUE);
358
359 return ioctls;
360}
361
/* Exit with an error unless every expected ioctl bit is present in @ioctls. */
static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
{
	uint64_t expected = get_expected_ioctls(mode);
	uint64_t present = ioctls & expected;

	if (present != expected)
		err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
		    expected, present);
}
372
/*
 * Open a new userfaultfd (into the global `uffd`) and perform the
 * UFFDIO_API handshake requesting *features.  On return *features holds
 * the feature set the kernel actually enabled.  Exits on any failure.
 */
static void userfaultfd_open(uint64_t *features)
{
	struct uffdio_api uffdio_api;

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
	if (uffd < 0)
		err("userfaultfd syscall not available in this kernel");
	/* Stash the fd flags so tests can toggle O_NONBLOCK later */
	uffd_flags = fcntl(uffd, F_GETFD, NULL);

	uffdio_api.api = UFFD_API;
	uffdio_api.features = *features;
	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
		err("UFFDIO_API failed.\nPlease make sure to "
		    "run with either root or ptrace capability.");
	if (uffdio_api.api != UFFD_API)
		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);

	*features = uffdio_api.features;
}
392
393static inline void munmap_area(void **area)
394{
395 if (*area)
396 if (munmap(*area, nr_pages * page_size))
397 err("munmap");
398
399 *area = NULL;
400}
401
/*
 * Tear down all per-test state: the wake-up pipes, the verification
 * array, the userfaultfd itself and the four mapped areas.  Safe to
 * call when nothing is allocated yet (all state is NULL/-1 checked),
 * which is how uffd_test_ctx_init() uses it between tests.
 */
static void uffd_test_ctx_clear(void)
{
	size_t i;

	if (pipefd) {
		/* Two fds (read+write end) per CPU */
		for (i = 0; i < nr_cpus * 2; ++i) {
			if (close(pipefd[i]))
				err("close pipefd");
		}
		free(pipefd);
		pipefd = NULL;
	}

	if (count_verify) {
		free(count_verify);
		count_verify = NULL;
	}

	if (uffd != -1) {
		if (close(uffd))
			err("close uffd");
		uffd = -1;
	}

	huge_fd_off0 = NULL;
	munmap_area((void **)&area_src);
	munmap_area((void **)&area_src_alias);
	munmap_area((void **)&area_dst);
	munmap_area((void **)&area_dst_alias);
}
432
/*
 * (Re)create the full test context: tear down any previous state, map
 * area_src/area_dst, open the userfaultfd with @features, seed every
 * src page with a mutex plus a counter of 1 (mirrored in count_verify),
 * empty area_dst, and create one wake-up pipe pair per CPU.
 */
static void uffd_test_ctx_init(uint64_t features)
{
	unsigned long nr, cpu;

	uffd_test_ctx_clear();

	uffd_test_ops->allocate_area((void **)&area_src);
	uffd_test_ops->allocate_area((void **)&area_dst);

	userfaultfd_open(&features);

	count_verify = malloc(nr_pages * sizeof(unsigned long long));
	if (!count_verify)
		err("count_verify");

	for (nr = 0; nr < nr_pages; nr++) {
		/* Per-page lock at offset 0 (see area_mutex()) */
		*area_mutex(area_src, nr) =
			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
		count_verify[nr] = *area_count(area_src, nr) = 1;
		/*
		 * In the transition between 255 to 256, powerpc will
		 * read out of order in my_bcmp and see both bytes as
		 * zero, so leave a placeholder below always non-zero
		 * after the count, to avoid my_bcmp to trigger false
		 * positives.
		 */
		*(area_count(area_src, nr) + 1) = 1;
	}

	/*
	 * After initialization of area_src, we must explicitly release pages
	 * for area_dst to make sure it's fully empty.  Otherwise we could have
	 * some area_dst pages be errornously initialized with zero pages,
	 * hence we could hit memory corruption later in the test.
	 *
	 * One example is when THP is globally enabled, above allocate_area()
	 * calls could have the two areas merged into a single VMA (as they
	 * will have the same VMA flags so they're mergeable).  When we
	 * initialize the area_src above, it's possible that some part of
	 * area_dst could have been faulted in via one huge THP that will be
	 * shared between area_src and area_dst.  It could cause some of the
	 * area_dst won't be trapped by missing userfaults.
	 *
	 * This release_pages() will guarantee even if that happened, we'll
	 * proactively split the thp and drop any accidentally initialized
	 * pages within area_dst.
	 */
	uffd_test_ops->release_pages(area_dst);

	pipefd = malloc(sizeof(int) * nr_cpus * 2);
	if (!pipefd)
		err("pipefd");
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
			err("pipe");
}
489
/*
 * Byte-wise compare of two buffers: returns 0 if the first @n bytes are
 * equal, 1 otherwise (hand-rolled so the access pattern is predictable
 * on all architectures — see the powerpc note in uffd_test_ctx_init()).
 */
static int my_bcmp(char *str1, char *str2, size_t n)
{
	size_t idx;

	for (idx = 0; idx < n; idx++) {
		if (str1[idx] != str2[idx])
			return 1;
	}
	return 0;
}
498
/*
 * Set (@wp true) or clear (@wp false) userfaultfd write-protection on
 * [start, start+len).  Clearing the protection with mode 0 also wakes
 * any thread blocked on the range.
 */
static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect prms;

	prms.range.start = start;
	prms.range.len = len;
	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
		/* Fix: old message claimed "clear WP" even when setting it */
		err("UFFDIO_WRITEPROTECT (wp=%d) failed: address=0x%"PRIx64,
		    wp, (uint64_t)start);
}
512
/*
 * Resolve a minor fault on [start, start+len) with UFFDIO_CONTINUE,
 * then deliberately repeat the ioctl to verify the kernel's -EEXIST
 * error path does not misbehave.
 */
static void continue_range(int ufd, __u64 start, __u64 len)
{
	struct uffdio_continue req;
	int ret;

	req.range.start = start;
	req.range.len = len;
	req.mode = 0;

	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
		    (uint64_t)start);

	/*
	 * Error handling within the kernel for continue is subtly different
	 * from copy or zeropage, so it may be a source of bugs. Trigger an
	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
	 */
	req.mapped = 0;
	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
	if (ret >= 0 || req.mapped != -EEXIST)
		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
		    ret, (int64_t) req.mapped);
}
537
/*
 * Worker thread (type 1 in the file header): repeatedly pick a page of
 * area_dst, take its in-page mutex, and bump the per-page counter while
 * checking it against count_verify[].  Touching the page while its data
 * may still live in area_src is what generates the userfaults.
 *
 * Page selection: random when BOUNCE_RANDOM; otherwise sequential,
 * confined to this CPU's slice unless BOUNCE_RACINGFAULTS.
 */
static void *locking_thread(void *arg)
{
	unsigned long cpu = (unsigned long) arg;
	/* self-read only silences the compiler's uninitialized warning */
	unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
	unsigned long long count;

	if (!(bounces & BOUNCE_RANDOM)) {
		page_nr = -bounces;
		if (!(bounces & BOUNCE_RACINGFAULTS))
			page_nr += cpu * nr_pages_per_cpu;
	}

	while (!finished) {
		if (bounces & BOUNCE_RANDOM) {
			if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
				err("getrandom failed");
		} else
			page_nr += 1;
		page_nr %= nr_pages;
		/* The lock itself lives inside the faulting page */
		pthread_mutex_lock(area_mutex(area_dst, page_nr));
		count = *area_count(area_dst, page_nr);
		if (count != count_verify[page_nr])
			err("page_nr %lu memory corruption %llu %llu",
			    page_nr, count, count_verify[page_nr]);
		count++;
		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
		pthread_mutex_unlock(area_mutex(area_dst, page_nr));
	}

	return NULL;
}
569
570static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
571 unsigned long offset)
572{
573 uffd_test_ops->alias_mapping(&uffdio_copy->dst,
574 uffdio_copy->len,
575 offset);
576 if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
577 /* real retval in ufdio_copy.copy */
578 if (uffdio_copy->copy != -EEXIST)
579 err("UFFDIO_COPY retry error: %"PRId64,
580 (int64_t)uffdio_copy->copy);
581 } else {
582 err("UFFDIO_COPY retry unexpected: %"PRId64,
583 (int64_t)uffdio_copy->copy);
584 }
585}
586
/*
 * Explicitly wake any faulting threads blocked on [addr, addr+len).
 * Needed after a UFFDIO_COPY that returned -EEXIST, which does not wake
 * waiters by itself.
 */
static void wake_range(int ufd, unsigned long addr, unsigned long len)
{
	struct uffdio_range uffdio_wake;

	uffdio_wake.start = addr;
	uffdio_wake.len = len;

	/*
	 * Consistency fix: use the file's err() macro (adds errno and
	 * line number) instead of the old bare fprintf()+exit() comma
	 * expression.
	 */
	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
		err("error waking %lu", addr);
}
598
/*
 * Resolve a missing fault at @offset with UFFDIO_COPY (write-protected
 * when uffd-wp is under test).  Returns 1 if this call populated the
 * page, 0 if it raced with another resolver (-EEXIST).  When @retry is
 * set, also exercise the -EEXIST retry path once per alarm interval.
 */
static int __copy_page(int ufd, unsigned long offset, bool retry)
{
	struct uffdio_copy uffdio_copy;

	if (offset >= nr_pages * page_size)
		err("unexpected offset %lu\n", offset);
	uffdio_copy.dst = (unsigned long) area_dst + offset;
	uffdio_copy.src = (unsigned long) area_src + offset;
	uffdio_copy.len = page_size;
	if (test_uffdio_wp)
		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
	else
		uffdio_copy.mode = 0;
	uffdio_copy.copy = 0;
	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
		/* real retval in ufdio_copy.copy */
		if (uffdio_copy.copy != -EEXIST)
			err("UFFDIO_COPY error: %"PRId64,
			    (int64_t)uffdio_copy.copy);
		/* -EEXIST doesn't wake waiters; do it explicitly */
		wake_range(ufd, uffdio_copy.dst, page_size);
	} else if (uffdio_copy.copy != page_size) {
		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
	} else {
		if (test_uffdio_copy_eexist && retry) {
			test_uffdio_copy_eexist = false;
			retry_copy_page(ufd, &uffdio_copy, offset);
		}
		return 1;
	}
	return 0;
}
630
/* Copy one page, also exercising the -EEXIST retry path when armed. */
static int copy_page_retry(int ufd, unsigned long offset)
{
	return __copy_page(ufd, offset, true);
}
635
/* Copy one page without the -EEXIST retry exercise. */
static int copy_page(int ufd, unsigned long offset)
{
	return __copy_page(ufd, offset, false);
}
640
/*
 * Read one uffd_msg from @ufd.  Returns 0 on success, 1 if the read
 * would block (EAGAIN — possible because the fd is O_NONBLOCK); exits
 * on any other error or on a short read.
 *
 * Fix: read from the @ufd parameter instead of the global `uffd`.  All
 * current callers pass the global, so behavior is unchanged, but the
 * old code silently ignored its argument.
 */
static int uffd_read_msg(int ufd, struct uffd_msg *msg)
{
	int ret = read(ufd, msg, sizeof(*msg));

	if (ret != sizeof(*msg)) {
		if (ret < 0) {
			if (errno == EAGAIN)
				return 1;
			err("blocking read error");
		} else {
			err("short read");
		}
	}

	return 0;
}
657
/*
 * Dispatch one pagefault message: clear write-protection for WP faults,
 * bit-flip + UFFDIO_CONTINUE for minor faults, UFFDIO_COPY for missing
 * faults.  Updates the matching counter in @stats.
 */
static void uffd_handle_page_fault(struct uffd_msg *msg,
				   struct uffd_stats *stats)
{
	unsigned long offset;

	if (msg->event != UFFD_EVENT_PAGEFAULT)
		err("unexpected msg event %u", msg->event);

	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
		/* Write protect page faults */
		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
		stats->wp_faults++;
	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
		uint8_t *area;
		int b;

		/*
		 * Minor page faults
		 *
		 * To prove we can modify the original range for testing
		 * purposes, we're going to bit flip this range before
		 * continuing.
		 *
		 * Note that this requires all minor page fault tests operate on
		 * area_dst (non-UFFD-registered) and area_dst_alias
		 * (UFFD-registered).
		 */

		/* Translate the faulting alias address back into area_dst */
		area = (uint8_t *)(area_dst +
				   ((char *)msg->arg.pagefault.address -
				    area_dst_alias));
		for (b = 0; b < page_size; ++b)
			area[b] = ~area[b];
		continue_range(uffd, msg->arg.pagefault.address, page_size);
		stats->minor_faults++;
	} else {
		/* Missing page faults */
		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
			err("unexpected write fault");

		/* Round the fault address down to a page boundary */
		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
		offset &= ~(page_size-1);

		if (copy_page(uffd, offset))
			stats->missing_faults++;
	}
}
705
/*
 * Fault-handling thread (poll() mode, type 2 in the file header).
 * poll()s the userfaultfd plus this CPU's wake-up pipe: a byte on the
 * pipe means "exit", readiness on the uffd means messages to service.
 * Also handles the non-cooperative events (fork/remove/remap) used by
 * the events test.
 */
static void *uffd_poll_thread(void *arg)
{
	struct uffd_stats *stats = (struct uffd_stats *)arg;
	unsigned long cpu = stats->cpu;
	struct pollfd pollfd[2];
	struct uffd_msg msg;
	struct uffdio_register uffd_reg;
	int ret;
	char tmp_chr;

	pollfd[0].fd = uffd;
	pollfd[0].events = POLLIN;
	/* Read end of this CPU's shutdown pipe */
	pollfd[1].fd = pipefd[cpu*2];
	pollfd[1].events = POLLIN;

	for (;;) {
		ret = poll(pollfd, 2, -1);
		if (ret <= 0)
			err("poll error: %d", ret);
		if (pollfd[1].revents & POLLIN) {
			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
				err("read pipefd error");
			break;
		}
		if (!(pollfd[0].revents & POLLIN))
			err("pollfd[0].revents %d", pollfd[0].revents);
		/* Non-blocking read can race to EAGAIN; just re-poll */
		if (uffd_read_msg(uffd, &msg))
			continue;
		switch (msg.event) {
		default:
			err("unexpected msg event %u\n", msg.event);
			break;
		case UFFD_EVENT_PAGEFAULT:
			uffd_handle_page_fault(&msg, stats);
			break;
		case UFFD_EVENT_FORK:
			/* Switch to the child's duplicated uffd */
			close(uffd);
			uffd = msg.arg.fork.ufd;
			pollfd[0].fd = uffd;
			break;
		case UFFD_EVENT_REMOVE:
			uffd_reg.range.start = msg.arg.remove.start;
			uffd_reg.range.len = msg.arg.remove.end -
				msg.arg.remove.start;
			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
				err("remove failure");
			break;
		case UFFD_EVENT_REMAP:
			/* Track the area as it moves under mremap() */
			area_dst = (char *)(unsigned long)msg.arg.remap.to;
			break;
		}
	}

	return NULL;
}
761
762pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
763
/*
 * Fault-handling thread (blocking-read mode).  The creator holds
 * uffd_read_mutex while spawning; unlocking it here signals the thread
 * is running and may be pthread_cancel()ed (the read() call is the
 * cancellation point — the thread loops forever otherwise).
 */
static void *uffd_read_thread(void *arg)
{
	struct uffd_stats *stats = (struct uffd_stats *)arg;
	struct uffd_msg msg;

	pthread_mutex_unlock(&uffd_read_mutex);
	/* from here cancellation is ok */

	for (;;) {
		if (uffd_read_msg(uffd, &msg))
			continue;
		uffd_handle_page_fault(&msg, stats);
	}

	return NULL;
}
780
/*
 * Background transfer thread (type 3 in the file header): copies this
 * CPU's slice of pages from area_src to area_dst at full speed.  When
 * uffd-wp is under test, the first half is copied, write-protected,
 * then the second half is copied — so WP faults get exercised on pages
 * that are already mapped.
 */
static void *background_thread(void *arg)
{
	unsigned long cpu = (unsigned long) arg;
	unsigned long page_nr, start_nr, mid_nr, end_nr;

	start_nr = cpu * nr_pages_per_cpu;
	end_nr = (cpu+1) * nr_pages_per_cpu;
	mid_nr = (start_nr + end_nr) / 2;

	/* Copy the first half of the pages */
	for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
		copy_page_retry(uffd, page_nr * page_size);

	/*
	 * If we need to test uffd-wp, set it up now.  Then we'll have
	 * at least the first half of the pages mapped already which
	 * can be write-protected for testing
	 */
	if (test_uffdio_wp)
		wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
			nr_pages_per_cpu * page_size, true);

	/*
	 * Continue the 2nd half of the page copying, handling write
	 * protection faults if any
	 */
	for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
		copy_page_retry(uffd, page_nr * page_size);

	return NULL;
}
812
/*
 * Run one bounce: spawn the three per-CPU thread types (locking,
 * fault-handling, background copy), wait for the background copy to
 * finish, zap area_src, stop the locking threads, then shut down the
 * fault handlers (pipe write for poll mode, pthread_cancel for read
 * mode).  Returns 0 on success, 1 on any thread failure.
 */
static int stress(struct uffd_stats *uffd_stats)
{
	unsigned long cpu;
	pthread_t locking_threads[nr_cpus];
	pthread_t uffd_threads[nr_cpus];
	pthread_t background_threads[nr_cpus];

	finished = 0;
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (pthread_create(&locking_threads[cpu], &attr,
				   locking_thread, (void *)cpu))
			return 1;
		if (bounces & BOUNCE_POLL) {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_poll_thread,
					   (void *)&uffd_stats[cpu]))
				return 1;
		} else {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_read_thread,
					   (void *)&uffd_stats[cpu]))
				return 1;
			/* Wait until the reader thread has started up */
			pthread_mutex_lock(&uffd_read_mutex);
		}
		if (pthread_create(&background_threads[cpu], &attr,
				   background_thread, (void *)cpu))
			return 1;
	}
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(background_threads[cpu], NULL))
			return 1;

	/*
	 * Be strict and immediately zap area_src, the whole area has
	 * been transferred already by the background treads.  The
	 * area_src could then be faulted in in a racy way by still
	 * running uffdio_threads reading zeropages after we zapped
	 * area_src (but they're guaranteed to get -EEXIST from
	 * UFFDIO_COPY without writing zero pages into area_dst
	 * because the background threads already completed).
	 */
	uffd_test_ops->release_pages(area_src);

	finished = 1;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(locking_threads[cpu], NULL))
			return 1;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		char c;
		if (bounces & BOUNCE_POLL) {
			/* One byte on the pipe tells the poll thread to exit */
			if (write(pipefd[cpu*2+1], &c, 1) != 1)
				err("pipefd write error");
			if (pthread_join(uffd_threads[cpu],
					 (void *)&uffd_stats[cpu]))
				return 1;
		} else {
			if (pthread_cancel(uffd_threads[cpu]))
				return 1;
			if (pthread_join(uffd_threads[cpu], NULL))
				return 1;
		}
	}

	return 0;
}
879
880sigjmp_buf jbuf, *sigbuf;
881
882static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
883{
884 if (sig == SIGBUS) {
885 if (sigbuf)
886 siglongjmp(*sigbuf, 1);
887 abort();
888 }
889}
890
891/*
892 * For non-cooperative userfaultfd test we fork() a process that will
893 * generate pagefaults, will mremap the area monitored by the
894 * userfaultfd and at last this process will release the monitored
895 * area.
896 * For the anonymous and shared memory the area is divided into two
897 * parts, the first part is accessed before mremap, and the second
898 * part is accessed after mremap. Since hugetlbfs does not support
899 * mremap, the entire monitored area is accessed in a single pass for
900 * HUGETLB_TEST.
901 * The release of the pages currently generates event for shmem and
902 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
903 * for hugetlb.
904 * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
905 * monitored area, generate pagefaults and test that signal is delivered.
906 * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
907 * test robustness use case - we release monitored area, fork a process
908 * that will generate pagefaults and verify signal is generated.
909 * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
910 * feature. Using monitor thread, verify no userfault events are generated.
911 */
/* See the block comment above for the full protocol description. */
static int faulting_process(int signal_test)
{
	unsigned long nr;
	unsigned long long count;
	unsigned long split_nr_pages;
	unsigned long lastnr;
	struct sigaction act;
	unsigned long signalled = 0;

	/* hugetlb can't mremap, so it accesses everything in one pass */
	if (test_type != TEST_HUGETLB)
		split_nr_pages = (nr_pages + 1) / 2;
	else
		split_nr_pages = nr_pages;

	if (signal_test) {
		sigbuf = &jbuf;
		memset(&act, 0, sizeof(act));
		act.sa_sigaction = sighndl;
		act.sa_flags = SA_SIGINFO;
		if (sigaction(SIGBUS, &act, 0))
			err("sigaction");
		lastnr = (unsigned long)-1;
	}

	for (nr = 0; nr < split_nr_pages; nr++) {
		int steps = 1;
		unsigned long offset = nr * page_size;

		if (signal_test) {
			/* Re-entered here via siglongjmp() from sighndl() */
			if (sigsetjmp(*sigbuf, 1) != 0) {
				/* steps==1 again on the same page: no progress */
				if (steps == 1 && nr == lastnr)
					err("Signal repeated");

				lastnr = nr;
				if (signal_test == 1) {
					if (steps == 1) {
						/* This is a MISSING request */
						steps++;
						if (copy_page(uffd, offset))
							signalled++;
					} else {
						/* This is a WP request */
						assert(steps == 2);
						wp_range(uffd,
							 (__u64)area_dst +
							 offset,
							 page_size, false);
					}
				} else {
					/* signal_test == 2: just count SIGBUSes */
					signalled++;
					continue;
				}
			}
		}

		count = *area_count(area_dst, nr);
		if (count != count_verify[nr])
			err("nr %lu memory corruption %llu %llu\n",
			    nr, count, count_verify[nr]);
		/*
		 * Trigger write protection if there is by writing
		 * the same value back.
		 */
		*area_count(area_dst, nr) = count;
	}

	if (signal_test)
		return signalled != split_nr_pages;

	if (test_type == TEST_HUGETLB)
		return 0;

	/* Move the monitored area on top of area_src (UFFD_EVENT_REMAP) */
	area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
	if (area_dst == MAP_FAILED)
		err("mremap");
	/* Reset area_src since we just clobbered it */
	area_src = NULL;

	/* Second half, accessed at the post-mremap location */
	for (; nr < nr_pages; nr++) {
		count = *area_count(area_dst, nr);
		if (count != count_verify[nr]) {
			err("nr %lu memory corruption %llu %llu\n",
			    nr, count, count_verify[nr]);
		}
		/*
		 * Trigger write protection if there is by writing
		 * the same value back.
		 */
		*area_count(area_dst, nr) = count;
	}

	/* Generates UFFD_EVENT_REMOVE for anon/shmem */
	uffd_test_ops->release_pages(area_dst);

	for (nr = 0; nr < nr_pages; nr++)
		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
			err("nr %lu is not zero", nr);

	return 0;
}
1012
1013static void retry_uffdio_zeropage(int ufd,
1014 struct uffdio_zeropage *uffdio_zeropage,
1015 unsigned long offset)
1016{
1017 uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
1018 uffdio_zeropage->range.len,
1019 offset);
1020 if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
1021 if (uffdio_zeropage->zeropage != -EEXIST)
1022 err("UFFDIO_ZEROPAGE error: %"PRId64,
1023 (int64_t)uffdio_zeropage->zeropage);
1024 } else {
1025 err("UFFDIO_ZEROPAGE error: %"PRId64,
1026 (int64_t)uffdio_zeropage->zeropage);
1027 }
1028}
1029
/*
 * Install a zero page at @offset in area_dst.  Returns 1 when this call
 * populated the page, 0 otherwise.  When the memory type is expected
 * not to support UFFDIO_ZEROPAGE (hugetlb), verify the ioctl fails with
 * -EINVAL instead.  With @retry, also exercise the -EEXIST retry path
 * once per alarm interval.
 */
static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
{
	struct uffdio_zeropage uffdio_zeropage;
	int ret;
	bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE);
	__s64 res;

	if (offset >= nr_pages * page_size)
		err("unexpected offset %lu", offset);
	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
	uffdio_zeropage.range.len = page_size;
	uffdio_zeropage.mode = 0;
	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
	res = uffdio_zeropage.zeropage;
	if (ret) {
		/* real retval in ufdio_zeropage.zeropage */
		if (has_zeropage)
			err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
		else if (res != -EINVAL)
			err("UFFDIO_ZEROPAGE not -EINVAL");
	} else if (has_zeropage) {
		if (res != page_size) {
			err("UFFDIO_ZEROPAGE unexpected size");
		} else {
			if (test_uffdio_zeropage_eexist && retry) {
				test_uffdio_zeropage_eexist = false;
				retry_uffdio_zeropage(ufd, &uffdio_zeropage,
						      offset);
			}
			return 1;
		}
	} else
		err("UFFDIO_ZEROPAGE succeeded");

	return 0;
}
1066
/* Zeropage one page without the -EEXIST retry exercise. */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
	return __uffdio_zeropage(ufd, offset, false);
}
1071
/*
 * exercise UFFDIO_ZEROPAGE: register area_dst, install a zero page at
 * offset 0 and, only when the install succeeded, verify the page really
 * reads back as zeroes.  Always returns 0 (failures exit via err()).
 */
static int userfaultfd_zeropage_test(void)
{
	struct uffdio_register uffdio_register;

	printf("testing UFFDIO_ZEROPAGE: ");
	fflush(stdout);

	uffd_test_ctx_init(0);

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (test_uffdio_wp)
		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		err("register failure");

	assert_expected_ioctls_present(
		uffdio_register.mode, uffdio_register.ioctls);

	if (uffdio_zeropage(uffd, 0))
		if (my_bcmp(area_dst, zeropage, page_size))
			err("zeropage is not zero");

	printf("done.\n");
	return 0;
}
1100
/*
 * Exercise UFFD event notifications (fork, remap, remove): a forked
 * child runs faulting_process(0) over area_dst while uffd_poll_thread
 * resolves the generated events and missing faults.  Returns non-zero
 * if the number of resolved missing faults differs from nr_pages.
 */
static int userfaultfd_events_test(void)
{
	struct uffdio_register uffdio_register;
	pthread_t uffd_mon;
	/*
	 * The 'err' variable coexists with the function-like err()
	 * macro: the macro expands only when followed by '(', so bare
	 * uses of 'err' below refer to this wait-status variable.
	 */
	int err, features;
	pid_t pid;
	char c;
	struct uffd_stats stats = { 0 };

	printf("testing events (fork, remap, remove): ");
	fflush(stdout);

	/* Request fork/remap/remove event delivery on top of page faults */
	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
		UFFD_FEATURE_EVENT_REMOVE;
	uffd_test_ctx_init(features);

	/* The poll()-driven monitor thread needs a non-blocking uffd */
	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (test_uffdio_wp)
		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		err("register failure");

	assert_expected_ioctls_present(
		uffdio_register.mode, uffdio_register.ioctls);

	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
		err("uffd_poll_thread create");

	pid = fork();
	if (pid < 0)
		err("fork");

	if (!pid)
		exit(faulting_process(0));

	/* Raw wait status: zero only on a clean exit(0) from the child */
	waitpid(pid, &err, 0);
	if (err)
		err("faulting process failed");
	/* Wake uffd_poll_thread via the pipe so it can terminate */
	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
		err("pipe write");
	if (pthread_join(uffd_mon, NULL))
		return 1;

	uffd_stats_report(&stats, 1);

	return stats.missing_faults != nr_pages;
}
1152
/*
 * Exercise UFFD_FEATURE_SIGBUS: with SIGBUS delivery enabled, missing
 * faults must raise a signal in the faulting process instead of being
 * queued as userfault messages.  The monitor thread therefore must see
 * zero userfaults; any userfault counted is a failure.
 */
static int userfaultfd_sig_test(void)
{
	struct uffdio_register uffdio_register;
	unsigned long userfaults;
	pthread_t uffd_mon;
	/* 'err' variable vs err() macro: macro expands only before '(' */
	int err, features;
	pid_t pid;
	char c;
	struct uffd_stats stats = { 0 };

	printf("testing signal delivery: ");
	fflush(stdout);

	features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
	uffd_test_ctx_init(features);

	/* Non-blocking mode for the poll()-based monitor thread */
	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (test_uffdio_wp)
		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		err("register failure");

	assert_expected_ioctls_present(
		uffdio_register.mode, uffdio_register.ioctls);

	/*
	 * First populate in-process (mode 1), then drop the pages again
	 * so the forked child below faults on a truly missing range.
	 * NOTE(review): faulting_process() modes (0/1/2) are defined
	 * elsewhere in this file — confirm semantics there.
	 */
	if (faulting_process(1))
		err("faulting process failed");

	uffd_test_ops->release_pages(area_dst);

	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
		err("uffd_poll_thread create");

	pid = fork();
	if (pid < 0)
		err("fork");

	if (!pid)
		exit(faulting_process(2));

	/* Raw wait status: zero only on a clean exit(0) */
	waitpid(pid, &err, 0);
	if (err)
		err("faulting process failed");
	/* Wake uffd_poll_thread; it returns its userfault count */
	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
		err("pipe write");
	if (pthread_join(uffd_mon, (void **)&userfaults))
		return 1;

	printf("done.\n");
	if (userfaults)
		err("Signal test failed, userfaults: %ld", userfaults);

	return userfaults != 0;
}
1211
/*
 * Exercise UFFDIO_REGISTER_MODE_MINOR: populate the non-registered
 * alias of a shared mapping, then read through the registered alias so
 * every first touch raises a minor fault.  uffd_poll_thread resolves
 * each fault by bit-flipping the page and issuing UFFDIO_CONTINUE, so
 * the reader must observe the flipped pattern.  Returns non-zero when
 * the fault counters don't match expectations.
 */
static int userfaultfd_minor_test(void)
{
	struct uffdio_register uffdio_register;
	unsigned long p;
	pthread_t uffd_mon;
	uint8_t expected_byte;
	void *expected_page;
	char c;
	struct uffd_stats stats = { 0 };

	/* Only run when the chosen memory type supports minor faults */
	if (!test_uffdio_minor)
		return 0;

	printf("testing minor faults: ");
	fflush(stdout);

	uffd_test_ctx_init(uffd_minor_feature());

	uffdio_register.range.start = (unsigned long)area_dst_alias;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		err("register failure");

	assert_expected_ioctls_present(
		uffdio_register.mode, uffdio_register.ioctls);

	/*
	 * After registering with UFFD, populate the non-UFFD-registered side of
	 * the shared mapping. This should *not* trigger any UFFD minor faults.
	 */
	for (p = 0; p < nr_pages; ++p) {
		/* Fill page p with the byte p % 255 (0xff is never used) */
		memset(area_dst + (p * page_size), p % ((uint8_t)-1),
		       page_size);
	}

	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
		err("uffd_poll_thread create");

	/*
	 * Read each of the pages back using the UFFD-registered mapping. We
	 * expect that the first time we touch a page, it will result in a minor
	 * fault. uffd_poll_thread will resolve the fault by bit-flipping the
	 * page's contents, and then issuing a CONTINUE ioctl.
	 */

	if (posix_memalign(&expected_page, page_size, page_size))
		err("out of memory");

	for (p = 0; p < nr_pages; ++p) {
		/* The monitor bit-flipped the fill byte we wrote above */
		expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
		memset(expected_page, expected_byte, page_size);
		if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
			    page_size))
			err("unexpected page contents after minor fault");
	}

	/* Wake uffd_poll_thread via the pipe so it can terminate */
	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
		err("pipe write");
	if (pthread_join(uffd_mon, NULL))
		return 1;

	uffd_stats_report(&stats, 1);

	return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
}
1278
#define BIT_ULL(nr) (1ULL << (nr))
/*
 * Flag bits of a 64-bit /proc/<pid>/pagemap entry
 * (see Documentation/admin-guide/mm/pagemap.rst).
 */
#define PM_SOFT_DIRTY BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
#define PM_UFFD_WP BIT_ULL(57)
#define PM_FILE BIT_ULL(61)
#define PM_SWAP BIT_ULL(62)
#define PM_PRESENT BIT_ULL(63)
1286
/* Open this process's pagemap; aborts the test run on failure. */
static int pagemap_open(void)
{
	int pagemap_fd;

	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		err("open pagemap");
	return pagemap_fd;
}
1296
/*
 * Read the 64-bit pagemap entry describing @vaddr from @fd (an open
 * /proc/<pid>/pagemap).  Entries are indexed by virtual page number,
 * i.e. vaddr divided by the *system* base page size.
 */
static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
{
	uint64_t value;
	int ret;

	/*
	 * Divide by the real system page size instead of a hard-coded
	 * ">> 12" (4K): pagemap is indexed by base-page number, so the
	 * old shift computed the wrong offset on kernels with 16K/64K
	 * base pages (arm64, ppc64).
	 */
	ret = pread(fd, &value, sizeof(uint64_t),
		    ((uint64_t)vaddr / sysconf(_SC_PAGESIZE)) *
		    sizeof(uint64_t));
	if (ret != sizeof(uint64_t))
		err("pread() on pagemap failed");

	return value;
}
1309
/*
 * Check the uffd-wp bit of a pagemap entry @value against expectation
 * @wp.  Kept as a macro (not a function) so that err() reports the
 * caller's __LINE__ on mismatch.
 */
#define pagemap_check_wp(value, wp) do { \
		if (!!(value & PM_UFFD_WP) != wp) \
			err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \
	} while (0)
1315
/*
 * Fork a child and verify, from the child's own pagemap, that the
 * uffd-wp bit on area_dst's first page did not leak across fork()
 * (these tests run without UFFD_FEATURE_EVENT_FORK).  Returns the
 * child's raw wait status: 0 on success.
 *
 * NOTE(review): the 'present' argument is currently unused —
 * presumably meant to also assert PM_PRESENT/PM_SWAP state; confirm.
 */
static int pagemap_test_fork(bool present)
{
	pid_t child = fork();
	uint64_t value;
	int fd, result;

	if (!child) {
		/* Open the pagemap fd of the child itself */
		fd = pagemap_open();
		value = pagemap_read_vaddr(fd, area_dst);
		/*
		 * After fork() uffd-wp bit should be gone as long as we're
		 * without UFFD_FEATURE_EVENT_FORK
		 */
		pagemap_check_wp(value, false);
		/* Succeed */
		exit(0);
	}
	waitpid(child, &result, 0);
	return result;
}
1337
1338static void userfaultfd_pagemap_test(unsigned int test_pgsize)
1339{
1340 struct uffdio_register uffdio_register;
1341 int pagemap_fd;
1342 uint64_t value;
1343
1344 /* Pagemap tests uffd-wp only */
1345 if (!test_uffdio_wp)
1346 return;
1347
1348 /* Not enough memory to test this page size */
1349 if (test_pgsize > nr_pages * page_size)
1350 return;
1351
1352 printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
1353 /* Flush so it doesn't flush twice in parent/child later */
1354 fflush(stdout);
1355
1356 uffd_test_ctx_init(0);
1357
1358 if (test_pgsize > page_size) {
1359 /* This is a thp test */
1360 if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
1361 err("madvise(MADV_HUGEPAGE) failed");
1362 } else if (test_pgsize == page_size) {
1363 /* This is normal page test; force no thp */
1364 if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
1365 err("madvise(MADV_NOHUGEPAGE) failed");
1366 }
1367
1368 uffdio_register.range.start = (unsigned long) area_dst;
1369 uffdio_register.range.len = nr_pages * page_size;
1370 uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
1371 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1372 err("register failed");
1373
1374 pagemap_fd = pagemap_open();
1375
1376 /* Touch the page */
1377 *area_dst = 1;
1378 wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
1379 value = pagemap_read_vaddr(pagemap_fd, area_dst);
1380 pagemap_check_wp(value, true);
1381 /* Make sure uffd-wp bit dropped when fork */
1382 if (pagemap_test_fork(true))
1383 err("Detected stall uffd-wp bit in child");
1384
1385 /* Exclusive required or PAGEOUT won't work */
1386 if (!(value & PM_MMAP_EXCLUSIVE))
1387 err("multiple mapping detected: 0x%"PRIx64, value);
1388
1389 if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
1390 err("madvise(MADV_PAGEOUT) failed");
1391
1392 /* Uffd-wp should persist even swapped out */
1393 value = pagemap_read_vaddr(pagemap_fd, area_dst);
1394 pagemap_check_wp(value, true);
1395 /* Make sure uffd-wp bit dropped when fork */
1396 if (pagemap_test_fork(false))
1397 err("Detected stall uffd-wp bit in child");
1398
1399 /* Unprotect; this tests swap pte modifications */
1400 wp_range(uffd, (uint64_t)area_dst, page_size, false);
1401 value = pagemap_read_vaddr(pagemap_fd, area_dst);
1402 pagemap_check_wp(value, false);
1403
1404 /* Fault in the page from disk */
1405 *area_dst = 2;
1406 value = pagemap_read_vaddr(pagemap_fd, area_dst);
1407 pagemap_check_wp(value, false);
1408
1409 close(pagemap_fd);
1410 printf("done\n");
1411}
1412
1413static int userfaultfd_stress(void)
1414{
1415 void *area;
1416 char *tmp_area;
1417 unsigned long nr;
1418 struct uffdio_register uffdio_register;
1419 struct uffd_stats uffd_stats[nr_cpus];
1420
1421 uffd_test_ctx_init(0);
1422
1423 if (posix_memalign(&area, page_size, page_size))
1424 err("out of memory");
1425 zeropage = area;
1426 bzero(zeropage, page_size);
1427
1428 pthread_mutex_lock(&uffd_read_mutex);
1429
1430 pthread_attr_init(&attr);
1431 pthread_attr_setstacksize(&attr, 16*1024*1024);
1432
1433 while (bounces--) {
1434 printf("bounces: %d, mode:", bounces);
1435 if (bounces & BOUNCE_RANDOM)
1436 printf(" rnd");
1437 if (bounces & BOUNCE_RACINGFAULTS)
1438 printf(" racing");
1439 if (bounces & BOUNCE_VERIFY)
1440 printf(" ver");
1441 if (bounces & BOUNCE_POLL)
1442 printf(" poll");
1443 else
1444 printf(" read");
1445 printf(", ");
1446 fflush(stdout);
1447
1448 if (bounces & BOUNCE_POLL)
1449 fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1450 else
1451 fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
1452
1453 /* register */
1454 uffdio_register.range.start = (unsigned long) area_dst;
1455 uffdio_register.range.len = nr_pages * page_size;
1456 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1457 if (test_uffdio_wp)
1458 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1459 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1460 err("register failure");
1461 assert_expected_ioctls_present(
1462 uffdio_register.mode, uffdio_register.ioctls);
1463
1464 if (area_dst_alias) {
1465 uffdio_register.range.start = (unsigned long)
1466 area_dst_alias;
1467 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1468 err("register failure alias");
1469 }
1470
1471 /*
1472 * The madvise done previously isn't enough: some
1473 * uffd_thread could have read userfaults (one of
1474 * those already resolved by the background thread)
1475 * and it may be in the process of calling
1476 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
1477 * area_src and it would map a zero page in it (of
1478 * course such a UFFDIO_COPY is perfectly safe as it'd
1479 * return -EEXIST). The problem comes at the next
1480 * bounce though: that racing UFFDIO_COPY would
1481 * generate zeropages in the area_src, so invalidating
1482 * the previous MADV_DONTNEED. Without this additional
1483 * MADV_DONTNEED those zeropages leftovers in the
1484 * area_src would lead to -EEXIST failure during the
1485 * next bounce, effectively leaving a zeropage in the
1486 * area_dst.
1487 *
1488 * Try to comment this out madvise to see the memory
1489 * corruption being caught pretty quick.
1490 *
1491 * khugepaged is also inhibited to collapse THP after
1492 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
1493 * required to MADV_DONTNEED here.
1494 */
1495 uffd_test_ops->release_pages(area_dst);
1496
1497 uffd_stats_reset(uffd_stats, nr_cpus);
1498
1499 /* bounce pass */
1500 if (stress(uffd_stats))
1501 return 1;
1502
1503 /* Clear all the write protections if there is any */
1504 if (test_uffdio_wp)
1505 wp_range(uffd, (unsigned long)area_dst,
1506 nr_pages * page_size, false);
1507
1508 /* unregister */
1509 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
1510 err("unregister failure");
1511 if (area_dst_alias) {
1512 uffdio_register.range.start = (unsigned long) area_dst;
1513 if (ioctl(uffd, UFFDIO_UNREGISTER,
1514 &uffdio_register.range))
1515 err("unregister failure alias");
1516 }
1517
1518 /* verification */
1519 if (bounces & BOUNCE_VERIFY)
1520 for (nr = 0; nr < nr_pages; nr++)
1521 if (*area_count(area_dst, nr) != count_verify[nr])
1522 err("error area_count %llu %llu %lu\n",
1523 *area_count(area_src, nr),
1524 count_verify[nr], nr);
1525
1526 /* prepare next bounce */
1527 tmp_area = area_src;
1528 area_src = area_dst;
1529 area_dst = tmp_area;
1530
1531 tmp_area = area_src_alias;
1532 area_src_alias = area_dst_alias;
1533 area_dst_alias = tmp_area;
1534
1535 uffd_stats_report(uffd_stats, nr_cpus);
1536 }
1537
1538 if (test_type == TEST_ANON) {
1539 /*
1540 * shmem/hugetlb won't be able to run since they have different
1541 * behavior on fork() (file-backed memory normally drops ptes
1542 * directly when fork), meanwhile the pagemap test will verify
1543 * pgtable entry of fork()ed child.
1544 */
1545 userfaultfd_pagemap_test(page_size);
1546 /*
1547 * Hard-code for x86_64 for now for 2M THP, as x86_64 is
1548 * currently the only one that supports uffd-wp
1549 */
1550 userfaultfd_pagemap_test(page_size * 512);
1551 }
1552
1553 return userfaultfd_zeropage_test() || userfaultfd_sig_test()
1554 || userfaultfd_events_test() || userfaultfd_minor_test();
1555}
1556
1557/*
1558 * Copied from mlock2-tests.c
1559 */
/*
 * Parse /proc/meminfo for the kernel's default hugetlb page size.
 * Returns the size in bytes, or 0 if it cannot be determined.
 */
unsigned long default_huge_page_size(void)
{
	unsigned long hps = 0;
	size_t linelen = 0;
	char *line = NULL;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 0;

	for (;;) {
		if (getline(&line, &linelen, f) <= 0)
			break;
		if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
			hps <<= 10;	/* kB -> bytes */
			break;
		}
	}

	free(line);
	fclose(f);
	return hps;
}
1580
/*
 * Translate the command-line test type into the global test
 * configuration: test_type, uffd_test_ops, map_shared and which
 * optional modes (uffd-wp, minor faults) to attempt.
 *
 * Also determines page_size (the huge page size for hugetlb tests)
 * and then probes the kernel via userfaultfd_open() so unsupported
 * features are masked off.  Ordering matters: page_size must be known
 * before the area_count() capacity check below.
 */
static void set_test_type(const char *type)
{
	uint64_t features = UFFD_API_FEATURES;

	if (!strcmp(type, "anon")) {
		test_type = TEST_ANON;
		uffd_test_ops = &anon_uffd_test_ops;
		/* Only enable write-protect test for anonymous test */
		test_uffdio_wp = true;
	} else if (!strcmp(type, "hugetlb")) {
		test_type = TEST_HUGETLB;
		uffd_test_ops = &hugetlb_uffd_test_ops;
	} else if (!strcmp(type, "hugetlb_shared")) {
		map_shared = true;
		test_type = TEST_HUGETLB;
		uffd_test_ops = &hugetlb_uffd_test_ops;
		/* Minor faults require shared hugetlb; only enable here. */
		test_uffdio_minor = true;
	} else if (!strcmp(type, "shmem")) {
		map_shared = true;
		test_type = TEST_SHMEM;
		uffd_test_ops = &shmem_uffd_test_ops;
		test_uffdio_minor = true;
	} else {
		err("Unknown test type: %s", type);
	}

	/* hugetlb tests operate in units of the default huge page size */
	if (test_type == TEST_HUGETLB)
		page_size = default_huge_page_size();
	else
		page_size = sysconf(_SC_PAGE_SIZE);

	if (!page_size)
		err("Unable to determine page size");
	/* Each page must hold the two per-page bookkeeping counters */
	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
	    > page_size)
		err("Impossible to run this test");

	/*
	 * Whether we can test certain features depends not just on test type,
	 * but also on whether or not this particular kernel supports the
	 * feature.
	 */

	userfaultfd_open(&features);

	test_uffdio_wp = test_uffdio_wp &&
		(features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
	test_uffdio_minor = test_uffdio_minor &&
		(features & uffd_minor_feature());

	/* This probe fd is discarded; tests open their own uffd later */
	close(uffd);
	uffd = -1;
}
1635
1636static void sigalrm(int sig)
1637{
1638 if (sig != SIGALRM)
1639 abort();
1640 test_uffdio_copy_eexist = true;
1641 test_uffdio_zeropage_eexist = true;
1642 alarm(ALARM_INTERVAL_SECS);
1643}
1644
/*
 * Usage: userfaultfd <type> <MiB> <bounces> [hugetlb file]
 *   argv[1]: anon | hugetlb | hugetlb_shared | shmem
 *   argv[2]: total test size in MiB (split across all online CPUs)
 *   argv[3]: number of bounce passes
 *   argv[4]: backing file path (required for hugetlb tests only)
 */
int main(int argc, char **argv)
{
	if (argc < 4)
		usage();

	/* Periodic alarm arms the -EEXIST retry paths (see sigalrm()) */
	if (signal(SIGALRM, sigalrm) == SIG_ERR)
		err("failed to arm SIGALRM");
	alarm(ALARM_INTERVAL_SECS);

	set_test_type(argv[1]);

	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	/* MiB -> pages, divided evenly across CPUs */
	nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
		nr_cpus;
	if (!nr_pages_per_cpu) {
		_err("invalid MiB");
		usage();
	}

	bounces = atoi(argv[3]);
	if (bounces <= 0) {
		_err("invalid bounces");
		usage();
	}
	nr_pages = nr_pages_per_cpu * nr_cpus;

	if (test_type == TEST_HUGETLB) {
		if (argc < 5)
			usage();
		huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
		if (huge_fd < 0)
			err("Open of %s failed", argv[4]);
		if (ftruncate(huge_fd, 0))
			err("ftruncate %s to size 0 failed", argv[4]);
	} else if (test_type == TEST_SHMEM) {
		shm_fd = memfd_create(argv[0], 0);
		if (shm_fd < 0)
			err("memfd_create");
		/*
		 * Size x2 — presumably one half each for the src and dst
		 * areas; confirm against uffd_test_ctx_init()'s allocator.
		 */
		if (ftruncate(shm_fd, nr_pages * page_size * 2))
			err("ftruncate");
		/* Start with no pages allocated (hole-punch everything) */
		if (fallocate(shm_fd,
			      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
			      nr_pages * page_size * 2))
			err("fallocate");
	}
	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
	       nr_pages, nr_pages_per_cpu);
	return userfaultfd_stress();
}
1694
1695#else /* __NR_userfaultfd */
1696
1697#warning "missing __NR_userfaultfd definition"
1698
1699int main(void)
1700{
1701 printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
1702 return KSFT_SKIP;
1703}
1704
1705#endif /* __NR_userfaultfd */