// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stress userfaultfd syscall.
 *
 * Copyright (C) 2015 Red Hat, Inc.
 *
 * This test allocates two virtual areas and bounces the physical
 * memory across the two virtual areas (from area_src to area_dst)
 * using userfaultfd.
 *
 * There are three threads running per CPU:
 *
 * 1) one per-CPU thread takes a per-page pthread_mutex in a random
 *    page of area_dst (while the physical page may still be in
 *    area_src), increments a per-page counter in the same page,
 *    and checks its value against a verification region.
 *
 * 2) another per-CPU thread handles the userfaults generated by
 *    thread 1 above. userfaultfd blocking reads and poll() modes are
 *    both exercised, interleaved across bounces.
 *
 * 3) one last per-CPU thread transfers the memory in the background
 *    at maximum bandwidth (if not already transferred by thread
 *    2). Each CPU thread takes care of transferring a portion of the
 *    area.
 *
 * When all threads of type 3 completed the transfer, one bounce is
 * complete. area_src and area_dst are then swapped. All threads are
 * respawned and so the bounce is immediately restarted in the
 * opposite direction.
 *
 * The per-CPU threads of type 1, by triggering userfaults inside
 * pthread_mutex_lock, will also verify the atomicity of the memory
 * transfer (UFFDIO_COPY).
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <signal.h>
#include <poll.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <pthread.h>
#include <linux/userfaultfd.h>
#include <setjmp.h>
#include <stdbool.h>
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>

#include "../kselftest.h"

#ifdef __NR_userfaultfd

static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;

#define BOUNCE_RANDOM		(1<<0)
#define BOUNCE_RACINGFAULTS	(1<<1)
#define BOUNCE_VERIFY		(1<<2)
#define BOUNCE_POLL		(1<<3)
static int bounces;

#define TEST_ANON	1
#define TEST_HUGETLB	2
#define TEST_SHMEM	3
static int test_type;

/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
#define ALARM_INTERVAL_SECS 10
static volatile bool test_uffdio_copy_eexist = true;
static volatile bool test_uffdio_zeropage_eexist = true;
/* Whether to test uffd write-protection */
static bool test_uffdio_wp = false;
/* Whether to test uffd minor faults */
static bool test_uffdio_minor = false;

static bool map_shared;
static int shm_fd;
static int huge_fd;
static char *huge_fd_off0;
static unsigned long long *count_verify;
static int uffd = -1;
static int uffd_flags, finished, *pipefd;
static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
static char *zeropage;
pthread_attr_t attr;

/* Userfaultfd test statistics */
struct uffd_stats {
	int cpu;
	unsigned long missing_faults;
	unsigned long wp_faults;
	unsigned long minor_faults;
};

/* pthread_mutex_t starts at page offset 0 */
#define area_mutex(___area, ___nr)					\
	((pthread_mutex_t *) ((___area) + (___nr)*page_size))
/*
 * count is placed in the page after pthread_mutex_t naturally aligned
 * to avoid misalignment faults on non-x86 archs.
 */
#define area_count(___area, ___nr)					\
	((volatile unsigned long long *) ((unsigned long)		\
				 ((___area) + (___nr)*page_size +	\
				  sizeof(pthread_mutex_t) +		\
				  sizeof(unsigned long long) - 1) &	\
				 ~(unsigned long)(sizeof(unsigned long long) \
						  - 1)))

const char *examples =
	"# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
	"./userfaultfd anon 100 99999\n\n"
	"# Run shared memory test on 1GiB region with 99 bounces:\n"
	"./userfaultfd shmem 1000 99\n\n"
	"# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
	"./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
	"# Run the same hugetlb test but using shmem:\n"
	"./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
	"# 10MiB-~6GiB 999 bounces anonymous test, "
	"continue forever unless an error triggers\n"
	"while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";

static void usage(void)
{
	fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
		"[hugetlbfs_file]\n\n");
	fprintf(stderr, "Supported <test type>: anon, hugetlb, "
		"hugetlb_shared, shmem\n\n");
	fprintf(stderr, "Examples:\n\n");
	fprintf(stderr, "%s", examples);
	exit(1);
}

#define _err(fmt, ...)						\
	do {							\
		int ret = errno;				\
		fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__);	\
		fprintf(stderr, " (errno=%d, line=%d)\n",	\
			ret, __LINE__);				\
	} while (0)

#define err(fmt, ...)				\
	do {					\
		_err(fmt, ##__VA_ARGS__);	\
		exit(1);			\
	} while (0)

static void uffd_stats_reset(struct uffd_stats *uffd_stats,
			     unsigned long n_cpus)
{
	int i;

	for (i = 0; i < n_cpus; i++) {
		uffd_stats[i].cpu = i;
		uffd_stats[i].missing_faults = 0;
		uffd_stats[i].wp_faults = 0;
		uffd_stats[i].minor_faults = 0;
	}
}

static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
{
	int i;
	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;

	for (i = 0; i < n_cpus; i++) {
		miss_total += stats[i].missing_faults;
		wp_total += stats[i].wp_faults;
		minor_total += stats[i].minor_faults;
	}

	printf("userfaults: ");
	if (miss_total) {
		printf("%llu missing (", miss_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", stats[i].missing_faults);
		printf("\b) ");
	}
	if (wp_total) {
		printf("%llu wp (", wp_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", stats[i].wp_faults);
		printf("\b) ");
	}
	if (minor_total) {
		printf("%llu minor (", minor_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", stats[i].minor_faults);
		printf("\b)");
	}
	printf("\n");
}

static void anon_release_pages(char *rel_area)
{
	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
		err("madvise(MADV_DONTNEED) failed");
}

static void anon_allocate_area(void **alloc_area)
{
	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (*alloc_area == MAP_FAILED)
		err("mmap of anonymous memory failed");
}

static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
}

static void hugetlb_release_pages(char *rel_area)
{
	if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      rel_area == huge_fd_off0 ? 0 : nr_pages * page_size,
		      nr_pages * page_size))
		err("fallocate() failed");
}

static void hugetlb_allocate_area(void **alloc_area)
{
	void *area_alias = NULL;
	char **alloc_area_alias;

	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
			   MAP_HUGETLB,
			   huge_fd, *alloc_area == area_src ? 0 :
			   nr_pages * page_size);
	if (*alloc_area == MAP_FAILED)
		err("mmap of hugetlbfs file failed");

	if (map_shared) {
		area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
				  MAP_SHARED | MAP_HUGETLB,
				  huge_fd, *alloc_area == area_src ? 0 :
				  nr_pages * page_size);
		if (area_alias == MAP_FAILED)
			err("mmap of hugetlb file alias failed");
	}

	if (*alloc_area == area_src) {
		huge_fd_off0 = *alloc_area;
		alloc_area_alias = &area_src_alias;
	} else {
		alloc_area_alias = &area_dst_alias;
	}
	if (area_alias)
		*alloc_area_alias = area_alias;
}

static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	if (!map_shared)
		return;
	/*
	 * We can't zap just the pagetable with hugetlbfs because
	 * MADV_DONTNEED won't work. So exercise -EEXIST on an alias
	 * mapping where the pagetables are not established initially,
	 * this way we'll exercise the -EEXIST at the fs level.
	 */
	*start = (unsigned long) area_dst_alias + offset;
}

static void shmem_release_pages(char *rel_area)
{
	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
		err("madvise(MADV_REMOVE) failed");
}

static void shmem_allocate_area(void **alloc_area)
{
	void *area_alias = NULL;
	bool is_src = alloc_area == (void **)&area_src;
	unsigned long offset = is_src ? 0 : nr_pages * page_size;

	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
			   MAP_SHARED, shm_fd, offset);
	if (*alloc_area == MAP_FAILED)
		err("mmap of memfd failed");

	area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
			  MAP_SHARED, shm_fd, offset);
	if (area_alias == MAP_FAILED)
		err("mmap of memfd alias failed");

	if (is_src)
		area_src_alias = area_alias;
	else
		area_dst_alias = area_alias;
}

static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	*start = (unsigned long)area_dst_alias + offset;
}

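/*
 * Per-memory-type operations: how to allocate the two test areas, how to
 * zap their pages between bounces, and how to translate an address into
 * the alias mapping used to exercise the -EEXIST retry paths.
 */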
struct uffd_test_ops {
	unsigned long expected_ioctls;
	void (*allocate_area)(void **alloc_area);
	void (*release_pages)(char *rel_area);
	void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
};

#define SHMEM_EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
					 (1 << _UFFDIO_COPY) | \
					 (1 << _UFFDIO_ZEROPAGE))

#define ANON_EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
					 (1 << _UFFDIO_COPY) | \
					 (1 << _UFFDIO_ZEROPAGE) | \
					 (1 << _UFFDIO_WRITEPROTECT))

static struct uffd_test_ops anon_uffd_test_ops = {
	.expected_ioctls = ANON_EXPECTED_IOCTLS,
	.allocate_area = anon_allocate_area,
	.release_pages = anon_release_pages,
	.alias_mapping = noop_alias_mapping,
};

static struct uffd_test_ops shmem_uffd_test_ops = {
	.expected_ioctls = SHMEM_EXPECTED_IOCTLS,
	.allocate_area = shmem_allocate_area,
	.release_pages = shmem_release_pages,
	.alias_mapping = shmem_alias_mapping,
};

static struct uffd_test_ops hugetlb_uffd_test_ops = {
	.expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE),
	.allocate_area = hugetlb_allocate_area,
	.release_pages = hugetlb_release_pages,
	.alias_mapping = hugetlb_alias_mapping,
};

static struct uffd_test_ops *uffd_test_ops;

static void userfaultfd_open(uint64_t *features)
{
	struct uffdio_api uffdio_api;

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
	if (uffd < 0)
		err("userfaultfd syscall not available in this kernel");
	uffd_flags = fcntl(uffd, F_GETFD, NULL);

	uffdio_api.api = UFFD_API;
	uffdio_api.features = *features;
	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
		err("UFFDIO_API failed.\nPlease make sure to "
		    "run with either root or ptrace capability.");
	if (uffdio_api.api != UFFD_API)
		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);

	*features = uffdio_api.features;
}

static inline void munmap_area(void **area)
{
	if (*area)
		if (munmap(*area, nr_pages * page_size))
			err("munmap");

	*area = NULL;
}

static void uffd_test_ctx_clear(void)
{
	size_t i;

	if (pipefd) {
		for (i = 0; i < nr_cpus * 2; ++i) {
			if (close(pipefd[i]))
				err("close pipefd");
		}
		free(pipefd);
		pipefd = NULL;
	}

	if (count_verify) {
		free(count_verify);
		count_verify = NULL;
	}

	if (uffd != -1) {
		if (close(uffd))
			err("close uffd");
		uffd = -1;
	}

	huge_fd_off0 = NULL;
	munmap_area((void **)&area_src);
	munmap_area((void **)&area_src_alias);
	munmap_area((void **)&area_dst);
	munmap_area((void **)&area_dst_alias);
}

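/*
 * Tear down any previous test context, then rebuild it: allocate fresh
 * src/dst areas, open a new userfaultfd with the requested features, and
 * initialize the per-page mutexes, counters and per-CPU wakeup pipes.
 */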
static void uffd_test_ctx_init_ext(uint64_t *features)
{
	unsigned long nr, cpu;

	uffd_test_ctx_clear();

	uffd_test_ops->allocate_area((void **)&area_src);
	uffd_test_ops->allocate_area((void **)&area_dst);

	uffd_test_ops->release_pages(area_src);
	uffd_test_ops->release_pages(area_dst);

	userfaultfd_open(features);

	count_verify = malloc(nr_pages * sizeof(unsigned long long));
	if (!count_verify)
		err("count_verify");

	for (nr = 0; nr < nr_pages; nr++) {
		*area_mutex(area_src, nr) =
			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
		count_verify[nr] = *area_count(area_src, nr) = 1;
		/*
		 * In the transition from 255 to 256, powerpc will
		 * read out of order in my_bcmp and see both bytes as
		 * zero, so leave a placeholder below always non-zero
		 * after the count, to avoid my_bcmp triggering false
		 * positives.
		 */
		*(area_count(area_src, nr) + 1) = 1;
	}

	pipefd = malloc(sizeof(int) * nr_cpus * 2);
	if (!pipefd)
		err("pipefd");
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
			err("pipe");
}

static inline void uffd_test_ctx_init(uint64_t features)
{
	uffd_test_ctx_init_ext(&features);
}

static int my_bcmp(char *str1, char *str2, size_t n)
{
	unsigned long i;
	for (i = 0; i < n; i++)
		if (str1[i] != str2[i])
			return 1;
	return 0;
}

static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect prms;

	/* Write protection page faults */
	prms.range.start = start;
	prms.range.len = len;
	/* Undo write-protect, do wakeup after that */
	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
		err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
}

static void continue_range(int ufd, __u64 start, __u64 len)
{
	struct uffdio_continue req;
	int ret;

	req.range.start = start;
	req.range.len = len;
	req.mode = 0;

	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
		    (uint64_t)start);

	/*
	 * Error handling within the kernel for continue is subtly different
	 * from copy or zeropage, so it may be a source of bugs. Trigger an
	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
	 */
	req.mapped = 0;
	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
	if (ret >= 0 || req.mapped != -EEXIST)
		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
		    ret, (int64_t) req.mapped);
}

static void *locking_thread(void *arg)
{
	unsigned long cpu = (unsigned long) arg;
	struct random_data rand;
	unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
	int32_t rand_nr;
	unsigned long long count;
	char randstate[64];
	unsigned int seed;

	if (bounces & BOUNCE_RANDOM) {
		seed = (unsigned int) time(NULL) - bounces;
		if (!(bounces & BOUNCE_RACINGFAULTS))
			seed += cpu;
		bzero(&rand, sizeof(rand));
		bzero(&randstate, sizeof(randstate));
		if (initstate_r(seed, randstate, sizeof(randstate), &rand))
			err("initstate_r failed");
	} else {
		page_nr = -bounces;
		if (!(bounces & BOUNCE_RACINGFAULTS))
			page_nr += cpu * nr_pages_per_cpu;
	}

	while (!finished) {
		if (bounces & BOUNCE_RANDOM) {
			if (random_r(&rand, &rand_nr))
				err("random_r failed");
			page_nr = rand_nr;
			if (sizeof(page_nr) > sizeof(rand_nr)) {
				if (random_r(&rand, &rand_nr))
					err("random_r failed");
				page_nr |= (((unsigned long) rand_nr) << 16) <<
					   16;
			}
		} else
			page_nr += 1;
		page_nr %= nr_pages;
		pthread_mutex_lock(area_mutex(area_dst, page_nr));
		count = *area_count(area_dst, page_nr);
		if (count != count_verify[page_nr])
			err("page_nr %lu memory corruption %llu %llu",
			    page_nr, count, count_verify[page_nr]);
		count++;
		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
		pthread_mutex_unlock(area_mutex(area_dst, page_nr));
	}

	return NULL;
}

static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
			    unsigned long offset)
{
	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
				     uffdio_copy->len,
				     offset);
	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
		/* real retval in uffdio_copy.copy */
		if (uffdio_copy->copy != -EEXIST)
			err("UFFDIO_COPY retry error: %"PRId64,
			    (int64_t)uffdio_copy->copy);
	} else {
		err("UFFDIO_COPY retry unexpected: %"PRId64,
		    (int64_t)uffdio_copy->copy);
	}
}

static void wake_range(int ufd, unsigned long addr, unsigned long len)
{
	struct uffdio_range uffdio_wake;

	uffdio_wake.start = addr;
	uffdio_wake.len = len;

	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
		fprintf(stderr, "error waking %lu\n",
			addr), exit(1);
}

static int __copy_page(int ufd, unsigned long offset, bool retry)
{
	struct uffdio_copy uffdio_copy;

	if (offset >= nr_pages * page_size)
		err("unexpected offset %lu\n", offset);
	uffdio_copy.dst = (unsigned long) area_dst + offset;
	uffdio_copy.src = (unsigned long) area_src + offset;
	uffdio_copy.len = page_size;
	if (test_uffdio_wp)
		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
	else
		uffdio_copy.mode = 0;
	uffdio_copy.copy = 0;
	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
		/* real retval in uffdio_copy.copy */
		if (uffdio_copy.copy != -EEXIST)
			err("UFFDIO_COPY error: %"PRId64,
			    (int64_t)uffdio_copy.copy);
		wake_range(ufd, uffdio_copy.dst, page_size);
	} else if (uffdio_copy.copy != page_size) {
		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
	} else {
		if (test_uffdio_copy_eexist && retry) {
			test_uffdio_copy_eexist = false;
			retry_copy_page(ufd, &uffdio_copy, offset);
		}
		return 1;
	}
	return 0;
}

static int copy_page_retry(int ufd, unsigned long offset)
{
	return __copy_page(ufd, offset, true);
}

static int copy_page(int ufd, unsigned long offset)
{
	return __copy_page(ufd, offset, false);
}

static int uffd_read_msg(int ufd, struct uffd_msg *msg)
{
	int ret = read(uffd, msg, sizeof(*msg));

	if (ret != sizeof(*msg)) {
		if (ret < 0) {
			if (errno == EAGAIN)
				return 1;
			err("blocking read error");
		} else {
			err("short read");
		}
	}

	return 0;
}

static void uffd_handle_page_fault(struct uffd_msg *msg,
				   struct uffd_stats *stats)
{
	unsigned long offset;

	if (msg->event != UFFD_EVENT_PAGEFAULT)
		err("unexpected msg event %u", msg->event);

	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
		/* Write protect page faults */
		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
		stats->wp_faults++;
	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
		uint8_t *area;
		int b;

		/*
		 * Minor page faults
		 *
		 * To prove we can modify the original range for testing
		 * purposes, we're going to bit flip this range before
		 * continuing.
		 *
		 * Note that this requires all minor page fault tests operate on
		 * area_dst (non-UFFD-registered) and area_dst_alias
		 * (UFFD-registered).
		 */

		area = (uint8_t *)(area_dst +
				   ((char *)msg->arg.pagefault.address -
				    area_dst_alias));
		for (b = 0; b < page_size; ++b)
			area[b] = ~area[b];
		continue_range(uffd, msg->arg.pagefault.address, page_size);
		stats->minor_faults++;
	} else {
		/* Missing page faults */
		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
			err("unexpected write fault");

		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
		offset &= ~(page_size-1);

		if (copy_page(uffd, offset))
			stats->missing_faults++;
	}
}

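/*
 * Fault-handling thread, poll() flavour: wait on the uffd and on the
 * per-CPU pipe, resolve pagefaults via uffd_handle_page_fault(), and also
 * service fork/remove/remap events for the non-cooperative tests. The
 * thread exits once a byte is written to its pipe.
 */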
static void *uffd_poll_thread(void *arg)
{
	struct uffd_stats *stats = (struct uffd_stats *)arg;
	unsigned long cpu = stats->cpu;
	struct pollfd pollfd[2];
	struct uffd_msg msg;
	struct uffdio_register uffd_reg;
	int ret;
	char tmp_chr;

	pollfd[0].fd = uffd;
	pollfd[0].events = POLLIN;
	pollfd[1].fd = pipefd[cpu*2];
	pollfd[1].events = POLLIN;

	for (;;) {
		ret = poll(pollfd, 2, -1);
		if (ret <= 0)
			err("poll error: %d", ret);
		if (pollfd[1].revents & POLLIN) {
			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
				err("read pipefd error");
			break;
		}
		if (!(pollfd[0].revents & POLLIN))
			err("pollfd[0].revents %d", pollfd[0].revents);
		if (uffd_read_msg(uffd, &msg))
			continue;
		switch (msg.event) {
		default:
			err("unexpected msg event %u\n", msg.event);
			break;
		case UFFD_EVENT_PAGEFAULT:
			uffd_handle_page_fault(&msg, stats);
			break;
		case UFFD_EVENT_FORK:
			close(uffd);
			uffd = msg.arg.fork.ufd;
			pollfd[0].fd = uffd;
			break;
		case UFFD_EVENT_REMOVE:
			uffd_reg.range.start = msg.arg.remove.start;
			uffd_reg.range.len = msg.arg.remove.end -
				msg.arg.remove.start;
			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
				err("remove failure");
			break;
		case UFFD_EVENT_REMAP:
			area_dst = (char *)(unsigned long)msg.arg.remap.to;
			break;
		}
	}

	return NULL;
}

pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;

static void *uffd_read_thread(void *arg)
{
	struct uffd_stats *stats = (struct uffd_stats *)arg;
	struct uffd_msg msg;

	pthread_mutex_unlock(&uffd_read_mutex);
	/* from here cancellation is ok */

	for (;;) {
		if (uffd_read_msg(uffd, &msg))
			continue;
		uffd_handle_page_fault(&msg, stats);
	}

	return NULL;
}

static void *background_thread(void *arg)
{
	unsigned long cpu = (unsigned long) arg;
	unsigned long page_nr, start_nr, mid_nr, end_nr;

	start_nr = cpu * nr_pages_per_cpu;
	end_nr = (cpu+1) * nr_pages_per_cpu;
	mid_nr = (start_nr + end_nr) / 2;

	/* Copy the first half of the pages */
	for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
		copy_page_retry(uffd, page_nr * page_size);

	/*
	 * If we need to test uffd-wp, set it up now. Then we'll have
	 * at least the first half of the pages mapped already which
	 * can be write-protected for testing
	 */
	if (test_uffdio_wp)
		wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
			nr_pages_per_cpu * page_size, true);

	/*
	 * Continue the 2nd half of the page copying, handling write
	 * protection faults if any
	 */
	for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
		copy_page_retry(uffd, page_nr * page_size);

	return NULL;
}

static int stress(struct uffd_stats *uffd_stats)
{
	unsigned long cpu;
	pthread_t locking_threads[nr_cpus];
	pthread_t uffd_threads[nr_cpus];
	pthread_t background_threads[nr_cpus];

	finished = 0;
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (pthread_create(&locking_threads[cpu], &attr,
				   locking_thread, (void *)cpu))
			return 1;
		if (bounces & BOUNCE_POLL) {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_poll_thread,
					   (void *)&uffd_stats[cpu]))
				return 1;
		} else {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_read_thread,
					   (void *)&uffd_stats[cpu]))
				return 1;
			pthread_mutex_lock(&uffd_read_mutex);
		}
		if (pthread_create(&background_threads[cpu], &attr,
				   background_thread, (void *)cpu))
			return 1;
	}
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(background_threads[cpu], NULL))
			return 1;

	/*
	 * Be strict and immediately zap area_src, the whole area has
	 * been transferred already by the background threads. The
	 * area_src could then be faulted in in a racy way by still
	 * running uffdio_threads reading zeropages after we zapped
	 * area_src (but they're guaranteed to get -EEXIST from
	 * UFFDIO_COPY without writing zero pages into area_dst
	 * because the background threads already completed).
	 */
	uffd_test_ops->release_pages(area_src);

	finished = 1;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(locking_threads[cpu], NULL))
			return 1;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		char c;
		if (bounces & BOUNCE_POLL) {
			if (write(pipefd[cpu*2+1], &c, 1) != 1)
				err("pipefd write error");
			if (pthread_join(uffd_threads[cpu],
					 (void *)&uffd_stats[cpu]))
				return 1;
		} else {
			if (pthread_cancel(uffd_threads[cpu]))
				return 1;
			if (pthread_join(uffd_threads[cpu], NULL))
				return 1;
		}
	}

	return 0;
}

sigjmp_buf jbuf, *sigbuf;

static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
{
	if (sig == SIGBUS) {
		if (sigbuf)
			siglongjmp(*sigbuf, 1);
		abort();
	}
}

/*
 * For the non-cooperative userfaultfd test we fork() a process that will
 * generate pagefaults, will mremap the area monitored by the
 * userfaultfd and at last this process will release the monitored
 * area.
 * For the anonymous and shared memory the area is divided into two
 * parts, the first part is accessed before mremap, and the second
 * part is accessed after mremap. Since hugetlbfs does not support
 * mremap, the entire monitored area is accessed in a single pass for
 * HUGETLB_TEST.
 * The release of the pages currently generates an event for shmem and
 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
 * for hugetlb.
 * For the signal test (UFFD_FEATURE_SIGBUS), signal_test = 1: we register
 * the monitored area, generate pagefaults and test that the signal is
 * delivered. Use UFFDIO_COPY to allocate the missing page and retry. For
 * signal_test = 2 (robustness use case) we release the monitored area,
 * fork a process that will generate pagefaults and verify the signal is
 * generated. This also tests the UFFD_FEATURE_EVENT_FORK event along with
 * the signal feature. Using the monitor thread, verify no userfault events
 * are generated.
 */
static int faulting_process(int signal_test)
{
	unsigned long nr;
	unsigned long long count;
	unsigned long split_nr_pages;
	unsigned long lastnr;
	struct sigaction act;
	unsigned long signalled = 0;

	if (test_type != TEST_HUGETLB)
		split_nr_pages = (nr_pages + 1) / 2;
	else
		split_nr_pages = nr_pages;

	if (signal_test) {
		sigbuf = &jbuf;
		memset(&act, 0, sizeof(act));
		act.sa_sigaction = sighndl;
		act.sa_flags = SA_SIGINFO;
		if (sigaction(SIGBUS, &act, 0))
			err("sigaction");
		lastnr = (unsigned long)-1;
	}

	for (nr = 0; nr < split_nr_pages; nr++) {
		int steps = 1;
		unsigned long offset = nr * page_size;

		if (signal_test) {
			if (sigsetjmp(*sigbuf, 1) != 0) {
				if (steps == 1 && nr == lastnr)
					err("Signal repeated");

				lastnr = nr;
				if (signal_test == 1) {
					if (steps == 1) {
						/* This is a MISSING request */
						steps++;
						if (copy_page(uffd, offset))
							signalled++;
					} else {
						/* This is a WP request */
						assert(steps == 2);
						wp_range(uffd,
							 (__u64)area_dst +
							 offset,
							 page_size, false);
					}
				} else {
					signalled++;
					continue;
				}
			}
		}

		count = *area_count(area_dst, nr);
		if (count != count_verify[nr])
			err("nr %lu memory corruption %llu %llu\n",
			    nr, count, count_verify[nr]);
		/*
		 * Trigger write protection (if it is armed) by writing
		 * the same value back.
		 */
		*area_count(area_dst, nr) = count;
	}

	if (signal_test)
		return signalled != split_nr_pages;

	if (test_type == TEST_HUGETLB)
		return 0;

	area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size,
			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
	if (area_dst == MAP_FAILED)
		err("mremap");
	/* Reset area_src since we just clobbered it */
	area_src = NULL;

	for (; nr < nr_pages; nr++) {
		count = *area_count(area_dst, nr);
		if (count != count_verify[nr]) {
			err("nr %lu memory corruption %llu %llu\n",
			    nr, count, count_verify[nr]);
		}
		/*
		 * Trigger write protection (if it is armed) by writing
		 * the same value back.
		 */
		*area_count(area_dst, nr) = count;
	}

	uffd_test_ops->release_pages(area_dst);

	for (nr = 0; nr < nr_pages; nr++)
		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
			err("nr %lu is not zero", nr);

	return 0;
}

static void retry_uffdio_zeropage(int ufd,
				  struct uffdio_zeropage *uffdio_zeropage,
				  unsigned long offset)
{
	uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
				     uffdio_zeropage->range.len,
				     offset);
	if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
		if (uffdio_zeropage->zeropage != -EEXIST)
			err("UFFDIO_ZEROPAGE error: %"PRId64,
			    (int64_t)uffdio_zeropage->zeropage);
	} else {
		err("UFFDIO_ZEROPAGE error: %"PRId64,
		    (int64_t)uffdio_zeropage->zeropage);
	}
}

static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
{
	struct uffdio_zeropage uffdio_zeropage;
	int ret;
	unsigned long has_zeropage;
	__s64 res;

	has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);

	if (offset >= nr_pages * page_size)
		err("unexpected offset %lu", offset);
	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
	uffdio_zeropage.range.len = page_size;
	uffdio_zeropage.mode = 0;
	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
	res = uffdio_zeropage.zeropage;
	if (ret) {
		/* real retval in uffdio_zeropage.zeropage */
		if (has_zeropage)
			err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
		else if (res != -EINVAL)
			err("UFFDIO_ZEROPAGE not -EINVAL");
	} else if (has_zeropage) {
		if (res != page_size) {
			err("UFFDIO_ZEROPAGE unexpected size");
		} else {
			if (test_uffdio_zeropage_eexist && retry) {
				test_uffdio_zeropage_eexist = false;
				retry_uffdio_zeropage(ufd, &uffdio_zeropage,
						      offset);
			}
			return 1;
		}
	} else
		err("UFFDIO_ZEROPAGE succeeded");

	return 0;
}

static int uffdio_zeropage(int ufd, unsigned long offset)
{
	return __uffdio_zeropage(ufd, offset, false);
}

/* exercise UFFDIO_ZEROPAGE */
static int userfaultfd_zeropage_test(void)
{
	struct uffdio_register uffdio_register;
	unsigned long expected_ioctls;

	printf("testing UFFDIO_ZEROPAGE: ");
	fflush(stdout);

	uffd_test_ctx_init(0);

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (test_uffdio_wp)
		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		err("register failure");

	expected_ioctls = uffd_test_ops->expected_ioctls;
	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
		err("unexpected missing ioctl for anon memory");

	if (uffdio_zeropage(uffd, 0))
		if (my_bcmp(area_dst, zeropage, page_size))
			err("zeropage is not zero");

	printf("done.\n");
	return 0;
}

static int userfaultfd_events_test(void)
{
	struct uffdio_register uffdio_register;
	unsigned long expected_ioctls;
	pthread_t uffd_mon;
	int err, features;
	pid_t pid;
	char c;
	struct uffd_stats stats = { 0 };

	printf("testing events (fork, remap, remove): ");
	fflush(stdout);

	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
		UFFD_FEATURE_EVENT_REMOVE;
	uffd_test_ctx_init(features);

	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (test_uffdio_wp)
		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		err("register failure");

	expected_ioctls = uffd_test_ops->expected_ioctls;
	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
		err("unexpected missing ioctl for anon memory");

	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
		err("uffd_poll_thread create");

	pid = fork();
	if (pid < 0)
		err("fork");

	if (!pid)
		exit(faulting_process(0));

	waitpid(pid, &err, 0);
	if (err)
		err("faulting process failed");
	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
		err("pipe write");
	if (pthread_join(uffd_mon, NULL))
		return 1;

	uffd_stats_report(&stats, 1);

	return stats.missing_faults != nr_pages;
}

static int userfaultfd_sig_test(void)
{
	struct uffdio_register uffdio_register;
	unsigned long expected_ioctls;
	unsigned long userfaults;
	pthread_t uffd_mon;
	int err, features;
	pid_t pid;
	char c;
	struct uffd_stats stats = { 0 };

	printf("testing signal delivery: ");
	fflush(stdout);

	features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
	uffd_test_ctx_init(features);

	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (test_uffdio_wp)
		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		err("register failure");

	expected_ioctls = uffd_test_ops->expected_ioctls;
	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
		err("unexpected missing ioctl for anon memory");

	if (faulting_process(1))
		err("faulting process failed");

	uffd_test_ops->release_pages(area_dst);

	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
		err("uffd_poll_thread create");

	pid = fork();
	if (pid < 0)
		err("fork");

	if (!pid)
		exit(faulting_process(2));

	waitpid(pid, &err, 0);
	if (err)
		err("faulting process failed");
	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
		err("pipe write");
	if (pthread_join(uffd_mon, (void **)&userfaults))
		return 1;

	printf("done.\n");
	if (userfaults)
		err("Signal test failed, userfaults: %ld", userfaults);

	return userfaults != 0;
}

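/*
 * Minor fault test: populate area_dst through the non-registered mapping,
 * then read every page back through the UFFD-registered alias so that each
 * first touch raises a minor fault, which uffd_poll_thread resolves by
 * bit-flipping the page and issuing UFFDIO_CONTINUE.
 */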
static int userfaultfd_minor_test(void)
{
	struct uffdio_register uffdio_register;
	unsigned long expected_ioctls;
	unsigned long p;
	pthread_t uffd_mon;
	uint8_t expected_byte;
	void *expected_page;
	char c;
	struct uffd_stats stats = { 0 };
	uint64_t req_features, features_out;

	if (!test_uffdio_minor)
		return 0;

	printf("testing minor faults: ");
	fflush(stdout);

	if (test_type == TEST_HUGETLB)
		req_features = UFFD_FEATURE_MINOR_HUGETLBFS;
	else if (test_type == TEST_SHMEM)
		req_features = UFFD_FEATURE_MINOR_SHMEM;
	else
		return 1;

	features_out = req_features;
	uffd_test_ctx_init_ext(&features_out);
	/* If kernel reports required features aren't supported, skip test. */
	if ((features_out & req_features) != req_features) {
		printf("skipping test due to lack of feature support\n");
		fflush(stdout);
		return 0;
	}

	uffdio_register.range.start = (unsigned long)area_dst_alias;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		err("register failure");

	expected_ioctls = uffd_test_ops->expected_ioctls;
	expected_ioctls |= 1 << _UFFDIO_CONTINUE;
	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
		err("unexpected missing ioctl(s)");

	/*
	 * After registering with UFFD, populate the non-UFFD-registered side of
	 * the shared mapping. This should *not* trigger any UFFD minor faults.
	 */
	for (p = 0; p < nr_pages; ++p) {
		memset(area_dst + (p * page_size), p % ((uint8_t)-1),
		       page_size);
	}

	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
		err("uffd_poll_thread create");

	/*
	 * Read each of the pages back using the UFFD-registered mapping. We
	 * expect that the first time we touch a page, it will result in a minor
	 * fault. uffd_poll_thread will resolve the fault by bit-flipping the
	 * page's contents, and then issuing a CONTINUE ioctl.
	 */

	if (posix_memalign(&expected_page, page_size, page_size))
		err("out of memory");

	for (p = 0; p < nr_pages; ++p) {
		expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
		memset(expected_page, expected_byte, page_size);
		if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
			    page_size))
			err("unexpected page contents after minor fault");
	}

	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
		err("pipe write");
	if (pthread_join(uffd_mon, NULL))
		return 1;

	uffd_stats_report(&stats, 1);

	return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
}

#define BIT_ULL(nr)		(1ULL << (nr))
#define PM_SOFT_DIRTY		BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
#define PM_UFFD_WP		BIT_ULL(57)
#define PM_FILE			BIT_ULL(61)
#define PM_SWAP			BIT_ULL(62)
#define PM_PRESENT		BIT_ULL(63)

static int pagemap_open(void)
{
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		err("open pagemap");

	return fd;
}

static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
{
	uint64_t value;
	int ret;

	ret = pread(fd, &value, sizeof(uint64_t),
		    ((uint64_t)vaddr >> 12) * sizeof(uint64_t));
	if (ret != sizeof(uint64_t))
		err("pread() on pagemap failed");

	return value;
}

/* This macro lets __LINE__ work in err() */
#define pagemap_check_wp(value, wp) do {				\
		if (!!(value & PM_UFFD_WP) != wp)			\
			err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \
	} while (0)

static int pagemap_test_fork(bool present)
{
	pid_t child = fork();
	uint64_t value;
	int fd, result;

	if (!child) {
		/* Open the pagemap fd of the child itself */
		fd = pagemap_open();
		value = pagemap_read_vaddr(fd, area_dst);
		/*
		 * After fork() uffd-wp bit should be gone as long as we're
		 * without UFFD_FEATURE_EVENT_FORK
		 */
		pagemap_check_wp(value, false);
		/* Succeed */
		exit(0);
	}
	waitpid(child, &result, 0);
	return result;
}

static void userfaultfd_pagemap_test(unsigned int test_pgsize)
{
	struct uffdio_register uffdio_register;
	int pagemap_fd;
	uint64_t value;

	/* Pagemap tests uffd-wp only */
	if (!test_uffdio_wp)
		return;

	/* Not enough memory to test this page size */
	if (test_pgsize > nr_pages * page_size)
		return;

	printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
	/* Flush so it doesn't flush twice in parent/child later */
	fflush(stdout);

	uffd_test_ctx_init(0);

	if (test_pgsize > page_size) {
		/* This is a thp test */
		if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
			err("madvise(MADV_HUGEPAGE) failed");
	} else if (test_pgsize == page_size) {
		/* This is normal page test; force no thp */
		if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
			err("madvise(MADV_NOHUGEPAGE) failed");
	}

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		err("register failed");

	pagemap_fd = pagemap_open();

	/* Touch the page */
	*area_dst = 1;
	wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
	value = pagemap_read_vaddr(pagemap_fd, area_dst);
	pagemap_check_wp(value, true);
	/* Make sure the uffd-wp bit is dropped across fork */
	if (pagemap_test_fork(true))
		err("Detected stale uffd-wp bit in child");

	/* Exclusive required or PAGEOUT won't work */
	if (!(value & PM_MMAP_EXCLUSIVE))
		err("multiple mapping detected: 0x%"PRIx64, value);

	if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
		err("madvise(MADV_PAGEOUT) failed");

	/* Uffd-wp should persist even swapped out */
	value = pagemap_read_vaddr(pagemap_fd, area_dst);
	pagemap_check_wp(value, true);
	/* Make sure the uffd-wp bit is dropped across fork */
	if (pagemap_test_fork(false))
		err("Detected stale uffd-wp bit in child");

	/* Unprotect; this tests swap pte modifications */
	wp_range(uffd, (uint64_t)area_dst, page_size, false);
	value = pagemap_read_vaddr(pagemap_fd, area_dst);
	pagemap_check_wp(value, false);

	/* Fault in the page from disk */
	*area_dst = 2;
	value = pagemap_read_vaddr(pagemap_fd, area_dst);
	pagemap_check_wp(value, false);

	close(pagemap_fd);
	printf("done\n");
}

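/*
 * Main stress driver: every bounce registers area_dst, runs the three
 * per-CPU thread types via stress(), optionally verifies all the per-page
 * counters, then swaps area_src and area_dst and repeats in the opposite
 * direction.
 */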
static int userfaultfd_stress(void)
{
	void *area;
	char *tmp_area;
	unsigned long nr;
	struct uffdio_register uffdio_register;
	struct uffd_stats uffd_stats[nr_cpus];

	uffd_test_ctx_init(0);

	if (posix_memalign(&area, page_size, page_size))
		err("out of memory");
	zeropage = area;
	bzero(zeropage, page_size);

	pthread_mutex_lock(&uffd_read_mutex);

	pthread_attr_init(&attr);
	pthread_attr_setstacksize(&attr, 16*1024*1024);

	while (bounces--) {
		unsigned long expected_ioctls;

		printf("bounces: %d, mode:", bounces);
		if (bounces & BOUNCE_RANDOM)
			printf(" rnd");
		if (bounces & BOUNCE_RACINGFAULTS)
			printf(" racing");
		if (bounces & BOUNCE_VERIFY)
			printf(" ver");
		if (bounces & BOUNCE_POLL)
			printf(" poll");
		else
			printf(" read");
		printf(", ");
		fflush(stdout);

		if (bounces & BOUNCE_POLL)
			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
		else
			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);

		/* register */
		uffdio_register.range.start = (unsigned long) area_dst;
		uffdio_register.range.len = nr_pages * page_size;
		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
		if (test_uffdio_wp)
			uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
			err("register failure");
		expected_ioctls = uffd_test_ops->expected_ioctls;
		if ((uffdio_register.ioctls & expected_ioctls) !=
		    expected_ioctls)
			err("unexpected missing ioctl for anon memory");

		if (area_dst_alias) {
			uffdio_register.range.start = (unsigned long)
				area_dst_alias;
			if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
				err("register failure alias");
		}

		/*
		 * The madvise done previously isn't enough: some
		 * uffd_thread could have read userfaults (one of
		 * those already resolved by the background thread)
		 * and it may be in the process of calling
		 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
		 * area_src and it would map a zero page in it (of
		 * course such a UFFDIO_COPY is perfectly safe as it'd
		 * return -EEXIST). The problem comes at the next
		 * bounce though: that racing UFFDIO_COPY would
		 * generate zeropages in the area_src, invalidating
		 * the previous MADV_DONTNEED. Without this additional
		 * MADV_DONTNEED those zeropage leftovers in the
		 * area_src would lead to -EEXIST failure during the
		 * next bounce, effectively leaving a zeropage in the
		 * area_dst.
		 *
		 * Try commenting out this madvise to see the memory
		 * corruption being caught pretty quickly.
		 *
		 * khugepaged is inhibited from collapsing THPs after
		 * MADV_DONTNEED only once the UFFDIO_REGISTER is done,
		 * which is another reason the MADV_DONTNEED has to
		 * happen here.
		 */
		uffd_test_ops->release_pages(area_dst);

		uffd_stats_reset(uffd_stats, nr_cpus);

		/* bounce pass */
		if (stress(uffd_stats))
			return 1;

		/* Clear all the write protections if there are any */
		if (test_uffdio_wp)
			wp_range(uffd, (unsigned long)area_dst,
				 nr_pages * page_size, false);

		/* unregister */
		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
			err("unregister failure");
		if (area_dst_alias) {
			uffdio_register.range.start = (unsigned long) area_dst;
			if (ioctl(uffd, UFFDIO_UNREGISTER,
				  &uffdio_register.range))
				err("unregister failure alias");
		}

		/* verification */
		if (bounces & BOUNCE_VERIFY)
			for (nr = 0; nr < nr_pages; nr++)
				if (*area_count(area_dst, nr) != count_verify[nr])
					err("error area_count %llu %llu %lu\n",
					    *area_count(area_src, nr),
					    count_verify[nr], nr);

		/* prepare next bounce */
		tmp_area = area_src;
		area_src = area_dst;
		area_dst = tmp_area;

		tmp_area = area_src_alias;
		area_src_alias = area_dst_alias;
		area_dst_alias = tmp_area;

		uffd_stats_report(uffd_stats, nr_cpus);
	}

	if (test_type == TEST_ANON) {
		/*
		 * shmem/hugetlb won't be able to run since they have different
		 * behavior on fork() (file-backed memory normally drops ptes
		 * directly when fork), meanwhile the pagemap test will verify
		 * the pgtable entry of the fork()ed child.
		 */
		userfaultfd_pagemap_test(page_size);
		/*
		 * Hard-code for x86_64 for now for 2M THP, as x86_64 is
		 * currently the only one that supports uffd-wp
		 */
		userfaultfd_pagemap_test(page_size * 512);
	}

	return userfaultfd_zeropage_test() || userfaultfd_sig_test()
		|| userfaultfd_events_test() || userfaultfd_minor_test();
}

/*
 * Copied from mlock2-tests.c
 */
unsigned long default_huge_page_size(void)
{
	unsigned long hps = 0;
	char *line = NULL;
	size_t linelen = 0;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 0;
	while (getline(&line, &linelen, f) > 0) {
		if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
			hps <<= 10;
			break;
		}
	}

	free(line);
	fclose(f);
	return hps;
}

static void set_test_type(const char *type)
{
	if (!strcmp(type, "anon")) {
		test_type = TEST_ANON;
		uffd_test_ops = &anon_uffd_test_ops;
		/* Only enable write-protect test for anonymous test */
		test_uffdio_wp = true;
	} else if (!strcmp(type, "hugetlb")) {
		test_type = TEST_HUGETLB;
		uffd_test_ops = &hugetlb_uffd_test_ops;
	} else if (!strcmp(type, "hugetlb_shared")) {
		map_shared = true;
		test_type = TEST_HUGETLB;
		uffd_test_ops = &hugetlb_uffd_test_ops;
		/* Minor faults require shared hugetlb; only enable here. */
		test_uffdio_minor = true;
	} else if (!strcmp(type, "shmem")) {
		map_shared = true;
		test_type = TEST_SHMEM;
		uffd_test_ops = &shmem_uffd_test_ops;
		test_uffdio_minor = true;
	} else {
		err("Unknown test type: %s", type);
	}

	if (test_type == TEST_HUGETLB)
		page_size = default_huge_page_size();
	else
		page_size = sysconf(_SC_PAGE_SIZE);

	if (!page_size)
		err("Unable to determine page size");
	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
	    > page_size)
		err("Impossible to run this test");
}

static void sigalrm(int sig)
{
	if (sig != SIGALRM)
		abort();
	test_uffdio_copy_eexist = true;
	test_uffdio_zeropage_eexist = true;
	alarm(ALARM_INTERVAL_SECS);
}

int main(int argc, char **argv)
{
	if (argc < 4)
		usage();

	if (signal(SIGALRM, sigalrm) == SIG_ERR)
		err("failed to arm SIGALRM");
	alarm(ALARM_INTERVAL_SECS);

	set_test_type(argv[1]);

	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
		nr_cpus;
	if (!nr_pages_per_cpu) {
		_err("invalid MiB");
		usage();
	}

	bounces = atoi(argv[3]);
	if (bounces <= 0) {
		_err("invalid bounces");
		usage();
	}
	nr_pages = nr_pages_per_cpu * nr_cpus;

	if (test_type == TEST_HUGETLB) {
		if (argc < 5)
			usage();
		huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
		if (huge_fd < 0)
			err("Open of %s failed", argv[4]);
		if (ftruncate(huge_fd, 0))
			err("ftruncate %s to size 0 failed", argv[4]);
	} else if (test_type == TEST_SHMEM) {
		shm_fd = memfd_create(argv[0], 0);
		if (shm_fd < 0)
			err("memfd_create");
		if (ftruncate(shm_fd, nr_pages * page_size * 2))
			err("ftruncate");
		if (fallocate(shm_fd,
			      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
			      nr_pages * page_size * 2))
			err("fallocate");
	}
	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
	       nr_pages, nr_pages_per_cpu);
	return userfaultfd_stress();
}

#else /* __NR_userfaultfd */

#warning "missing __NR_userfaultfd definition"

int main(void)
{
	printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
	return KSFT_SKIP;
}

#endif /* __NR_userfaultfd */