at v5.16-rc4 1705 lines 46 kB view raw
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * Stress userfaultfd syscall. 4 * 5 * Copyright (C) 2015 Red Hat, Inc. 6 * 7 * This test allocates two virtual areas and bounces the physical 8 * memory across the two virtual areas (from area_src to area_dst) 9 * using userfaultfd. 10 * 11 * There are three threads running per CPU: 12 * 13 * 1) one per-CPU thread takes a per-page pthread_mutex in a random 14 * page of the area_dst (while the physical page may still be in 15 * area_src), and increments a per-page counter in the same page, 16 * and checks its value against a verification region. 17 * 18 * 2) another per-CPU thread handles the userfaults generated by 19 * thread 1 above. userfaultfd blocking reads or poll() modes are 20 * exercised interleaved. 21 * 22 * 3) one last per-CPU thread transfers the memory in the background 23 * at maximum bandwidth (if not already transferred by thread 24 * 2). Each cpu thread takes cares of transferring a portion of the 25 * area. 26 * 27 * When all threads of type 3 completed the transfer, one bounce is 28 * complete. area_src and area_dst are then swapped. All threads are 29 * respawned and so the bounce is immediately restarted in the 30 * opposite direction. 31 * 32 * per-CPU threads 1 by triggering userfaults inside 33 * pthread_mutex_lock will also verify the atomicity of the memory 34 * transfer (UFFDIO_COPY). 35 */ 36 37#define _GNU_SOURCE 38#include <stdio.h> 39#include <errno.h> 40#include <unistd.h> 41#include <stdlib.h> 42#include <sys/types.h> 43#include <sys/stat.h> 44#include <fcntl.h> 45#include <time.h> 46#include <signal.h> 47#include <poll.h> 48#include <string.h> 49#include <sys/mman.h> 50#include <sys/syscall.h> 51#include <sys/ioctl.h> 52#include <sys/wait.h> 53#include <pthread.h> 54#include <linux/userfaultfd.h> 55#include <setjmp.h> 56#include <stdbool.h> 57#include <assert.h> 58#include <inttypes.h> 59#include <stdint.h> 60#include <sys/random.h> 61 62#include "../kselftest.h" 63 64#ifdef __NR_userfaultfd 65 66static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; 67 68#define BOUNCE_RANDOM (1<<0) 69#define BOUNCE_RACINGFAULTS (1<<1) 70#define BOUNCE_VERIFY (1<<2) 71#define BOUNCE_POLL (1<<3) 72static int bounces; 73 74#define TEST_ANON 1 75#define TEST_HUGETLB 2 76#define TEST_SHMEM 3 77static int test_type; 78 79/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ 80#define ALARM_INTERVAL_SECS 10 81static volatile bool test_uffdio_copy_eexist = true; 82static volatile bool test_uffdio_zeropage_eexist = true; 83/* Whether to test uffd write-protection */ 84static bool test_uffdio_wp = false; 85/* Whether to test uffd minor faults */ 86static bool test_uffdio_minor = false; 87 88static bool map_shared; 89static int shm_fd; 90static int huge_fd; 91static char *huge_fd_off0; 92static unsigned long long *count_verify; 93static int uffd = -1; 94static int uffd_flags, finished, *pipefd; 95static char *area_src, *area_src_alias, *area_dst, *area_dst_alias; 96static char *zeropage; 97pthread_attr_t attr; 98 99/* Userfaultfd test statistics */ 100struct uffd_stats { 101 int cpu; 102 unsigned long missing_faults; 103 unsigned long wp_faults; 104 unsigned long minor_faults; 105}; 106 107/* pthread_mutex_t starts at page offset 0 */ 108#define area_mutex(___area, ___nr) \ 109 ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) 110/* 111 * count is placed in the page after pthread_mutex_t naturally aligned 112 * to avoid non alignment faults on non-x86 archs. 113 */ 114#define area_count(___area, ___nr) \ 115 ((volatile unsigned long long *) ((unsigned long) \ 116 ((___area) + (___nr)*page_size + \ 117 sizeof(pthread_mutex_t) + \ 118 sizeof(unsigned long long) - 1) & \ 119 ~(unsigned long)(sizeof(unsigned long long) \ 120 - 1))) 121 122const char *examples = 123 "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" 124 "./userfaultfd anon 100 99999\n\n" 125 "# Run share memory test on 1GiB region with 99 bounces:\n" 126 "./userfaultfd shmem 1000 99\n\n" 127 "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n" 128 "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n" 129 "# Run the same hugetlb test but using shmem:\n" 130 "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n" 131 "# 10MiB-~6GiB 999 bounces anonymous test, " 132 "continue forever unless an error triggers\n" 133 "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; 134 135static void usage(void) 136{ 137 fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> " 138 "[hugetlbfs_file]\n\n"); 139 fprintf(stderr, "Supported <test type>: anon, hugetlb, " 140 "hugetlb_shared, shmem\n\n"); 141 fprintf(stderr, "Examples:\n\n"); 142 fprintf(stderr, "%s", examples); 143 exit(1); 144} 145 146#define _err(fmt, ...) \ 147 do { \ 148 int ret = errno; \ 149 fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ 150 fprintf(stderr, " (errno=%d, line=%d)\n", \ 151 ret, __LINE__); \ 152 } while (0) 153 154#define err(fmt, ...) \ 155 do { \ 156 _err(fmt, ##__VA_ARGS__); \ 157 exit(1); \ 158 } while (0) 159 160static void uffd_stats_reset(struct uffd_stats *uffd_stats, 161 unsigned long n_cpus) 162{ 163 int i; 164 165 for (i = 0; i < n_cpus; i++) { 166 uffd_stats[i].cpu = i; 167 uffd_stats[i].missing_faults = 0; 168 uffd_stats[i].wp_faults = 0; 169 uffd_stats[i].minor_faults = 0; 170 } 171} 172 173static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) 174{ 175 int i; 176 unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; 177 178 for (i = 0; i < n_cpus; i++) { 179 miss_total += stats[i].missing_faults; 180 wp_total += stats[i].wp_faults; 181 minor_total += stats[i].minor_faults; 182 } 183 184 printf("userfaults: "); 185 if (miss_total) { 186 printf("%llu missing (", miss_total); 187 for (i = 0; i < n_cpus; i++) 188 printf("%lu+", stats[i].missing_faults); 189 printf("\b) "); 190 } 191 if (wp_total) { 192 printf("%llu wp (", wp_total); 193 for (i = 0; i < n_cpus; i++) 194 printf("%lu+", stats[i].wp_faults); 195 printf("\b) "); 196 } 197 if (minor_total) { 198 printf("%llu minor (", minor_total); 199 for (i = 0; i < n_cpus; i++) 200 printf("%lu+", stats[i].minor_faults); 201 printf("\b)"); 202 } 203 printf("\n"); 204} 205 206static void anon_release_pages(char *rel_area) 207{ 208 if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) 209 err("madvise(MADV_DONTNEED) failed"); 210} 211 212static void anon_allocate_area(void **alloc_area) 213{ 214 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, 215 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); 216 if (*alloc_area == MAP_FAILED) 217 err("mmap of anonymous memory failed"); 218} 219 220static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) 221{ 222} 223 224static void hugetlb_release_pages(char *rel_area) 225{ 226 if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 227 rel_area == huge_fd_off0 ? 0 : nr_pages * page_size, 228 nr_pages * page_size)) 229 err("fallocate() failed"); 230} 231 232static void hugetlb_allocate_area(void **alloc_area) 233{ 234 void *area_alias = NULL; 235 char **alloc_area_alias; 236 237 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, 238 (map_shared ? MAP_SHARED : MAP_PRIVATE) | 239 MAP_HUGETLB, 240 huge_fd, *alloc_area == area_src ? 0 : 241 nr_pages * page_size); 242 if (*alloc_area == MAP_FAILED) 243 err("mmap of hugetlbfs file failed"); 244 245 if (map_shared) { 246 area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, 247 MAP_SHARED | MAP_HUGETLB, 248 huge_fd, *alloc_area == area_src ? 0 : 249 nr_pages * page_size); 250 if (area_alias == MAP_FAILED) 251 err("mmap of hugetlb file alias failed"); 252 } 253 254 if (*alloc_area == area_src) { 255 huge_fd_off0 = *alloc_area; 256 alloc_area_alias = &area_src_alias; 257 } else { 258 alloc_area_alias = &area_dst_alias; 259 } 260 if (area_alias) 261 *alloc_area_alias = area_alias; 262} 263 264static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) 265{ 266 if (!map_shared) 267 return; 268 /* 269 * We can't zap just the pagetable with hugetlbfs because 270 * MADV_DONTEED won't work. So exercise -EEXIST on a alias 271 * mapping where the pagetables are not established initially, 272 * this way we'll exercise the -EEXEC at the fs level. 273 */ 274 *start = (unsigned long) area_dst_alias + offset; 275} 276 277static void shmem_release_pages(char *rel_area) 278{ 279 if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) 280 err("madvise(MADV_REMOVE) failed"); 281} 282 283static void shmem_allocate_area(void **alloc_area) 284{ 285 void *area_alias = NULL; 286 bool is_src = alloc_area == (void **)&area_src; 287 unsigned long offset = is_src ? 0 : nr_pages * page_size; 288 289 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, 290 MAP_SHARED, shm_fd, offset); 291 if (*alloc_area == MAP_FAILED) 292 err("mmap of memfd failed"); 293 294 area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, 295 MAP_SHARED, shm_fd, offset); 296 if (area_alias == MAP_FAILED) 297 err("mmap of memfd alias failed"); 298 299 if (is_src) 300 area_src_alias = area_alias; 301 else 302 area_dst_alias = area_alias; 303} 304 305static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) 306{ 307 *start = (unsigned long)area_dst_alias + offset; 308} 309 310struct uffd_test_ops { 311 void (*allocate_area)(void **alloc_area); 312 void (*release_pages)(char *rel_area); 313 void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); 314}; 315 316static struct uffd_test_ops anon_uffd_test_ops = { 317 .allocate_area = anon_allocate_area, 318 .release_pages = anon_release_pages, 319 .alias_mapping = noop_alias_mapping, 320}; 321 322static struct uffd_test_ops shmem_uffd_test_ops = { 323 .allocate_area = shmem_allocate_area, 324 .release_pages = shmem_release_pages, 325 .alias_mapping = shmem_alias_mapping, 326}; 327 328static struct uffd_test_ops hugetlb_uffd_test_ops = { 329 .allocate_area = hugetlb_allocate_area, 330 .release_pages = hugetlb_release_pages, 331 .alias_mapping = hugetlb_alias_mapping, 332}; 333 334static struct uffd_test_ops *uffd_test_ops; 335 336static inline uint64_t uffd_minor_feature(void) 337{ 338 if (test_type == TEST_HUGETLB && map_shared) 339 return UFFD_FEATURE_MINOR_HUGETLBFS; 340 else if (test_type == TEST_SHMEM) 341 return UFFD_FEATURE_MINOR_SHMEM; 342 else 343 return 0; 344} 345 346static uint64_t get_expected_ioctls(uint64_t mode) 347{ 348 uint64_t ioctls = UFFD_API_RANGE_IOCTLS; 349 350 if (test_type == TEST_HUGETLB) 351 ioctls &= ~(1 << _UFFDIO_ZEROPAGE); 352 353 if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) 354 ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); 355 356 if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) 357 ioctls &= ~(1 << _UFFDIO_CONTINUE); 358 359 return ioctls; 360} 361 362static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) 363{ 364 uint64_t expected = get_expected_ioctls(mode); 365 uint64_t actual = ioctls & expected; 366 367 if (actual != expected) { 368 err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, 369 expected, actual); 370 } 371} 372 373static void userfaultfd_open(uint64_t *features) 374{ 375 struct uffdio_api uffdio_api; 376 377 uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); 378 if (uffd < 0) 379 err("userfaultfd syscall not available in this kernel"); 380 uffd_flags = fcntl(uffd, F_GETFD, NULL); 381 382 uffdio_api.api = UFFD_API; 383 uffdio_api.features = *features; 384 if (ioctl(uffd, UFFDIO_API, &uffdio_api)) 385 err("UFFDIO_API failed.\nPlease make sure to " 386 "run with either root or ptrace capability."); 387 if (uffdio_api.api != UFFD_API) 388 err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); 389 390 *features = uffdio_api.features; 391} 392 393static inline void munmap_area(void **area) 394{ 395 if (*area) 396 if (munmap(*area, nr_pages * page_size)) 397 err("munmap"); 398 399 *area = NULL; 400} 401 402static void uffd_test_ctx_clear(void) 403{ 404 size_t i; 405 406 if (pipefd) { 407 for (i = 0; i < nr_cpus * 2; ++i) { 408 if (close(pipefd[i])) 409 err("close pipefd"); 410 } 411 free(pipefd); 412 pipefd = NULL; 413 } 414 415 if (count_verify) { 416 free(count_verify); 417 count_verify = NULL; 418 } 419 420 if (uffd != -1) { 421 if (close(uffd)) 422 err("close uffd"); 423 uffd = -1; 424 } 425 426 huge_fd_off0 = NULL; 427 munmap_area((void **)&area_src); 428 munmap_area((void **)&area_src_alias); 429 munmap_area((void **)&area_dst); 430 munmap_area((void **)&area_dst_alias); 431} 432 433static void uffd_test_ctx_init(uint64_t features) 434{ 435 unsigned long nr, cpu; 436 437 uffd_test_ctx_clear(); 438 439 uffd_test_ops->allocate_area((void **)&area_src); 440 uffd_test_ops->allocate_area((void **)&area_dst); 441 442 userfaultfd_open(&features); 443 444 count_verify = malloc(nr_pages * sizeof(unsigned long long)); 445 if (!count_verify) 446 err("count_verify"); 447 448 for (nr = 0; nr < nr_pages; nr++) { 449 *area_mutex(area_src, nr) = 450 (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; 451 count_verify[nr] = *area_count(area_src, nr) = 1; 452 /* 453 * In the transition between 255 to 256, powerpc will 454 * read out of order in my_bcmp and see both bytes as 455 * zero, so leave a placeholder below always non-zero 456 * after the count, to avoid my_bcmp to trigger false 457 * positives. 458 */ 459 *(area_count(area_src, nr) + 1) = 1; 460 } 461 462 /* 463 * After initialization of area_src, we must explicitly release pages 464 * for area_dst to make sure it's fully empty. Otherwise we could have 465 * some area_dst pages be errornously initialized with zero pages, 466 * hence we could hit memory corruption later in the test. 467 * 468 * One example is when THP is globally enabled, above allocate_area() 469 * calls could have the two areas merged into a single VMA (as they 470 * will have the same VMA flags so they're mergeable). When we 471 * initialize the area_src above, it's possible that some part of 472 * area_dst could have been faulted in via one huge THP that will be 473 * shared between area_src and area_dst. It could cause some of the 474 * area_dst won't be trapped by missing userfaults. 475 * 476 * This release_pages() will guarantee even if that happened, we'll 477 * proactively split the thp and drop any accidentally initialized 478 * pages within area_dst. 479 */ 480 uffd_test_ops->release_pages(area_dst); 481 482 pipefd = malloc(sizeof(int) * nr_cpus * 2); 483 if (!pipefd) 484 err("pipefd"); 485 for (cpu = 0; cpu < nr_cpus; cpu++) 486 if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) 487 err("pipe"); 488} 489 490static int my_bcmp(char *str1, char *str2, size_t n) 491{ 492 unsigned long i; 493 for (i = 0; i < n; i++) 494 if (str1[i] != str2[i]) 495 return 1; 496 return 0; 497} 498 499static void wp_range(int ufd, __u64 start, __u64 len, bool wp) 500{ 501 struct uffdio_writeprotect prms; 502 503 /* Write protection page faults */ 504 prms.range.start = start; 505 prms.range.len = len; 506 /* Undo write-protect, do wakeup after that */ 507 prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; 508 509 if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) 510 err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); 511} 512 513static void continue_range(int ufd, __u64 start, __u64 len) 514{ 515 struct uffdio_continue req; 516 int ret; 517 518 req.range.start = start; 519 req.range.len = len; 520 req.mode = 0; 521 522 if (ioctl(ufd, UFFDIO_CONTINUE, &req)) 523 err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, 524 (uint64_t)start); 525 526 /* 527 * Error handling within the kernel for continue is subtly different 528 * from copy or zeropage, so it may be a source of bugs. Trigger an 529 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. 530 */ 531 req.mapped = 0; 532 ret = ioctl(ufd, UFFDIO_CONTINUE, &req); 533 if (ret >= 0 || req.mapped != -EEXIST) 534 err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, 535 ret, (int64_t) req.mapped); 536} 537 538static void *locking_thread(void *arg) 539{ 540 unsigned long cpu = (unsigned long) arg; 541 unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */ 542 unsigned long long count; 543 544 if (!(bounces & BOUNCE_RANDOM)) { 545 page_nr = -bounces; 546 if (!(bounces & BOUNCE_RACINGFAULTS)) 547 page_nr += cpu * nr_pages_per_cpu; 548 } 549 550 while (!finished) { 551 if (bounces & BOUNCE_RANDOM) { 552 if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) 553 err("getrandom failed"); 554 } else 555 page_nr += 1; 556 page_nr %= nr_pages; 557 pthread_mutex_lock(area_mutex(area_dst, page_nr)); 558 count = *area_count(area_dst, page_nr); 559 if (count != count_verify[page_nr]) 560 err("page_nr %lu memory corruption %llu %llu", 561 page_nr, count, count_verify[page_nr]); 562 count++; 563 *area_count(area_dst, page_nr) = count_verify[page_nr] = count; 564 pthread_mutex_unlock(area_mutex(area_dst, page_nr)); 565 } 566 567 return NULL; 568} 569 570static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, 571 unsigned long offset) 572{ 573 uffd_test_ops->alias_mapping(&uffdio_copy->dst, 574 uffdio_copy->len, 575 offset); 576 if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { 577 /* real retval in ufdio_copy.copy */ 578 if (uffdio_copy->copy != -EEXIST) 579 err("UFFDIO_COPY retry error: %"PRId64, 580 (int64_t)uffdio_copy->copy); 581 } else { 582 err("UFFDIO_COPY retry unexpected: %"PRId64, 583 (int64_t)uffdio_copy->copy); 584 } 585} 586 587static void wake_range(int ufd, unsigned long addr, unsigned long len) 588{ 589 struct uffdio_range uffdio_wake; 590 591 uffdio_wake.start = addr; 592 uffdio_wake.len = len; 593 594 if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) 595 fprintf(stderr, "error waking %lu\n", 596 addr), exit(1); 597} 598 599static int __copy_page(int ufd, unsigned long offset, bool retry) 600{ 601 struct uffdio_copy uffdio_copy; 602 603 if (offset >= nr_pages * page_size) 604 err("unexpected offset %lu\n", offset); 605 uffdio_copy.dst = (unsigned long) area_dst + offset; 606 uffdio_copy.src = (unsigned long) area_src + offset; 607 uffdio_copy.len = page_size; 608 if (test_uffdio_wp) 609 uffdio_copy.mode = UFFDIO_COPY_MODE_WP; 610 else 611 uffdio_copy.mode = 0; 612 uffdio_copy.copy = 0; 613 if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { 614 /* real retval in ufdio_copy.copy */ 615 if (uffdio_copy.copy != -EEXIST) 616 err("UFFDIO_COPY error: %"PRId64, 617 (int64_t)uffdio_copy.copy); 618 wake_range(ufd, uffdio_copy.dst, page_size); 619 } else if (uffdio_copy.copy != page_size) { 620 err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); 621 } else { 622 if (test_uffdio_copy_eexist && retry) { 623 test_uffdio_copy_eexist = false; 624 retry_copy_page(ufd, &uffdio_copy, offset); 625 } 626 return 1; 627 } 628 return 0; 629} 630 631static int copy_page_retry(int ufd, unsigned long offset) 632{ 633 return __copy_page(ufd, offset, true); 634} 635 636static int copy_page(int ufd, unsigned long offset) 637{ 638 return __copy_page(ufd, offset, false); 639} 640 641static int uffd_read_msg(int ufd, struct uffd_msg *msg) 642{ 643 int ret = read(uffd, msg, sizeof(*msg)); 644 645 if (ret != sizeof(*msg)) { 646 if (ret < 0) { 647 if (errno == EAGAIN) 648 return 1; 649 err("blocking read error"); 650 } else { 651 err("short read"); 652 } 653 } 654 655 return 0; 656} 657 658static void uffd_handle_page_fault(struct uffd_msg *msg, 659 struct uffd_stats *stats) 660{ 661 unsigned long offset; 662 663 if (msg->event != UFFD_EVENT_PAGEFAULT) 664 err("unexpected msg event %u", msg->event); 665 666 if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { 667 /* Write protect page faults */ 668 wp_range(uffd, msg->arg.pagefault.address, page_size, false); 669 stats->wp_faults++; 670 } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { 671 uint8_t *area; 672 int b; 673 674 /* 675 * Minor page faults 676 * 677 * To prove we can modify the original range for testing 678 * purposes, we're going to bit flip this range before 679 * continuing. 680 * 681 * Note that this requires all minor page fault tests operate on 682 * area_dst (non-UFFD-registered) and area_dst_alias 683 * (UFFD-registered). 684 */ 685 686 area = (uint8_t *)(area_dst + 687 ((char *)msg->arg.pagefault.address - 688 area_dst_alias)); 689 for (b = 0; b < page_size; ++b) 690 area[b] = ~area[b]; 691 continue_range(uffd, msg->arg.pagefault.address, page_size); 692 stats->minor_faults++; 693 } else { 694 /* Missing page faults */ 695 if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) 696 err("unexpected write fault"); 697 698 offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; 699 offset &= ~(page_size-1); 700 701 if (copy_page(uffd, offset)) 702 stats->missing_faults++; 703 } 704} 705 706static void *uffd_poll_thread(void *arg) 707{ 708 struct uffd_stats *stats = (struct uffd_stats *)arg; 709 unsigned long cpu = stats->cpu; 710 struct pollfd pollfd[2]; 711 struct uffd_msg msg; 712 struct uffdio_register uffd_reg; 713 int ret; 714 char tmp_chr; 715 716 pollfd[0].fd = uffd; 717 pollfd[0].events = POLLIN; 718 pollfd[1].fd = pipefd[cpu*2]; 719 pollfd[1].events = POLLIN; 720 721 for (;;) { 722 ret = poll(pollfd, 2, -1); 723 if (ret <= 0) 724 err("poll error: %d", ret); 725 if (pollfd[1].revents & POLLIN) { 726 if (read(pollfd[1].fd, &tmp_chr, 1) != 1) 727 err("read pipefd error"); 728 break; 729 } 730 if (!(pollfd[0].revents & POLLIN)) 731 err("pollfd[0].revents %d", pollfd[0].revents); 732 if (uffd_read_msg(uffd, &msg)) 733 continue; 734 switch (msg.event) { 735 default: 736 err("unexpected msg event %u\n", msg.event); 737 break; 738 case UFFD_EVENT_PAGEFAULT: 739 uffd_handle_page_fault(&msg, stats); 740 break; 741 case UFFD_EVENT_FORK: 742 close(uffd); 743 uffd = msg.arg.fork.ufd; 744 pollfd[0].fd = uffd; 745 break; 746 case UFFD_EVENT_REMOVE: 747 uffd_reg.range.start = msg.arg.remove.start; 748 uffd_reg.range.len = msg.arg.remove.end - 749 msg.arg.remove.start; 750 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) 751 err("remove failure"); 752 break; 753 case UFFD_EVENT_REMAP: 754 area_dst = (char *)(unsigned long)msg.arg.remap.to; 755 break; 756 } 757 } 758 759 return NULL; 760} 761 762pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; 763 764static void *uffd_read_thread(void *arg) 765{ 766 struct uffd_stats *stats = (struct uffd_stats *)arg; 767 struct uffd_msg msg; 768 769 pthread_mutex_unlock(&uffd_read_mutex); 770 /* from here cancellation is ok */ 771 772 for (;;) { 773 if (uffd_read_msg(uffd, &msg)) 774 continue; 775 uffd_handle_page_fault(&msg, stats); 776 } 777 778 return NULL; 779} 780 781static void *background_thread(void *arg) 782{ 783 unsigned long cpu = (unsigned long) arg; 784 unsigned long page_nr, start_nr, mid_nr, end_nr; 785 786 start_nr = cpu * nr_pages_per_cpu; 787 end_nr = (cpu+1) * nr_pages_per_cpu; 788 mid_nr = (start_nr + end_nr) / 2; 789 790 /* Copy the first half of the pages */ 791 for (page_nr = start_nr; page_nr < mid_nr; page_nr++) 792 copy_page_retry(uffd, page_nr * page_size); 793 794 /* 795 * If we need to test uffd-wp, set it up now. Then we'll have 796 * at least the first half of the pages mapped already which 797 * can be write-protected for testing 798 */ 799 if (test_uffdio_wp) 800 wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, 801 nr_pages_per_cpu * page_size, true); 802 803 /* 804 * Continue the 2nd half of the page copying, handling write 805 * protection faults if any 806 */ 807 for (page_nr = mid_nr; page_nr < end_nr; page_nr++) 808 copy_page_retry(uffd, page_nr * page_size); 809 810 return NULL; 811} 812 813static int stress(struct uffd_stats *uffd_stats) 814{ 815 unsigned long cpu; 816 pthread_t locking_threads[nr_cpus]; 817 pthread_t uffd_threads[nr_cpus]; 818 pthread_t background_threads[nr_cpus]; 819 820 finished = 0; 821 for (cpu = 0; cpu < nr_cpus; cpu++) { 822 if (pthread_create(&locking_threads[cpu], &attr, 823 locking_thread, (void *)cpu)) 824 return 1; 825 if (bounces & BOUNCE_POLL) { 826 if (pthread_create(&uffd_threads[cpu], &attr, 827 uffd_poll_thread, 828 (void *)&uffd_stats[cpu])) 829 return 1; 830 } else { 831 if (pthread_create(&uffd_threads[cpu], &attr, 832 uffd_read_thread, 833 (void *)&uffd_stats[cpu])) 834 return 1; 835 pthread_mutex_lock(&uffd_read_mutex); 836 } 837 if (pthread_create(&background_threads[cpu], &attr, 838 background_thread, (void *)cpu)) 839 return 1; 840 } 841 for (cpu = 0; cpu < nr_cpus; cpu++) 842 if (pthread_join(background_threads[cpu], NULL)) 843 return 1; 844 845 /* 846 * Be strict and immediately zap area_src, the whole area has 847 * been transferred already by the background treads. The 848 * area_src could then be faulted in in a racy way by still 849 * running uffdio_threads reading zeropages after we zapped 850 * area_src (but they're guaranteed to get -EEXIST from 851 * UFFDIO_COPY without writing zero pages into area_dst 852 * because the background threads already completed). 853 */ 854 uffd_test_ops->release_pages(area_src); 855 856 finished = 1; 857 for (cpu = 0; cpu < nr_cpus; cpu++) 858 if (pthread_join(locking_threads[cpu], NULL)) 859 return 1; 860 861 for (cpu = 0; cpu < nr_cpus; cpu++) { 862 char c; 863 if (bounces & BOUNCE_POLL) { 864 if (write(pipefd[cpu*2+1], &c, 1) != 1) 865 err("pipefd write error"); 866 if (pthread_join(uffd_threads[cpu], 867 (void *)&uffd_stats[cpu])) 868 return 1; 869 } else { 870 if (pthread_cancel(uffd_threads[cpu])) 871 return 1; 872 if (pthread_join(uffd_threads[cpu], NULL)) 873 return 1; 874 } 875 } 876 877 return 0; 878} 879 880sigjmp_buf jbuf, *sigbuf; 881 882static void sighndl(int sig, siginfo_t *siginfo, void *ptr) 883{ 884 if (sig == SIGBUS) { 885 if (sigbuf) 886 siglongjmp(*sigbuf, 1); 887 abort(); 888 } 889} 890 891/* 892 * For non-cooperative userfaultfd test we fork() a process that will 893 * generate pagefaults, will mremap the area monitored by the 894 * userfaultfd and at last this process will release the monitored 895 * area. 896 * For the anonymous and shared memory the area is divided into two 897 * parts, the first part is accessed before mremap, and the second 898 * part is accessed after mremap. Since hugetlbfs does not support 899 * mremap, the entire monitored area is accessed in a single pass for 900 * HUGETLB_TEST. 901 * The release of the pages currently generates event for shmem and 902 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked 903 * for hugetlb. 904 * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register 905 * monitored area, generate pagefaults and test that signal is delivered. 906 * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2 907 * test robustness use case - we release monitored area, fork a process 908 * that will generate pagefaults and verify signal is generated. 909 * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal 910 * feature. Using monitor thread, verify no userfault events are generated. 911 */ 912static int faulting_process(int signal_test) 913{ 914 unsigned long nr; 915 unsigned long long count; 916 unsigned long split_nr_pages; 917 unsigned long lastnr; 918 struct sigaction act; 919 unsigned long signalled = 0; 920 921 if (test_type != TEST_HUGETLB) 922 split_nr_pages = (nr_pages + 1) / 2; 923 else 924 split_nr_pages = nr_pages; 925 926 if (signal_test) { 927 sigbuf = &jbuf; 928 memset(&act, 0, sizeof(act)); 929 act.sa_sigaction = sighndl; 930 act.sa_flags = SA_SIGINFO; 931 if (sigaction(SIGBUS, &act, 0)) 932 err("sigaction"); 933 lastnr = (unsigned long)-1; 934 } 935 936 for (nr = 0; nr < split_nr_pages; nr++) { 937 int steps = 1; 938 unsigned long offset = nr * page_size; 939 940 if (signal_test) { 941 if (sigsetjmp(*sigbuf, 1) != 0) { 942 if (steps == 1 && nr == lastnr) 943 err("Signal repeated"); 944 945 lastnr = nr; 946 if (signal_test == 1) { 947 if (steps == 1) { 948 /* This is a MISSING request */ 949 steps++; 950 if (copy_page(uffd, offset)) 951 signalled++; 952 } else { 953 /* This is a WP request */ 954 assert(steps == 2); 955 wp_range(uffd, 956 (__u64)area_dst + 957 offset, 958 page_size, false); 959 } 960 } else { 961 signalled++; 962 continue; 963 } 964 } 965 } 966 967 count = *area_count(area_dst, nr); 968 if (count != count_verify[nr]) 969 err("nr %lu memory corruption %llu %llu\n", 970 nr, count, count_verify[nr]); 971 /* 972 * Trigger write protection if there is by writing 973 * the same value back. 974 */ 975 *area_count(area_dst, nr) = count; 976 } 977 978 if (signal_test) 979 return signalled != split_nr_pages; 980 981 if (test_type == TEST_HUGETLB) 982 return 0; 983 984 area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, 985 MREMAP_MAYMOVE | MREMAP_FIXED, area_src); 986 if (area_dst == MAP_FAILED) 987 err("mremap"); 988 /* Reset area_src since we just clobbered it */ 989 area_src = NULL; 990 991 for (; nr < nr_pages; nr++) { 992 count = *area_count(area_dst, nr); 993 if (count != count_verify[nr]) { 994 err("nr %lu memory corruption %llu %llu\n", 995 nr, count, count_verify[nr]); 996 } 997 /* 998 * Trigger write protection if there is by writing 999 * the same value back. 1000 */ 1001 *area_count(area_dst, nr) = count; 1002 } 1003 1004 uffd_test_ops->release_pages(area_dst); 1005 1006 for (nr = 0; nr < nr_pages; nr++) 1007 if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) 1008 err("nr %lu is not zero", nr); 1009 1010 return 0; 1011} 1012 1013static void retry_uffdio_zeropage(int ufd, 1014 struct uffdio_zeropage *uffdio_zeropage, 1015 unsigned long offset) 1016{ 1017 uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, 1018 uffdio_zeropage->range.len, 1019 offset); 1020 if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { 1021 if (uffdio_zeropage->zeropage != -EEXIST) 1022 err("UFFDIO_ZEROPAGE error: %"PRId64, 1023 (int64_t)uffdio_zeropage->zeropage); 1024 } else { 1025 err("UFFDIO_ZEROPAGE error: %"PRId64, 1026 (int64_t)uffdio_zeropage->zeropage); 1027 } 1028} 1029 1030static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) 1031{ 1032 struct uffdio_zeropage uffdio_zeropage; 1033 int ret; 1034 bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE); 1035 __s64 res; 1036 1037 if (offset >= nr_pages * page_size) 1038 err("unexpected offset %lu", offset); 1039 uffdio_zeropage.range.start = (unsigned long) area_dst + offset; 1040 uffdio_zeropage.range.len = page_size; 1041 uffdio_zeropage.mode = 0; 1042 ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); 1043 res = uffdio_zeropage.zeropage; 1044 if (ret) { 1045 /* real retval in ufdio_zeropage.zeropage */ 1046 if (has_zeropage) 1047 err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); 1048 else if (res != -EINVAL) 1049 err("UFFDIO_ZEROPAGE not -EINVAL"); 1050 } else if (has_zeropage) { 1051 if (res != page_size) { 1052 err("UFFDIO_ZEROPAGE unexpected size"); 1053 } else { 1054 if (test_uffdio_zeropage_eexist && retry) { 1055 test_uffdio_zeropage_eexist = false; 1056 retry_uffdio_zeropage(ufd, &uffdio_zeropage, 1057 offset); 1058 } 1059 return 1; 1060 } 1061 } else 1062 err("UFFDIO_ZEROPAGE succeeded"); 1063 1064 return 0; 1065} 1066 1067static int uffdio_zeropage(int ufd, unsigned long offset) 1068{ 1069 return __uffdio_zeropage(ufd, offset, false); 1070} 1071 1072/* exercise UFFDIO_ZEROPAGE */ 1073static int userfaultfd_zeropage_test(void) 1074{ 1075 struct uffdio_register uffdio_register; 1076 1077 printf("testing UFFDIO_ZEROPAGE: "); 1078 fflush(stdout); 1079 1080 uffd_test_ctx_init(0); 1081 1082 uffdio_register.range.start = (unsigned long) area_dst; 1083 uffdio_register.range.len = nr_pages * page_size; 1084 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; 1085 if (test_uffdio_wp) 1086 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; 1087 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1088 err("register failure"); 1089 1090 assert_expected_ioctls_present( 1091 uffdio_register.mode, uffdio_register.ioctls); 1092 1093 if (uffdio_zeropage(uffd, 0)) 1094 if (my_bcmp(area_dst, zeropage, page_size)) 1095 err("zeropage is not zero"); 1096 1097 printf("done.\n"); 1098 return 0; 1099} 1100 1101static int userfaultfd_events_test(void) 1102{ 1103 struct uffdio_register uffdio_register; 1104 pthread_t uffd_mon; 1105 int err, features; 1106 pid_t pid; 1107 char c; 1108 struct uffd_stats stats = { 0 }; 1109 1110 printf("testing events (fork, remap, remove): "); 1111 fflush(stdout); 1112 1113 features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | 1114 UFFD_FEATURE_EVENT_REMOVE; 1115 uffd_test_ctx_init(features); 1116 1117 fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); 1118 1119 uffdio_register.range.start = (unsigned long) area_dst; 1120 uffdio_register.range.len = nr_pages * page_size; 1121 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; 1122 if (test_uffdio_wp) 1123 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; 1124 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1125 err("register failure"); 1126 1127 assert_expected_ioctls_present( 1128 uffdio_register.mode, uffdio_register.ioctls); 1129 1130 if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) 1131 err("uffd_poll_thread create"); 1132 1133 pid = fork(); 1134 if (pid < 0) 1135 err("fork"); 1136 1137 if (!pid) 1138 exit(faulting_process(0)); 1139 1140 waitpid(pid, &err, 0); 1141 if (err) 1142 err("faulting process failed"); 1143 if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) 1144 err("pipe write"); 1145 if (pthread_join(uffd_mon, NULL)) 1146 return 1; 1147 1148 uffd_stats_report(&stats, 1); 1149 1150 return stats.missing_faults != nr_pages; 1151} 1152 1153static int userfaultfd_sig_test(void) 1154{ 1155 struct uffdio_register uffdio_register; 1156 unsigned long userfaults; 1157 pthread_t uffd_mon; 1158 int err, features; 1159 pid_t pid; 1160 char c; 1161 struct uffd_stats stats = { 0 }; 1162 1163 printf("testing signal delivery: "); 1164 fflush(stdout); 1165 1166 features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; 1167 uffd_test_ctx_init(features); 1168 1169 fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); 1170 1171 uffdio_register.range.start = (unsigned long) area_dst; 1172 uffdio_register.range.len = nr_pages * page_size; 1173 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; 1174 if (test_uffdio_wp) 1175 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; 1176 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1177 err("register failure"); 1178 1179 assert_expected_ioctls_present( 1180 uffdio_register.mode, uffdio_register.ioctls); 1181 1182 if (faulting_process(1)) 1183 err("faulting process failed"); 1184 1185 uffd_test_ops->release_pages(area_dst); 1186 1187 if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) 1188 err("uffd_poll_thread create"); 1189 1190 pid = fork(); 1191 if (pid < 0) 1192 err("fork"); 1193 1194 if (!pid) 1195 exit(faulting_process(2)); 1196 1197 waitpid(pid, &err, 0); 1198 if (err) 1199 err("faulting process failed"); 1200 if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) 1201 err("pipe write"); 1202 if (pthread_join(uffd_mon, (void **)&userfaults)) 1203 return 1; 1204 1205 printf("done.\n"); 1206 if (userfaults) 1207 err("Signal test failed, userfaults: %ld", userfaults); 1208 1209 return userfaults != 0; 1210} 1211 1212static int userfaultfd_minor_test(void) 1213{ 1214 struct uffdio_register uffdio_register; 1215 unsigned long p; 1216 pthread_t uffd_mon; 1217 uint8_t expected_byte; 1218 void *expected_page; 1219 char c; 1220 struct uffd_stats stats = { 0 }; 1221 1222 if (!test_uffdio_minor) 1223 return 0; 1224 1225 printf("testing minor faults: "); 1226 fflush(stdout); 1227 1228 uffd_test_ctx_init(uffd_minor_feature()); 1229 1230 uffdio_register.range.start = (unsigned long)area_dst_alias; 1231 uffdio_register.range.len = nr_pages * page_size; 1232 uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; 1233 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1234 err("register failure"); 1235 1236 assert_expected_ioctls_present( 1237 uffdio_register.mode, uffdio_register.ioctls); 1238 1239 /* 1240 * After registering with UFFD, populate the non-UFFD-registered side of 1241 * the shared mapping. This should *not* trigger any UFFD minor faults. 1242 */ 1243 for (p = 0; p < nr_pages; ++p) { 1244 memset(area_dst + (p * page_size), p % ((uint8_t)-1), 1245 page_size); 1246 } 1247 1248 if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) 1249 err("uffd_poll_thread create"); 1250 1251 /* 1252 * Read each of the pages back using the UFFD-registered mapping. We 1253 * expect that the first time we touch a page, it will result in a minor 1254 * fault. uffd_poll_thread will resolve the fault by bit-flipping the 1255 * page's contents, and then issuing a CONTINUE ioctl. 1256 */ 1257 1258 if (posix_memalign(&expected_page, page_size, page_size)) 1259 err("out of memory"); 1260 1261 for (p = 0; p < nr_pages; ++p) { 1262 expected_byte = ~((uint8_t)(p % ((uint8_t)-1))); 1263 memset(expected_page, expected_byte, page_size); 1264 if (my_bcmp(expected_page, area_dst_alias + (p * page_size), 1265 page_size)) 1266 err("unexpected page contents after minor fault"); 1267 } 1268 1269 if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) 1270 err("pipe write"); 1271 if (pthread_join(uffd_mon, NULL)) 1272 return 1; 1273 1274 uffd_stats_report(&stats, 1); 1275 1276 return stats.missing_faults != 0 || stats.minor_faults != nr_pages; 1277} 1278 1279#define BIT_ULL(nr) (1ULL << (nr)) 1280#define PM_SOFT_DIRTY BIT_ULL(55) 1281#define PM_MMAP_EXCLUSIVE BIT_ULL(56) 1282#define PM_UFFD_WP BIT_ULL(57) 1283#define PM_FILE BIT_ULL(61) 1284#define PM_SWAP BIT_ULL(62) 1285#define PM_PRESENT BIT_ULL(63) 1286 1287static int pagemap_open(void) 1288{ 1289 int fd = open("/proc/self/pagemap", O_RDONLY); 1290 1291 if (fd < 0) 1292 err("open pagemap"); 1293 1294 return fd; 1295} 1296 1297static uint64_t pagemap_read_vaddr(int fd, void *vaddr) 1298{ 1299 uint64_t value; 1300 int ret; 1301 1302 ret = pread(fd, &value, sizeof(uint64_t), 1303 ((uint64_t)vaddr >> 12) * sizeof(uint64_t)); 1304 if (ret != sizeof(uint64_t)) 1305 err("pread() on pagemap failed"); 1306 1307 return value; 1308} 1309 1310/* This macro let __LINE__ works in err() */ 1311#define pagemap_check_wp(value, wp) do { \ 1312 if (!!(value & PM_UFFD_WP) != wp) \ 1313 err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ 1314 } while (0) 1315 1316static int pagemap_test_fork(bool present) 1317{ 1318 pid_t child = fork(); 1319 uint64_t value; 1320 int fd, result; 1321 1322 if (!child) { 1323 /* Open the pagemap fd of the child itself */ 1324 fd = pagemap_open(); 1325 value = pagemap_read_vaddr(fd, area_dst); 1326 /* 1327 * After fork() uffd-wp bit should be gone as long as we're 1328 * without UFFD_FEATURE_EVENT_FORK 1329 */ 1330 pagemap_check_wp(value, false); 1331 /* Succeed */ 1332 exit(0); 1333 } 1334 waitpid(child, &result, 0); 1335 return result; 1336} 1337 1338static void userfaultfd_pagemap_test(unsigned int test_pgsize) 1339{ 1340 struct uffdio_register uffdio_register; 1341 int pagemap_fd; 1342 uint64_t value; 1343 1344 /* Pagemap tests uffd-wp only */ 1345 if (!test_uffdio_wp) 1346 return; 1347 1348 /* Not enough memory to test this page size */ 1349 if (test_pgsize > nr_pages * page_size) 1350 return; 1351 1352 printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); 1353 /* Flush so it doesn't flush twice in parent/child later */ 1354 fflush(stdout); 1355 1356 uffd_test_ctx_init(0); 1357 1358 if (test_pgsize > page_size) { 1359 /* This is a thp test */ 1360 if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) 1361 err("madvise(MADV_HUGEPAGE) failed"); 1362 } else if (test_pgsize == page_size) { 1363 /* This is normal page test; force no thp */ 1364 if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) 1365 err("madvise(MADV_NOHUGEPAGE) failed"); 1366 } 1367 1368 uffdio_register.range.start = (unsigned long) area_dst; 1369 uffdio_register.range.len = nr_pages * page_size; 1370 uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; 1371 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1372 err("register failed"); 1373 1374 pagemap_fd = pagemap_open(); 1375 1376 /* Touch the page */ 1377 *area_dst = 1; 1378 wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); 1379 value = pagemap_read_vaddr(pagemap_fd, area_dst); 1380 pagemap_check_wp(value, true); 1381 /* Make sure uffd-wp bit dropped when fork */ 1382 if (pagemap_test_fork(true)) 1383 err("Detected stall uffd-wp bit in child"); 1384 1385 /* Exclusive required or PAGEOUT won't work */ 1386 if (!(value & PM_MMAP_EXCLUSIVE)) 1387 err("multiple mapping detected: 0x%"PRIx64, value); 1388 1389 if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) 1390 err("madvise(MADV_PAGEOUT) failed"); 1391 1392 /* Uffd-wp should persist even swapped out */ 1393 value = pagemap_read_vaddr(pagemap_fd, area_dst); 1394 pagemap_check_wp(value, true); 1395 /* Make sure uffd-wp bit dropped when fork */ 1396 if (pagemap_test_fork(false)) 1397 err("Detected stall uffd-wp bit in child"); 1398 1399 /* Unprotect; this tests swap pte modifications */ 1400 wp_range(uffd, (uint64_t)area_dst, page_size, false); 1401 value = pagemap_read_vaddr(pagemap_fd, area_dst); 1402 pagemap_check_wp(value, false); 1403 1404 /* Fault in the page from disk */ 1405 *area_dst = 2; 1406 value = pagemap_read_vaddr(pagemap_fd, area_dst); 1407 pagemap_check_wp(value, false); 1408 1409 close(pagemap_fd); 1410 printf("done\n"); 1411} 1412 1413static int userfaultfd_stress(void) 1414{ 1415 void *area; 1416 char *tmp_area; 1417 unsigned long nr; 1418 struct uffdio_register uffdio_register; 1419 struct uffd_stats uffd_stats[nr_cpus]; 1420 1421 uffd_test_ctx_init(0); 1422 1423 if (posix_memalign(&area, page_size, page_size)) 1424 err("out of memory"); 1425 zeropage = area; 1426 bzero(zeropage, page_size); 1427 1428 pthread_mutex_lock(&uffd_read_mutex); 1429 1430 pthread_attr_init(&attr); 1431 pthread_attr_setstacksize(&attr, 16*1024*1024); 1432 1433 while (bounces--) { 1434 printf("bounces: %d, mode:", bounces); 1435 if (bounces & BOUNCE_RANDOM) 1436 printf(" rnd"); 1437 if (bounces & BOUNCE_RACINGFAULTS) 1438 printf(" racing"); 1439 if (bounces & BOUNCE_VERIFY) 1440 printf(" ver"); 1441 if (bounces & BOUNCE_POLL) 1442 printf(" poll"); 1443 else 1444 printf(" read"); 1445 printf(", "); 1446 fflush(stdout); 1447 1448 if (bounces & BOUNCE_POLL) 1449 fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); 1450 else 1451 fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); 1452 1453 /* register */ 1454 uffdio_register.range.start = (unsigned long) area_dst; 1455 uffdio_register.range.len = nr_pages * page_size; 1456 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; 1457 if (test_uffdio_wp) 1458 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; 1459 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1460 err("register failure"); 1461 assert_expected_ioctls_present( 1462 uffdio_register.mode, uffdio_register.ioctls); 1463 1464 if (area_dst_alias) { 1465 uffdio_register.range.start = (unsigned long) 1466 area_dst_alias; 1467 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1468 err("register failure alias"); 1469 } 1470 1471 /* 1472 * The madvise done previously isn't enough: some 1473 * uffd_thread could have read userfaults (one of 1474 * those already resolved by the background thread) 1475 * and it may be in the process of calling 1476 * UFFDIO_COPY. UFFDIO_COPY will read the zapped 1477 * area_src and it would map a zero page in it (of 1478 * course such a UFFDIO_COPY is perfectly safe as it'd 1479 * return -EEXIST). The problem comes at the next 1480 * bounce though: that racing UFFDIO_COPY would 1481 * generate zeropages in the area_src, so invalidating 1482 * the previous MADV_DONTNEED. Without this additional 1483 * MADV_DONTNEED those zeropages leftovers in the 1484 * area_src would lead to -EEXIST failure during the 1485 * next bounce, effectively leaving a zeropage in the 1486 * area_dst. 1487 * 1488 * Try to comment this out madvise to see the memory 1489 * corruption being caught pretty quick. 1490 * 1491 * khugepaged is also inhibited to collapse THP after 1492 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's 1493 * required to MADV_DONTNEED here. 1494 */ 1495 uffd_test_ops->release_pages(area_dst); 1496 1497 uffd_stats_reset(uffd_stats, nr_cpus); 1498 1499 /* bounce pass */ 1500 if (stress(uffd_stats)) 1501 return 1; 1502 1503 /* Clear all the write protections if there is any */ 1504 if (test_uffdio_wp) 1505 wp_range(uffd, (unsigned long)area_dst, 1506 nr_pages * page_size, false); 1507 1508 /* unregister */ 1509 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) 1510 err("unregister failure"); 1511 if (area_dst_alias) { 1512 uffdio_register.range.start = (unsigned long) area_dst; 1513 if (ioctl(uffd, UFFDIO_UNREGISTER, 1514 &uffdio_register.range)) 1515 err("unregister failure alias"); 1516 } 1517 1518 /* verification */ 1519 if (bounces & BOUNCE_VERIFY) 1520 for (nr = 0; nr < nr_pages; nr++) 1521 if (*area_count(area_dst, nr) != count_verify[nr]) 1522 err("error area_count %llu %llu %lu\n", 1523 *area_count(area_src, nr), 1524 count_verify[nr], nr); 1525 1526 /* prepare next bounce */ 1527 tmp_area = area_src; 1528 area_src = area_dst; 1529 area_dst = tmp_area; 1530 1531 tmp_area = area_src_alias; 1532 area_src_alias = area_dst_alias; 1533 area_dst_alias = tmp_area; 1534 1535 uffd_stats_report(uffd_stats, nr_cpus); 1536 } 1537 1538 if (test_type == TEST_ANON) { 1539 /* 1540 * shmem/hugetlb won't be able to run since they have different 1541 * behavior on fork() (file-backed memory normally drops ptes 1542 * directly when fork), meanwhile the pagemap test will verify 1543 * pgtable entry of fork()ed child. 1544 */ 1545 userfaultfd_pagemap_test(page_size); 1546 /* 1547 * Hard-code for x86_64 for now for 2M THP, as x86_64 is 1548 * currently the only one that supports uffd-wp 1549 */ 1550 userfaultfd_pagemap_test(page_size * 512); 1551 } 1552 1553 return userfaultfd_zeropage_test() || userfaultfd_sig_test() 1554 || userfaultfd_events_test() || userfaultfd_minor_test(); 1555} 1556 1557/* 1558 * Copied from mlock2-tests.c 1559 */ 1560unsigned long default_huge_page_size(void) 1561{ 1562 unsigned long hps = 0; 1563 char *line = NULL; 1564 size_t linelen = 0; 1565 FILE *f = fopen("/proc/meminfo", "r"); 1566 1567 if (!f) 1568 return 0; 1569 while (getline(&line, &linelen, f) > 0) { 1570 if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { 1571 hps <<= 10; 1572 break; 1573 } 1574 } 1575 1576 free(line); 1577 fclose(f); 1578 return hps; 1579} 1580 1581static void set_test_type(const char *type) 1582{ 1583 uint64_t features = UFFD_API_FEATURES; 1584 1585 if (!strcmp(type, "anon")) { 1586 test_type = TEST_ANON; 1587 uffd_test_ops = &anon_uffd_test_ops; 1588 /* Only enable write-protect test for anonymous test */ 1589 test_uffdio_wp = true; 1590 } else if (!strcmp(type, "hugetlb")) { 1591 test_type = TEST_HUGETLB; 1592 uffd_test_ops = &hugetlb_uffd_test_ops; 1593 } else if (!strcmp(type, "hugetlb_shared")) { 1594 map_shared = true; 1595 test_type = TEST_HUGETLB; 1596 uffd_test_ops = &hugetlb_uffd_test_ops; 1597 /* Minor faults require shared hugetlb; only enable here. */ 1598 test_uffdio_minor = true; 1599 } else if (!strcmp(type, "shmem")) { 1600 map_shared = true; 1601 test_type = TEST_SHMEM; 1602 uffd_test_ops = &shmem_uffd_test_ops; 1603 test_uffdio_minor = true; 1604 } else { 1605 err("Unknown test type: %s", type); 1606 } 1607 1608 if (test_type == TEST_HUGETLB) 1609 page_size = default_huge_page_size(); 1610 else 1611 page_size = sysconf(_SC_PAGE_SIZE); 1612 1613 if (!page_size) 1614 err("Unable to determine page size"); 1615 if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 1616 > page_size) 1617 err("Impossible to run this test"); 1618 1619 /* 1620 * Whether we can test certain features depends not just on test type, 1621 * but also on whether or not this particular kernel supports the 1622 * feature. 1623 */ 1624 1625 userfaultfd_open(&features); 1626 1627 test_uffdio_wp = test_uffdio_wp && 1628 (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); 1629 test_uffdio_minor = test_uffdio_minor && 1630 (features & uffd_minor_feature()); 1631 1632 close(uffd); 1633 uffd = -1; 1634} 1635 1636static void sigalrm(int sig) 1637{ 1638 if (sig != SIGALRM) 1639 abort(); 1640 test_uffdio_copy_eexist = true; 1641 test_uffdio_zeropage_eexist = true; 1642 alarm(ALARM_INTERVAL_SECS); 1643} 1644 1645int main(int argc, char **argv) 1646{ 1647 if (argc < 4) 1648 usage(); 1649 1650 if (signal(SIGALRM, sigalrm) == SIG_ERR) 1651 err("failed to arm SIGALRM"); 1652 alarm(ALARM_INTERVAL_SECS); 1653 1654 set_test_type(argv[1]); 1655 1656 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); 1657 nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size / 1658 nr_cpus; 1659 if (!nr_pages_per_cpu) { 1660 _err("invalid MiB"); 1661 usage(); 1662 } 1663 1664 bounces = atoi(argv[3]); 1665 if (bounces <= 0) { 1666 _err("invalid bounces"); 1667 usage(); 1668 } 1669 nr_pages = nr_pages_per_cpu * nr_cpus; 1670 1671 if (test_type == TEST_HUGETLB) { 1672 if (argc < 5) 1673 usage(); 1674 huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); 1675 if (huge_fd < 0) 1676 err("Open of %s failed", argv[4]); 1677 if (ftruncate(huge_fd, 0)) 1678 err("ftruncate %s to size 0 failed", argv[4]); 1679 } else if (test_type == TEST_SHMEM) { 1680 shm_fd = memfd_create(argv[0], 0); 1681 if (shm_fd < 0) 1682 err("memfd_create"); 1683 if (ftruncate(shm_fd, nr_pages * page_size * 2)) 1684 err("ftruncate"); 1685 if (fallocate(shm_fd, 1686 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 1687 nr_pages * page_size * 2)) 1688 err("fallocate"); 1689 } 1690 printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", 1691 nr_pages, nr_pages_per_cpu); 1692 return userfaultfd_stress(); 1693} 1694 1695#else /* __NR_userfaultfd */ 1696 1697#warning "missing __NR_userfaultfd definition" 1698 1699int main(void) 1700{ 1701 printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); 1702 return KSFT_SKIP; 1703} 1704 1705#endif /* __NR_userfaultfd */