at v6.1-rc4 1878 lines 51 kB view raw
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * Stress userfaultfd syscall. 4 * 5 * Copyright (C) 2015 Red Hat, Inc. 6 * 7 * This test allocates two virtual areas and bounces the physical 8 * memory across the two virtual areas (from area_src to area_dst) 9 * using userfaultfd. 10 * 11 * There are three threads running per CPU: 12 * 13 * 1) one per-CPU thread takes a per-page pthread_mutex in a random 14 * page of the area_dst (while the physical page may still be in 15 * area_src), and increments a per-page counter in the same page, 16 * and checks its value against a verification region. 17 * 18 * 2) another per-CPU thread handles the userfaults generated by 19 * thread 1 above. userfaultfd blocking reads or poll() modes are 20 * exercised interleaved. 21 * 22 * 3) one last per-CPU thread transfers the memory in the background 23 * at maximum bandwidth (if not already transferred by thread 24 * 2). Each cpu thread takes cares of transferring a portion of the 25 * area. 26 * 27 * When all threads of type 3 completed the transfer, one bounce is 28 * complete. area_src and area_dst are then swapped. All threads are 29 * respawned and so the bounce is immediately restarted in the 30 * opposite direction. 31 * 32 * per-CPU threads 1 by triggering userfaults inside 33 * pthread_mutex_lock will also verify the atomicity of the memory 34 * transfer (UFFDIO_COPY). 35 */ 36 37#define _GNU_SOURCE 38#include <stdio.h> 39#include <errno.h> 40#include <unistd.h> 41#include <stdlib.h> 42#include <sys/types.h> 43#include <sys/stat.h> 44#include <fcntl.h> 45#include <time.h> 46#include <signal.h> 47#include <poll.h> 48#include <string.h> 49#include <linux/mman.h> 50#include <sys/mman.h> 51#include <sys/syscall.h> 52#include <sys/ioctl.h> 53#include <sys/wait.h> 54#include <pthread.h> 55#include <linux/userfaultfd.h> 56#include <setjmp.h> 57#include <stdbool.h> 58#include <assert.h> 59#include <inttypes.h> 60#include <stdint.h> 61#include <sys/random.h> 62 63#include "../kselftest.h" 64#include "vm_util.h" 65 66#ifdef __NR_userfaultfd 67 68static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; 69 70#define BOUNCE_RANDOM (1<<0) 71#define BOUNCE_RACINGFAULTS (1<<1) 72#define BOUNCE_VERIFY (1<<2) 73#define BOUNCE_POLL (1<<3) 74static int bounces; 75 76#define TEST_ANON 1 77#define TEST_HUGETLB 2 78#define TEST_SHMEM 3 79static int test_type; 80 81#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) 82 83#define BASE_PMD_ADDR ((void *)(1UL << 30)) 84 85/* test using /dev/userfaultfd, instead of userfaultfd(2) */ 86static bool test_dev_userfaultfd; 87 88/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ 89#define ALARM_INTERVAL_SECS 10 90static volatile bool test_uffdio_copy_eexist = true; 91static volatile bool test_uffdio_zeropage_eexist = true; 92/* Whether to test uffd write-protection */ 93static bool test_uffdio_wp = true; 94/* Whether to test uffd minor faults */ 95static bool test_uffdio_minor = false; 96 97static bool map_shared; 98static int shm_fd; 99static int huge_fd; 100static unsigned long long *count_verify; 101static int uffd = -1; 102static int uffd_flags, finished, *pipefd; 103static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; 104static char *zeropage; 105pthread_attr_t attr; 106static bool test_collapse; 107 108/* Userfaultfd test statistics */ 109struct uffd_stats { 110 int cpu; 111 unsigned long missing_faults; 112 unsigned long wp_faults; 113 unsigned long minor_faults; 114}; 115 116/* pthread_mutex_t starts at page offset 0 */ 117#define area_mutex(___area, ___nr) \ 118 ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) 119/* 120 * count is placed in the page after pthread_mutex_t naturally aligned 121 * to avoid non alignment faults on non-x86 archs. 122 */ 123#define area_count(___area, ___nr) \ 124 ((volatile unsigned long long *) ((unsigned long) \ 125 ((___area) + (___nr)*page_size + \ 126 sizeof(pthread_mutex_t) + \ 127 sizeof(unsigned long long) - 1) & \ 128 ~(unsigned long)(sizeof(unsigned long long) \ 129 - 1))) 130 131#define swap(a, b) \ 132 do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) 133 134#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) 135 136const char *examples = 137 "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" 138 "./userfaultfd anon 100 99999\n\n" 139 "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" 140 "./userfaultfd anon:dev 100 99999\n\n" 141 "# Run share memory test on 1GiB region with 99 bounces:\n" 142 "./userfaultfd shmem 1000 99\n\n" 143 "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" 144 "./userfaultfd hugetlb 256 50\n\n" 145 "# Run the same hugetlb test but using shared file:\n" 146 "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n" 147 "# 10MiB-~6GiB 999 bounces anonymous test, " 148 "continue forever unless an error triggers\n" 149 "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; 150 151static void usage(void) 152{ 153 fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> " 154 "[hugetlbfs_file]\n\n"); 155 fprintf(stderr, "Supported <test type>: anon, hugetlb, " 156 "hugetlb_shared, shmem\n\n"); 157 fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. " 158 "Supported mods:\n"); 159 fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); 160 fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); 161 fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" 162 "memory\n"); 163 fprintf(stderr, "\nExample test mod usage:\n"); 164 fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); 165 fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); 166 167 fprintf(stderr, "Examples:\n\n"); 168 fprintf(stderr, "%s", examples); 169 exit(1); 170} 171 172#define _err(fmt, ...) \ 173 do { \ 174 int ret = errno; \ 175 fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ 176 fprintf(stderr, " (errno=%d, line=%d)\n", \ 177 ret, __LINE__); \ 178 } while (0) 179 180#define errexit(exitcode, fmt, ...) \ 181 do { \ 182 _err(fmt, ##__VA_ARGS__); \ 183 exit(exitcode); \ 184 } while (0) 185 186#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__) 187 188static void uffd_stats_reset(struct uffd_stats *uffd_stats, 189 unsigned long n_cpus) 190{ 191 int i; 192 193 for (i = 0; i < n_cpus; i++) { 194 uffd_stats[i].cpu = i; 195 uffd_stats[i].missing_faults = 0; 196 uffd_stats[i].wp_faults = 0; 197 uffd_stats[i].minor_faults = 0; 198 } 199} 200 201static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) 202{ 203 int i; 204 unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; 205 206 for (i = 0; i < n_cpus; i++) { 207 miss_total += stats[i].missing_faults; 208 wp_total += stats[i].wp_faults; 209 minor_total += stats[i].minor_faults; 210 } 211 212 printf("userfaults: "); 213 if (miss_total) { 214 printf("%llu missing (", miss_total); 215 for (i = 0; i < n_cpus; i++) 216 printf("%lu+", stats[i].missing_faults); 217 printf("\b) "); 218 } 219 if (wp_total) { 220 printf("%llu wp (", wp_total); 221 for (i = 0; i < n_cpus; i++) 222 printf("%lu+", stats[i].wp_faults); 223 printf("\b) "); 224 } 225 if (minor_total) { 226 printf("%llu minor (", minor_total); 227 for (i = 0; i < n_cpus; i++) 228 printf("%lu+", stats[i].minor_faults); 229 printf("\b)"); 230 } 231 printf("\n"); 232} 233 234static void anon_release_pages(char *rel_area) 235{ 236 if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) 237 err("madvise(MADV_DONTNEED) failed"); 238} 239 240static void anon_allocate_area(void **alloc_area, bool is_src) 241{ 242 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, 243 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); 244} 245 246static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) 247{ 248} 249 250static void hugetlb_release_pages(char *rel_area) 251{ 252 if (!map_shared) { 253 if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) 254 err("madvise(MADV_DONTNEED) failed"); 255 } else { 256 if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) 257 err("madvise(MADV_REMOVE) failed"); 258 } 259} 260 261static void hugetlb_allocate_area(void **alloc_area, bool is_src) 262{ 263 void *area_alias = NULL; 264 char **alloc_area_alias; 265 266 if (!map_shared) 267 *alloc_area = mmap(NULL, 268 nr_pages * page_size, 269 PROT_READ | PROT_WRITE, 270 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | 271 (is_src ? 0 : MAP_NORESERVE), 272 -1, 273 0); 274 else 275 *alloc_area = mmap(NULL, 276 nr_pages * page_size, 277 PROT_READ | PROT_WRITE, 278 MAP_SHARED | 279 (is_src ? 0 : MAP_NORESERVE), 280 huge_fd, 281 is_src ? 0 : nr_pages * page_size); 282 if (*alloc_area == MAP_FAILED) 283 err("mmap of hugetlbfs file failed"); 284 285 if (map_shared) { 286 area_alias = mmap(NULL, 287 nr_pages * page_size, 288 PROT_READ | PROT_WRITE, 289 MAP_SHARED, 290 huge_fd, 291 is_src ? 0 : nr_pages * page_size); 292 if (area_alias == MAP_FAILED) 293 err("mmap of hugetlb file alias failed"); 294 } 295 296 if (is_src) { 297 alloc_area_alias = &area_src_alias; 298 } else { 299 alloc_area_alias = &area_dst_alias; 300 } 301 if (area_alias) 302 *alloc_area_alias = area_alias; 303} 304 305static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) 306{ 307 if (!map_shared) 308 return; 309 310 *start = (unsigned long) area_dst_alias + offset; 311} 312 313static void shmem_release_pages(char *rel_area) 314{ 315 if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) 316 err("madvise(MADV_REMOVE) failed"); 317} 318 319static void shmem_allocate_area(void **alloc_area, bool is_src) 320{ 321 void *area_alias = NULL; 322 size_t bytes = nr_pages * page_size; 323 unsigned long offset = is_src ? 0 : bytes; 324 char *p = NULL, *p_alias = NULL; 325 326 if (test_collapse) { 327 p = BASE_PMD_ADDR; 328 if (!is_src) 329 /* src map + alias + interleaved hpages */ 330 p += 2 * (bytes + hpage_size); 331 p_alias = p; 332 p_alias += bytes; 333 p_alias += hpage_size; /* Prevent src/dst VMA merge */ 334 } 335 336 *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, 337 shm_fd, offset); 338 if (*alloc_area == MAP_FAILED) 339 err("mmap of memfd failed"); 340 if (test_collapse && *alloc_area != p) 341 err("mmap of memfd failed at %p", p); 342 343 area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, 344 shm_fd, offset); 345 if (area_alias == MAP_FAILED) 346 err("mmap of memfd alias failed"); 347 if (test_collapse && area_alias != p_alias) 348 err("mmap of anonymous memory failed at %p", p_alias); 349 350 if (is_src) 351 area_src_alias = area_alias; 352 else 353 area_dst_alias = area_alias; 354} 355 356static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) 357{ 358 *start = (unsigned long)area_dst_alias + offset; 359} 360 361static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) 362{ 363 if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) 364 err("Did not find expected %d number of hugepages", 365 expect_nr_hpages); 366} 367 368struct uffd_test_ops { 369 void (*allocate_area)(void **alloc_area, bool is_src); 370 void (*release_pages)(char *rel_area); 371 void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); 372 void (*check_pmd_mapping)(void *p, int expect_nr_hpages); 373}; 374 375static struct uffd_test_ops anon_uffd_test_ops = { 376 .allocate_area = anon_allocate_area, 377 .release_pages = anon_release_pages, 378 .alias_mapping = noop_alias_mapping, 379 .check_pmd_mapping = NULL, 380}; 381 382static struct uffd_test_ops shmem_uffd_test_ops = { 383 .allocate_area = shmem_allocate_area, 384 .release_pages = shmem_release_pages, 385 .alias_mapping = shmem_alias_mapping, 386 .check_pmd_mapping = shmem_check_pmd_mapping, 387}; 388 389static struct uffd_test_ops hugetlb_uffd_test_ops = { 390 .allocate_area = hugetlb_allocate_area, 391 .release_pages = hugetlb_release_pages, 392 .alias_mapping = hugetlb_alias_mapping, 393 .check_pmd_mapping = NULL, 394}; 395 396static struct uffd_test_ops *uffd_test_ops; 397 398static inline uint64_t uffd_minor_feature(void) 399{ 400 if (test_type == TEST_HUGETLB && map_shared) 401 return UFFD_FEATURE_MINOR_HUGETLBFS; 402 else if (test_type == TEST_SHMEM) 403 return UFFD_FEATURE_MINOR_SHMEM; 404 else 405 return 0; 406} 407 408static uint64_t get_expected_ioctls(uint64_t mode) 409{ 410 uint64_t ioctls = UFFD_API_RANGE_IOCTLS; 411 412 if (test_type == TEST_HUGETLB) 413 ioctls &= ~(1 << _UFFDIO_ZEROPAGE); 414 415 if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) 416 ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); 417 418 if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) 419 ioctls &= ~(1 << _UFFDIO_CONTINUE); 420 421 return ioctls; 422} 423 424static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) 425{ 426 uint64_t expected = get_expected_ioctls(mode); 427 uint64_t actual = ioctls & expected; 428 429 if (actual != expected) { 430 err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, 431 expected, actual); 432 } 433} 434 435static int __userfaultfd_open_dev(void) 436{ 437 int fd, _uffd; 438 439 fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); 440 if (fd < 0) 441 errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); 442 443 _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); 444 if (_uffd < 0) 445 errexit(errno == ENOTTY ? KSFT_SKIP : 1, 446 "creating userfaultfd failed"); 447 close(fd); 448 return _uffd; 449} 450 451static void userfaultfd_open(uint64_t *features) 452{ 453 struct uffdio_api uffdio_api; 454 455 if (test_dev_userfaultfd) 456 uffd = __userfaultfd_open_dev(); 457 else { 458 uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); 459 if (uffd < 0) 460 errexit(errno == ENOSYS ? KSFT_SKIP : 1, 461 "creating userfaultfd failed"); 462 } 463 uffd_flags = fcntl(uffd, F_GETFD, NULL); 464 465 uffdio_api.api = UFFD_API; 466 uffdio_api.features = *features; 467 if (ioctl(uffd, UFFDIO_API, &uffdio_api)) 468 err("UFFDIO_API failed.\nPlease make sure to " 469 "run with either root or ptrace capability."); 470 if (uffdio_api.api != UFFD_API) 471 err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); 472 473 *features = uffdio_api.features; 474} 475 476static inline void munmap_area(void **area) 477{ 478 if (*area) 479 if (munmap(*area, nr_pages * page_size)) 480 err("munmap"); 481 482 *area = NULL; 483} 484 485static void uffd_test_ctx_clear(void) 486{ 487 size_t i; 488 489 if (pipefd) { 490 for (i = 0; i < nr_cpus * 2; ++i) { 491 if (close(pipefd[i])) 492 err("close pipefd"); 493 } 494 free(pipefd); 495 pipefd = NULL; 496 } 497 498 if (count_verify) { 499 free(count_verify); 500 count_verify = NULL; 501 } 502 503 if (uffd != -1) { 504 if (close(uffd)) 505 err("close uffd"); 506 uffd = -1; 507 } 508 509 munmap_area((void **)&area_src); 510 munmap_area((void **)&area_src_alias); 511 munmap_area((void **)&area_dst); 512 munmap_area((void **)&area_dst_alias); 513 munmap_area((void **)&area_remap); 514} 515 516static void uffd_test_ctx_init(uint64_t features) 517{ 518 unsigned long nr, cpu; 519 520 uffd_test_ctx_clear(); 521 522 uffd_test_ops->allocate_area((void **)&area_src, true); 523 uffd_test_ops->allocate_area((void **)&area_dst, false); 524 525 userfaultfd_open(&features); 526 527 count_verify = malloc(nr_pages * sizeof(unsigned long long)); 528 if (!count_verify) 529 err("count_verify"); 530 531 for (nr = 0; nr < nr_pages; nr++) { 532 *area_mutex(area_src, nr) = 533 (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; 534 count_verify[nr] = *area_count(area_src, nr) = 1; 535 /* 536 * In the transition between 255 to 256, powerpc will 537 * read out of order in my_bcmp and see both bytes as 538 * zero, so leave a placeholder below always non-zero 539 * after the count, to avoid my_bcmp to trigger false 540 * positives. 541 */ 542 *(area_count(area_src, nr) + 1) = 1; 543 } 544 545 /* 546 * After initialization of area_src, we must explicitly release pages 547 * for area_dst to make sure it's fully empty. Otherwise we could have 548 * some area_dst pages be errornously initialized with zero pages, 549 * hence we could hit memory corruption later in the test. 550 * 551 * One example is when THP is globally enabled, above allocate_area() 552 * calls could have the two areas merged into a single VMA (as they 553 * will have the same VMA flags so they're mergeable). When we 554 * initialize the area_src above, it's possible that some part of 555 * area_dst could have been faulted in via one huge THP that will be 556 * shared between area_src and area_dst. It could cause some of the 557 * area_dst won't be trapped by missing userfaults. 558 * 559 * This release_pages() will guarantee even if that happened, we'll 560 * proactively split the thp and drop any accidentally initialized 561 * pages within area_dst. 562 */ 563 uffd_test_ops->release_pages(area_dst); 564 565 pipefd = malloc(sizeof(int) * nr_cpus * 2); 566 if (!pipefd) 567 err("pipefd"); 568 for (cpu = 0; cpu < nr_cpus; cpu++) 569 if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) 570 err("pipe"); 571} 572 573static int my_bcmp(char *str1, char *str2, size_t n) 574{ 575 unsigned long i; 576 for (i = 0; i < n; i++) 577 if (str1[i] != str2[i]) 578 return 1; 579 return 0; 580} 581 582static void wp_range(int ufd, __u64 start, __u64 len, bool wp) 583{ 584 struct uffdio_writeprotect prms; 585 586 /* Write protection page faults */ 587 prms.range.start = start; 588 prms.range.len = len; 589 /* Undo write-protect, do wakeup after that */ 590 prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; 591 592 if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) 593 err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); 594} 595 596static void continue_range(int ufd, __u64 start, __u64 len) 597{ 598 struct uffdio_continue req; 599 int ret; 600 601 req.range.start = start; 602 req.range.len = len; 603 req.mode = 0; 604 605 if (ioctl(ufd, UFFDIO_CONTINUE, &req)) 606 err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, 607 (uint64_t)start); 608 609 /* 610 * Error handling within the kernel for continue is subtly different 611 * from copy or zeropage, so it may be a source of bugs. Trigger an 612 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. 613 */ 614 req.mapped = 0; 615 ret = ioctl(ufd, UFFDIO_CONTINUE, &req); 616 if (ret >= 0 || req.mapped != -EEXIST) 617 err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, 618 ret, (int64_t) req.mapped); 619} 620 621static void *locking_thread(void *arg) 622{ 623 unsigned long cpu = (unsigned long) arg; 624 unsigned long page_nr; 625 unsigned long long count; 626 627 if (!(bounces & BOUNCE_RANDOM)) { 628 page_nr = -bounces; 629 if (!(bounces & BOUNCE_RACINGFAULTS)) 630 page_nr += cpu * nr_pages_per_cpu; 631 } 632 633 while (!finished) { 634 if (bounces & BOUNCE_RANDOM) { 635 if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) 636 err("getrandom failed"); 637 } else 638 page_nr += 1; 639 page_nr %= nr_pages; 640 pthread_mutex_lock(area_mutex(area_dst, page_nr)); 641 count = *area_count(area_dst, page_nr); 642 if (count != count_verify[page_nr]) 643 err("page_nr %lu memory corruption %llu %llu", 644 page_nr, count, count_verify[page_nr]); 645 count++; 646 *area_count(area_dst, page_nr) = count_verify[page_nr] = count; 647 pthread_mutex_unlock(area_mutex(area_dst, page_nr)); 648 } 649 650 return NULL; 651} 652 653static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, 654 unsigned long offset) 655{ 656 uffd_test_ops->alias_mapping(&uffdio_copy->dst, 657 uffdio_copy->len, 658 offset); 659 if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { 660 /* real retval in ufdio_copy.copy */ 661 if (uffdio_copy->copy != -EEXIST) 662 err("UFFDIO_COPY retry error: %"PRId64, 663 (int64_t)uffdio_copy->copy); 664 } else { 665 err("UFFDIO_COPY retry unexpected: %"PRId64, 666 (int64_t)uffdio_copy->copy); 667 } 668} 669 670static void wake_range(int ufd, unsigned long addr, unsigned long len) 671{ 672 struct uffdio_range uffdio_wake; 673 674 uffdio_wake.start = addr; 675 uffdio_wake.len = len; 676 677 if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) 678 fprintf(stderr, "error waking %lu\n", 679 addr), exit(1); 680} 681 682static int __copy_page(int ufd, unsigned long offset, bool retry) 683{ 684 struct uffdio_copy uffdio_copy; 685 686 if (offset >= nr_pages * page_size) 687 err("unexpected offset %lu\n", offset); 688 uffdio_copy.dst = (unsigned long) area_dst + offset; 689 uffdio_copy.src = (unsigned long) area_src + offset; 690 uffdio_copy.len = page_size; 691 if (test_uffdio_wp) 692 uffdio_copy.mode = UFFDIO_COPY_MODE_WP; 693 else 694 uffdio_copy.mode = 0; 695 uffdio_copy.copy = 0; 696 if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { 697 /* real retval in ufdio_copy.copy */ 698 if (uffdio_copy.copy != -EEXIST) 699 err("UFFDIO_COPY error: %"PRId64, 700 (int64_t)uffdio_copy.copy); 701 wake_range(ufd, uffdio_copy.dst, page_size); 702 } else if (uffdio_copy.copy != page_size) { 703 err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); 704 } else { 705 if (test_uffdio_copy_eexist && retry) { 706 test_uffdio_copy_eexist = false; 707 retry_copy_page(ufd, &uffdio_copy, offset); 708 } 709 return 1; 710 } 711 return 0; 712} 713 714static int copy_page_retry(int ufd, unsigned long offset) 715{ 716 return __copy_page(ufd, offset, true); 717} 718 719static int copy_page(int ufd, unsigned long offset) 720{ 721 return __copy_page(ufd, offset, false); 722} 723 724static int uffd_read_msg(int ufd, struct uffd_msg *msg) 725{ 726 int ret = read(uffd, msg, sizeof(*msg)); 727 728 if (ret != sizeof(*msg)) { 729 if (ret < 0) { 730 if (errno == EAGAIN || errno == EINTR) 731 return 1; 732 err("blocking read error"); 733 } else { 734 err("short read"); 735 } 736 } 737 738 return 0; 739} 740 741static void uffd_handle_page_fault(struct uffd_msg *msg, 742 struct uffd_stats *stats) 743{ 744 unsigned long offset; 745 746 if (msg->event != UFFD_EVENT_PAGEFAULT) 747 err("unexpected msg event %u", msg->event); 748 749 if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { 750 /* Write protect page faults */ 751 wp_range(uffd, msg->arg.pagefault.address, page_size, false); 752 stats->wp_faults++; 753 } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { 754 uint8_t *area; 755 int b; 756 757 /* 758 * Minor page faults 759 * 760 * To prove we can modify the original range for testing 761 * purposes, we're going to bit flip this range before 762 * continuing. 763 * 764 * Note that this requires all minor page fault tests operate on 765 * area_dst (non-UFFD-registered) and area_dst_alias 766 * (UFFD-registered). 767 */ 768 769 area = (uint8_t *)(area_dst + 770 ((char *)msg->arg.pagefault.address - 771 area_dst_alias)); 772 for (b = 0; b < page_size; ++b) 773 area[b] = ~area[b]; 774 continue_range(uffd, msg->arg.pagefault.address, page_size); 775 stats->minor_faults++; 776 } else { 777 /* 778 * Missing page faults. 779 * 780 * Here we force a write check for each of the missing mode 781 * faults. It's guaranteed because the only threads that 782 * will trigger uffd faults are the locking threads, and 783 * their first instruction to touch the missing page will 784 * always be pthread_mutex_lock(). 785 * 786 * Note that here we relied on an NPTL glibc impl detail to 787 * always read the lock type at the entry of the lock op 788 * (pthread_mutex_t.__data.__type, offset 0x10) before 789 * doing any locking operations to guarantee that. It's 790 * actually not good to rely on this impl detail because 791 * logically a pthread-compatible lib can implement the 792 * locks without types and we can fail when linking with 793 * them. However since we used to find bugs with this 794 * strict check we still keep it around. Hopefully this 795 * could be a good hint when it fails again. If one day 796 * it'll break on some other impl of glibc we'll revisit. 797 */ 798 if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) 799 err("unexpected write fault"); 800 801 offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; 802 offset &= ~(page_size-1); 803 804 if (copy_page(uffd, offset)) 805 stats->missing_faults++; 806 } 807} 808 809static void *uffd_poll_thread(void *arg) 810{ 811 struct uffd_stats *stats = (struct uffd_stats *)arg; 812 unsigned long cpu = stats->cpu; 813 struct pollfd pollfd[2]; 814 struct uffd_msg msg; 815 struct uffdio_register uffd_reg; 816 int ret; 817 char tmp_chr; 818 819 pollfd[0].fd = uffd; 820 pollfd[0].events = POLLIN; 821 pollfd[1].fd = pipefd[cpu*2]; 822 pollfd[1].events = POLLIN; 823 824 for (;;) { 825 ret = poll(pollfd, 2, -1); 826 if (ret <= 0) { 827 if (errno == EINTR || errno == EAGAIN) 828 continue; 829 err("poll error: %d", ret); 830 } 831 if (pollfd[1].revents & POLLIN) { 832 if (read(pollfd[1].fd, &tmp_chr, 1) != 1) 833 err("read pipefd error"); 834 break; 835 } 836 if (!(pollfd[0].revents & POLLIN)) 837 err("pollfd[0].revents %d", pollfd[0].revents); 838 if (uffd_read_msg(uffd, &msg)) 839 continue; 840 switch (msg.event) { 841 default: 842 err("unexpected msg event %u\n", msg.event); 843 break; 844 case UFFD_EVENT_PAGEFAULT: 845 uffd_handle_page_fault(&msg, stats); 846 break; 847 case UFFD_EVENT_FORK: 848 close(uffd); 849 uffd = msg.arg.fork.ufd; 850 pollfd[0].fd = uffd; 851 break; 852 case UFFD_EVENT_REMOVE: 853 uffd_reg.range.start = msg.arg.remove.start; 854 uffd_reg.range.len = msg.arg.remove.end - 855 msg.arg.remove.start; 856 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) 857 err("remove failure"); 858 break; 859 case UFFD_EVENT_REMAP: 860 area_remap = area_dst; /* save for later unmap */ 861 area_dst = (char *)(unsigned long)msg.arg.remap.to; 862 break; 863 } 864 } 865 866 return NULL; 867} 868 869pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; 870 871static void *uffd_read_thread(void *arg) 872{ 873 struct uffd_stats *stats = (struct uffd_stats *)arg; 874 struct uffd_msg msg; 875 876 pthread_mutex_unlock(&uffd_read_mutex); 877 /* from here cancellation is ok */ 878 879 for (;;) { 880 if (uffd_read_msg(uffd, &msg)) 881 continue; 882 uffd_handle_page_fault(&msg, stats); 883 } 884 885 return NULL; 886} 887 888static void *background_thread(void *arg) 889{ 890 unsigned long cpu = (unsigned long) arg; 891 unsigned long page_nr, start_nr, mid_nr, end_nr; 892 893 start_nr = cpu * nr_pages_per_cpu; 894 end_nr = (cpu+1) * nr_pages_per_cpu; 895 mid_nr = (start_nr + end_nr) / 2; 896 897 /* Copy the first half of the pages */ 898 for (page_nr = start_nr; page_nr < mid_nr; page_nr++) 899 copy_page_retry(uffd, page_nr * page_size); 900 901 /* 902 * If we need to test uffd-wp, set it up now. Then we'll have 903 * at least the first half of the pages mapped already which 904 * can be write-protected for testing 905 */ 906 if (test_uffdio_wp) 907 wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, 908 nr_pages_per_cpu * page_size, true); 909 910 /* 911 * Continue the 2nd half of the page copying, handling write 912 * protection faults if any 913 */ 914 for (page_nr = mid_nr; page_nr < end_nr; page_nr++) 915 copy_page_retry(uffd, page_nr * page_size); 916 917 return NULL; 918} 919 920static int stress(struct uffd_stats *uffd_stats) 921{ 922 unsigned long cpu; 923 pthread_t locking_threads[nr_cpus]; 924 pthread_t uffd_threads[nr_cpus]; 925 pthread_t background_threads[nr_cpus]; 926 927 finished = 0; 928 for (cpu = 0; cpu < nr_cpus; cpu++) { 929 if (pthread_create(&locking_threads[cpu], &attr, 930 locking_thread, (void *)cpu)) 931 return 1; 932 if (bounces & BOUNCE_POLL) { 933 if (pthread_create(&uffd_threads[cpu], &attr, 934 uffd_poll_thread, 935 (void *)&uffd_stats[cpu])) 936 return 1; 937 } else { 938 if (pthread_create(&uffd_threads[cpu], &attr, 939 uffd_read_thread, 940 (void *)&uffd_stats[cpu])) 941 return 1; 942 pthread_mutex_lock(&uffd_read_mutex); 943 } 944 if (pthread_create(&background_threads[cpu], &attr, 945 background_thread, (void *)cpu)) 946 return 1; 947 } 948 for (cpu = 0; cpu < nr_cpus; cpu++) 949 if (pthread_join(background_threads[cpu], NULL)) 950 return 1; 951 952 /* 953 * Be strict and immediately zap area_src, the whole area has 954 * been transferred already by the background treads. The 955 * area_src could then be faulted in a racy way by still 956 * running uffdio_threads reading zeropages after we zapped 957 * area_src (but they're guaranteed to get -EEXIST from 958 * UFFDIO_COPY without writing zero pages into area_dst 959 * because the background threads already completed). 960 */ 961 uffd_test_ops->release_pages(area_src); 962 963 finished = 1; 964 for (cpu = 0; cpu < nr_cpus; cpu++) 965 if (pthread_join(locking_threads[cpu], NULL)) 966 return 1; 967 968 for (cpu = 0; cpu < nr_cpus; cpu++) { 969 char c; 970 if (bounces & BOUNCE_POLL) { 971 if (write(pipefd[cpu*2+1], &c, 1) != 1) 972 err("pipefd write error"); 973 if (pthread_join(uffd_threads[cpu], 974 (void *)&uffd_stats[cpu])) 975 return 1; 976 } else { 977 if (pthread_cancel(uffd_threads[cpu])) 978 return 1; 979 if (pthread_join(uffd_threads[cpu], NULL)) 980 return 1; 981 } 982 } 983 984 return 0; 985} 986 987sigjmp_buf jbuf, *sigbuf; 988 989static void sighndl(int sig, siginfo_t *siginfo, void *ptr) 990{ 991 if (sig == SIGBUS) { 992 if (sigbuf) 993 siglongjmp(*sigbuf, 1); 994 abort(); 995 } 996} 997 998/* 999 * For non-cooperative userfaultfd test we fork() a process that will 1000 * generate pagefaults, will mremap the area monitored by the 1001 * userfaultfd and at last this process will release the monitored 1002 * area. 1003 * For the anonymous and shared memory the area is divided into two 1004 * parts, the first part is accessed before mremap, and the second 1005 * part is accessed after mremap. Since hugetlbfs does not support 1006 * mremap, the entire monitored area is accessed in a single pass for 1007 * HUGETLB_TEST. 1008 * The release of the pages currently generates event for shmem and 1009 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked 1010 * for hugetlb. 1011 * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register 1012 * monitored area, generate pagefaults and test that signal is delivered. 1013 * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2 1014 * test robustness use case - we release monitored area, fork a process 1015 * that will generate pagefaults and verify signal is generated. 1016 * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal 1017 * feature. Using monitor thread, verify no userfault events are generated. 1018 */ 1019static int faulting_process(int signal_test) 1020{ 1021 unsigned long nr; 1022 unsigned long long count; 1023 unsigned long split_nr_pages; 1024 unsigned long lastnr; 1025 struct sigaction act; 1026 volatile unsigned long signalled = 0; 1027 1028 split_nr_pages = (nr_pages + 1) / 2; 1029 1030 if (signal_test) { 1031 sigbuf = &jbuf; 1032 memset(&act, 0, sizeof(act)); 1033 act.sa_sigaction = sighndl; 1034 act.sa_flags = SA_SIGINFO; 1035 if (sigaction(SIGBUS, &act, 0)) 1036 err("sigaction"); 1037 lastnr = (unsigned long)-1; 1038 } 1039 1040 for (nr = 0; nr < split_nr_pages; nr++) { 1041 volatile int steps = 1; 1042 unsigned long offset = nr * page_size; 1043 1044 if (signal_test) { 1045 if (sigsetjmp(*sigbuf, 1) != 0) { 1046 if (steps == 1 && nr == lastnr) 1047 err("Signal repeated"); 1048 1049 lastnr = nr; 1050 if (signal_test == 1) { 1051 if (steps == 1) { 1052 /* This is a MISSING request */ 1053 steps++; 1054 if (copy_page(uffd, offset)) 1055 signalled++; 1056 } else { 1057 /* This is a WP request */ 1058 assert(steps == 2); 1059 wp_range(uffd, 1060 (__u64)area_dst + 1061 offset, 1062 page_size, false); 1063 } 1064 } else { 1065 signalled++; 1066 continue; 1067 } 1068 } 1069 } 1070 1071 count = *area_count(area_dst, nr); 1072 if (count != count_verify[nr]) 1073 err("nr %lu memory corruption %llu %llu\n", 1074 nr, count, count_verify[nr]); 1075 /* 1076 * Trigger write protection if there is by writing 1077 * the same value back. 1078 */ 1079 *area_count(area_dst, nr) = count; 1080 } 1081 1082 if (signal_test) 1083 return signalled != split_nr_pages; 1084 1085 area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, 1086 MREMAP_MAYMOVE | MREMAP_FIXED, area_src); 1087 if (area_dst == MAP_FAILED) 1088 err("mremap"); 1089 /* Reset area_src since we just clobbered it */ 1090 area_src = NULL; 1091 1092 for (; nr < nr_pages; nr++) { 1093 count = *area_count(area_dst, nr); 1094 if (count != count_verify[nr]) { 1095 err("nr %lu memory corruption %llu %llu\n", 1096 nr, count, count_verify[nr]); 1097 } 1098 /* 1099 * Trigger write protection if there is by writing 1100 * the same value back. 1101 */ 1102 *area_count(area_dst, nr) = count; 1103 } 1104 1105 uffd_test_ops->release_pages(area_dst); 1106 1107 for (nr = 0; nr < nr_pages; nr++) 1108 if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) 1109 err("nr %lu is not zero", nr); 1110 1111 return 0; 1112} 1113 1114static void retry_uffdio_zeropage(int ufd, 1115 struct uffdio_zeropage *uffdio_zeropage, 1116 unsigned long offset) 1117{ 1118 uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, 1119 uffdio_zeropage->range.len, 1120 offset); 1121 if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { 1122 if (uffdio_zeropage->zeropage != -EEXIST) 1123 err("UFFDIO_ZEROPAGE error: %"PRId64, 1124 (int64_t)uffdio_zeropage->zeropage); 1125 } else { 1126 err("UFFDIO_ZEROPAGE error: %"PRId64, 1127 (int64_t)uffdio_zeropage->zeropage); 1128 } 1129} 1130 1131static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) 1132{ 1133 struct uffdio_zeropage uffdio_zeropage; 1134 int ret; 1135 bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE); 1136 __s64 res; 1137 1138 if (offset >= nr_pages * page_size) 1139 err("unexpected offset %lu", offset); 1140 uffdio_zeropage.range.start = (unsigned long) area_dst + offset; 1141 uffdio_zeropage.range.len = page_size; 1142 uffdio_zeropage.mode = 0; 1143 ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); 1144 res = uffdio_zeropage.zeropage; 1145 if (ret) { 1146 /* real retval in ufdio_zeropage.zeropage */ 1147 if (has_zeropage) 1148 err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); 1149 else if (res != -EINVAL) 1150 err("UFFDIO_ZEROPAGE not -EINVAL"); 1151 } else if (has_zeropage) { 1152 if (res != page_size) { 1153 err("UFFDIO_ZEROPAGE unexpected size"); 1154 } else { 1155 if (test_uffdio_zeropage_eexist && retry) { 1156 test_uffdio_zeropage_eexist = false; 1157 retry_uffdio_zeropage(ufd, &uffdio_zeropage, 1158 offset); 1159 } 1160 return 1; 1161 } 1162 } else 1163 err("UFFDIO_ZEROPAGE succeeded"); 1164 1165 return 0; 1166} 1167 1168static int uffdio_zeropage(int ufd, unsigned long offset) 1169{ 1170 return __uffdio_zeropage(ufd, offset, false); 1171} 1172 1173/* exercise UFFDIO_ZEROPAGE */ 1174static int userfaultfd_zeropage_test(void) 1175{ 1176 struct uffdio_register uffdio_register; 1177 1178 printf("testing UFFDIO_ZEROPAGE: "); 1179 fflush(stdout); 1180 1181 uffd_test_ctx_init(0); 1182 1183 uffdio_register.range.start = (unsigned long) area_dst; 1184 uffdio_register.range.len = nr_pages * page_size; 1185 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; 1186 if (test_uffdio_wp) 1187 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; 1188 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1189 err("register failure"); 1190 1191 assert_expected_ioctls_present( 1192 uffdio_register.mode, uffdio_register.ioctls); 1193 1194 if (uffdio_zeropage(uffd, 0)) 1195 if (my_bcmp(area_dst, zeropage, page_size)) 1196 err("zeropage is not zero"); 1197 1198 printf("done.\n"); 1199 return 0; 1200} 1201 1202static int userfaultfd_events_test(void) 1203{ 1204 struct uffdio_register uffdio_register; 1205 pthread_t uffd_mon; 1206 int err, features; 1207 pid_t pid; 1208 char c; 1209 struct uffd_stats stats = { 0 }; 1210 1211 printf("testing events (fork, remap, remove): "); 1212 fflush(stdout); 1213 1214 features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | 1215 UFFD_FEATURE_EVENT_REMOVE; 1216 uffd_test_ctx_init(features); 1217 1218 fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); 1219 1220 uffdio_register.range.start = (unsigned long) area_dst; 1221 uffdio_register.range.len = nr_pages * page_size; 1222 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; 1223 if (test_uffdio_wp) 1224 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; 1225 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1226 err("register failure"); 1227 1228 assert_expected_ioctls_present( 1229 uffdio_register.mode, uffdio_register.ioctls); 1230 1231 if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) 1232 err("uffd_poll_thread create"); 1233 1234 pid = fork(); 1235 if (pid < 0) 1236 err("fork"); 1237 1238 if (!pid) 1239 exit(faulting_process(0)); 1240 1241 waitpid(pid, &err, 0); 1242 if (err) 1243 err("faulting process failed"); 1244 if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) 1245 err("pipe write"); 1246 if (pthread_join(uffd_mon, NULL)) 1247 return 1; 1248 1249 uffd_stats_report(&stats, 1); 1250 1251 return stats.missing_faults != nr_pages; 1252} 1253 1254static int userfaultfd_sig_test(void) 1255{ 1256 struct uffdio_register uffdio_register; 1257 unsigned long userfaults; 1258 pthread_t uffd_mon; 1259 int err, features; 1260 pid_t pid; 1261 char c; 1262 struct uffd_stats stats = { 0 }; 1263 1264 printf("testing signal delivery: "); 1265 fflush(stdout); 1266 1267 features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; 1268 uffd_test_ctx_init(features); 1269 1270 fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); 1271 1272 uffdio_register.range.start = (unsigned long) area_dst; 1273 uffdio_register.range.len = nr_pages * page_size; 1274 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; 1275 if (test_uffdio_wp) 1276 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; 1277 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1278 err("register failure"); 1279 1280 assert_expected_ioctls_present( 1281 uffdio_register.mode, uffdio_register.ioctls); 1282 1283 if (faulting_process(1)) 1284 err("faulting process failed"); 1285 1286 uffd_test_ops->release_pages(area_dst); 1287 1288 if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) 1289 err("uffd_poll_thread create"); 1290 1291 pid = fork(); 1292 if (pid < 0) 1293 err("fork"); 1294 1295 if (!pid) 1296 exit(faulting_process(2)); 1297 1298 waitpid(pid, &err, 0); 1299 if (err) 1300 err("faulting process failed"); 1301 if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) 1302 err("pipe write"); 1303 if (pthread_join(uffd_mon, (void **)&userfaults)) 1304 return 1; 1305 1306 printf("done.\n"); 1307 if (userfaults) 1308 err("Signal test failed, userfaults: %ld", userfaults); 1309 1310 return userfaults != 0; 1311} 1312 1313void check_memory_contents(char *p) 1314{ 1315 unsigned long i; 1316 uint8_t expected_byte; 1317 void *expected_page; 1318 1319 if (posix_memalign(&expected_page, page_size, page_size)) 1320 err("out of memory"); 1321 1322 for (i = 0; i < nr_pages; ++i) { 1323 expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); 1324 memset(expected_page, expected_byte, page_size); 1325 if (my_bcmp(expected_page, p + (i * page_size), page_size)) 1326 err("unexpected page contents after minor fault"); 1327 } 1328 1329 free(expected_page); 1330} 1331 1332static int userfaultfd_minor_test(void) 1333{ 1334 unsigned long p; 1335 struct uffdio_register uffdio_register; 1336 pthread_t uffd_mon; 1337 char c; 1338 struct uffd_stats stats = { 0 }; 1339 1340 if (!test_uffdio_minor) 1341 return 0; 1342 1343 printf("testing minor faults: "); 1344 fflush(stdout); 1345 1346 uffd_test_ctx_init(uffd_minor_feature()); 1347 1348 uffdio_register.range.start = (unsigned long)area_dst_alias; 1349 uffdio_register.range.len = nr_pages * page_size; 1350 uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; 1351 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1352 err("register failure"); 1353 1354 assert_expected_ioctls_present( 1355 uffdio_register.mode, uffdio_register.ioctls); 1356 1357 /* 1358 * After registering with UFFD, populate the non-UFFD-registered side of 1359 * the shared mapping. This should *not* trigger any UFFD minor faults. 1360 */ 1361 for (p = 0; p < nr_pages; ++p) { 1362 memset(area_dst + (p * page_size), p % ((uint8_t)-1), 1363 page_size); 1364 } 1365 1366 if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) 1367 err("uffd_poll_thread create"); 1368 1369 /* 1370 * Read each of the pages back using the UFFD-registered mapping. We 1371 * expect that the first time we touch a page, it will result in a minor 1372 * fault. uffd_poll_thread will resolve the fault by bit-flipping the 1373 * page's contents, and then issuing a CONTINUE ioctl. 1374 */ 1375 check_memory_contents(area_dst_alias); 1376 1377 if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) 1378 err("pipe write"); 1379 if (pthread_join(uffd_mon, NULL)) 1380 return 1; 1381 1382 uffd_stats_report(&stats, 1); 1383 1384 if (test_collapse) { 1385 printf("testing collapse of uffd memory into PMD-mapped THPs:"); 1386 if (madvise(area_dst_alias, nr_pages * page_size, 1387 MADV_COLLAPSE)) 1388 err("madvise(MADV_COLLAPSE)"); 1389 1390 uffd_test_ops->check_pmd_mapping(area_dst, 1391 nr_pages * page_size / 1392 hpage_size); 1393 /* 1394 * This won't cause uffd-fault - it purely just makes sure there 1395 * was no corruption. 1396 */ 1397 check_memory_contents(area_dst_alias); 1398 printf(" done.\n"); 1399 } 1400 1401 return stats.missing_faults != 0 || stats.minor_faults != nr_pages; 1402} 1403 1404#define BIT_ULL(nr) (1ULL << (nr)) 1405#define PM_SOFT_DIRTY BIT_ULL(55) 1406#define PM_MMAP_EXCLUSIVE BIT_ULL(56) 1407#define PM_UFFD_WP BIT_ULL(57) 1408#define PM_FILE BIT_ULL(61) 1409#define PM_SWAP BIT_ULL(62) 1410#define PM_PRESENT BIT_ULL(63) 1411 1412static int pagemap_open(void) 1413{ 1414 int fd = open("/proc/self/pagemap", O_RDONLY); 1415 1416 if (fd < 0) 1417 err("open pagemap"); 1418 1419 return fd; 1420} 1421 1422static uint64_t pagemap_read_vaddr(int fd, void *vaddr) 1423{ 1424 uint64_t value; 1425 int ret; 1426 1427 ret = pread(fd, &value, sizeof(uint64_t), 1428 ((uint64_t)vaddr >> 12) * sizeof(uint64_t)); 1429 if (ret != sizeof(uint64_t)) 1430 err("pread() on pagemap failed"); 1431 1432 return value; 1433} 1434 1435/* This macro let __LINE__ works in err() */ 1436#define pagemap_check_wp(value, wp) do { \ 1437 if (!!(value & PM_UFFD_WP) != wp) \ 1438 err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ 1439 } while (0) 1440 1441static int pagemap_test_fork(bool present) 1442{ 1443 pid_t child = fork(); 1444 uint64_t value; 1445 int fd, result; 1446 1447 if (!child) { 1448 /* Open the pagemap fd of the child itself */ 1449 fd = pagemap_open(); 1450 value = pagemap_read_vaddr(fd, area_dst); 1451 /* 1452 * After fork() uffd-wp bit should be gone as long as we're 1453 * without UFFD_FEATURE_EVENT_FORK 1454 */ 1455 pagemap_check_wp(value, false); 1456 /* Succeed */ 1457 exit(0); 1458 } 1459 waitpid(child, &result, 0); 1460 return result; 1461} 1462 1463static void userfaultfd_pagemap_test(unsigned int test_pgsize) 1464{ 1465 struct uffdio_register uffdio_register; 1466 int pagemap_fd; 1467 uint64_t value; 1468 1469 /* Pagemap tests uffd-wp only */ 1470 if (!test_uffdio_wp) 1471 return; 1472 1473 /* Not enough memory to test this page size */ 1474 if (test_pgsize > nr_pages * page_size) 1475 return; 1476 1477 printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); 1478 /* Flush so it doesn't flush twice in parent/child later */ 1479 fflush(stdout); 1480 1481 uffd_test_ctx_init(0); 1482 1483 if (test_pgsize > page_size) { 1484 /* This is a thp test */ 1485 if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) 1486 err("madvise(MADV_HUGEPAGE) failed"); 1487 } else if (test_pgsize == page_size) { 1488 /* This is normal page test; force no thp */ 1489 if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) 1490 err("madvise(MADV_NOHUGEPAGE) failed"); 1491 } 1492 1493 uffdio_register.range.start = (unsigned long) area_dst; 1494 uffdio_register.range.len = nr_pages * page_size; 1495 uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; 1496 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1497 err("register failed"); 1498 1499 pagemap_fd = pagemap_open(); 1500 1501 /* Touch the page */ 1502 *area_dst = 1; 1503 wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); 1504 value = pagemap_read_vaddr(pagemap_fd, area_dst); 1505 pagemap_check_wp(value, true); 1506 /* Make sure uffd-wp bit dropped when fork */ 1507 if (pagemap_test_fork(true)) 1508 err("Detected stall uffd-wp bit in child"); 1509 1510 /* Exclusive required or PAGEOUT won't work */ 1511 if (!(value & PM_MMAP_EXCLUSIVE)) 1512 err("multiple mapping detected: 0x%"PRIx64, value); 1513 1514 if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) 1515 err("madvise(MADV_PAGEOUT) failed"); 1516 1517 /* Uffd-wp should persist even swapped out */ 1518 value = pagemap_read_vaddr(pagemap_fd, area_dst); 1519 pagemap_check_wp(value, true); 1520 /* Make sure uffd-wp bit dropped when fork */ 1521 if (pagemap_test_fork(false)) 1522 err("Detected stall uffd-wp bit in child"); 1523 1524 /* Unprotect; this tests swap pte modifications */ 1525 wp_range(uffd, (uint64_t)area_dst, page_size, false); 1526 value = pagemap_read_vaddr(pagemap_fd, area_dst); 1527 pagemap_check_wp(value, false); 1528 1529 /* Fault in the page from disk */ 1530 *area_dst = 2; 1531 value = pagemap_read_vaddr(pagemap_fd, area_dst); 1532 pagemap_check_wp(value, false); 1533 1534 close(pagemap_fd); 1535 printf("done\n"); 1536} 1537 1538static int userfaultfd_stress(void) 1539{ 1540 void *area; 1541 unsigned long nr; 1542 struct uffdio_register uffdio_register; 1543 struct uffd_stats uffd_stats[nr_cpus]; 1544 1545 uffd_test_ctx_init(0); 1546 1547 if (posix_memalign(&area, page_size, page_size)) 1548 err("out of memory"); 1549 zeropage = area; 1550 bzero(zeropage, page_size); 1551 1552 pthread_mutex_lock(&uffd_read_mutex); 1553 1554 pthread_attr_init(&attr); 1555 pthread_attr_setstacksize(&attr, 16*1024*1024); 1556 1557 while (bounces--) { 1558 printf("bounces: %d, mode:", bounces); 1559 if (bounces & BOUNCE_RANDOM) 1560 printf(" rnd"); 1561 if (bounces & BOUNCE_RACINGFAULTS) 1562 printf(" racing"); 1563 if (bounces & BOUNCE_VERIFY) 1564 printf(" ver"); 1565 if (bounces & BOUNCE_POLL) 1566 printf(" poll"); 1567 else 1568 printf(" read"); 1569 printf(", "); 1570 fflush(stdout); 1571 1572 if (bounces & BOUNCE_POLL) 1573 fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); 1574 else 1575 fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); 1576 1577 /* register */ 1578 uffdio_register.range.start = (unsigned long) area_dst; 1579 uffdio_register.range.len = nr_pages * page_size; 1580 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; 1581 if (test_uffdio_wp) 1582 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; 1583 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1584 err("register failure"); 1585 assert_expected_ioctls_present( 1586 uffdio_register.mode, uffdio_register.ioctls); 1587 1588 if (area_dst_alias) { 1589 uffdio_register.range.start = (unsigned long) 1590 area_dst_alias; 1591 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1592 err("register failure alias"); 1593 } 1594 1595 /* 1596 * The madvise done previously isn't enough: some 1597 * uffd_thread could have read userfaults (one of 1598 * those already resolved by the background thread) 1599 * and it may be in the process of calling 1600 * UFFDIO_COPY. UFFDIO_COPY will read the zapped 1601 * area_src and it would map a zero page in it (of 1602 * course such a UFFDIO_COPY is perfectly safe as it'd 1603 * return -EEXIST). The problem comes at the next 1604 * bounce though: that racing UFFDIO_COPY would 1605 * generate zeropages in the area_src, so invalidating 1606 * the previous MADV_DONTNEED. Without this additional 1607 * MADV_DONTNEED those zeropages leftovers in the 1608 * area_src would lead to -EEXIST failure during the 1609 * next bounce, effectively leaving a zeropage in the 1610 * area_dst. 1611 * 1612 * Try to comment this out madvise to see the memory 1613 * corruption being caught pretty quick. 1614 * 1615 * khugepaged is also inhibited to collapse THP after 1616 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's 1617 * required to MADV_DONTNEED here. 1618 */ 1619 uffd_test_ops->release_pages(area_dst); 1620 1621 uffd_stats_reset(uffd_stats, nr_cpus); 1622 1623 /* bounce pass */ 1624 if (stress(uffd_stats)) 1625 return 1; 1626 1627 /* Clear all the write protections if there is any */ 1628 if (test_uffdio_wp) 1629 wp_range(uffd, (unsigned long)area_dst, 1630 nr_pages * page_size, false); 1631 1632 /* unregister */ 1633 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) 1634 err("unregister failure"); 1635 if (area_dst_alias) { 1636 uffdio_register.range.start = (unsigned long) area_dst; 1637 if (ioctl(uffd, UFFDIO_UNREGISTER, 1638 &uffdio_register.range)) 1639 err("unregister failure alias"); 1640 } 1641 1642 /* verification */ 1643 if (bounces & BOUNCE_VERIFY) 1644 for (nr = 0; nr < nr_pages; nr++) 1645 if (*area_count(area_dst, nr) != count_verify[nr]) 1646 err("error area_count %llu %llu %lu\n", 1647 *area_count(area_src, nr), 1648 count_verify[nr], nr); 1649 1650 /* prepare next bounce */ 1651 swap(area_src, area_dst); 1652 1653 swap(area_src_alias, area_dst_alias); 1654 1655 uffd_stats_report(uffd_stats, nr_cpus); 1656 } 1657 1658 if (test_type == TEST_ANON) { 1659 /* 1660 * shmem/hugetlb won't be able to run since they have different 1661 * behavior on fork() (file-backed memory normally drops ptes 1662 * directly when fork), meanwhile the pagemap test will verify 1663 * pgtable entry of fork()ed child. 1664 */ 1665 userfaultfd_pagemap_test(page_size); 1666 /* 1667 * Hard-code for x86_64 for now for 2M THP, as x86_64 is 1668 * currently the only one that supports uffd-wp 1669 */ 1670 userfaultfd_pagemap_test(page_size * 512); 1671 } 1672 1673 return userfaultfd_zeropage_test() || userfaultfd_sig_test() 1674 || userfaultfd_events_test() || userfaultfd_minor_test(); 1675} 1676 1677/* 1678 * Copied from mlock2-tests.c 1679 */ 1680unsigned long default_huge_page_size(void) 1681{ 1682 unsigned long hps = 0; 1683 char *line = NULL; 1684 size_t linelen = 0; 1685 FILE *f = fopen("/proc/meminfo", "r"); 1686 1687 if (!f) 1688 return 0; 1689 while (getline(&line, &linelen, f) > 0) { 1690 if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { 1691 hps <<= 10; 1692 break; 1693 } 1694 } 1695 1696 free(line); 1697 fclose(f); 1698 return hps; 1699} 1700 1701static void set_test_type(const char *type) 1702{ 1703 if (!strcmp(type, "anon")) { 1704 test_type = TEST_ANON; 1705 uffd_test_ops = &anon_uffd_test_ops; 1706 } else if (!strcmp(type, "hugetlb")) { 1707 test_type = TEST_HUGETLB; 1708 uffd_test_ops = &hugetlb_uffd_test_ops; 1709 } else if (!strcmp(type, "hugetlb_shared")) { 1710 map_shared = true; 1711 test_type = TEST_HUGETLB; 1712 uffd_test_ops = &hugetlb_uffd_test_ops; 1713 /* Minor faults require shared hugetlb; only enable here. */ 1714 test_uffdio_minor = true; 1715 } else if (!strcmp(type, "shmem")) { 1716 map_shared = true; 1717 test_type = TEST_SHMEM; 1718 uffd_test_ops = &shmem_uffd_test_ops; 1719 test_uffdio_minor = true; 1720 } 1721} 1722 1723static void parse_test_type_arg(const char *raw_type) 1724{ 1725 char *buf = strdup(raw_type); 1726 uint64_t features = UFFD_API_FEATURES; 1727 1728 while (buf) { 1729 const char *token = strsep(&buf, ":"); 1730 1731 if (!test_type) 1732 set_test_type(token); 1733 else if (!strcmp(token, "dev")) 1734 test_dev_userfaultfd = true; 1735 else if (!strcmp(token, "syscall")) 1736 test_dev_userfaultfd = false; 1737 else if (!strcmp(token, "collapse")) 1738 test_collapse = true; 1739 else 1740 err("unrecognized test mod '%s'", token); 1741 } 1742 1743 if (!test_type) 1744 err("failed to parse test type argument: '%s'", raw_type); 1745 1746 if (test_collapse && test_type != TEST_SHMEM) 1747 err("Unsupported test: %s", raw_type); 1748 1749 if (test_type == TEST_HUGETLB) 1750 page_size = hpage_size; 1751 else 1752 page_size = sysconf(_SC_PAGE_SIZE); 1753 1754 if (!page_size) 1755 err("Unable to determine page size"); 1756 if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 1757 > page_size) 1758 err("Impossible to run this test"); 1759 1760 /* 1761 * Whether we can test certain features depends not just on test type, 1762 * but also on whether or not this particular kernel supports the 1763 * feature. 1764 */ 1765 1766 userfaultfd_open(&features); 1767 1768 test_uffdio_wp = test_uffdio_wp && 1769 (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); 1770 test_uffdio_minor = test_uffdio_minor && 1771 (features & uffd_minor_feature()); 1772 1773 close(uffd); 1774 uffd = -1; 1775} 1776 1777static void sigalrm(int sig) 1778{ 1779 if (sig != SIGALRM) 1780 abort(); 1781 test_uffdio_copy_eexist = true; 1782 test_uffdio_zeropage_eexist = true; 1783 alarm(ALARM_INTERVAL_SECS); 1784} 1785 1786int main(int argc, char **argv) 1787{ 1788 size_t bytes; 1789 1790 if (argc < 4) 1791 usage(); 1792 1793 if (signal(SIGALRM, sigalrm) == SIG_ERR) 1794 err("failed to arm SIGALRM"); 1795 alarm(ALARM_INTERVAL_SECS); 1796 1797 hpage_size = default_huge_page_size(); 1798 parse_test_type_arg(argv[1]); 1799 bytes = atol(argv[2]) * 1024 * 1024; 1800 1801 if (test_collapse && bytes & (hpage_size - 1)) 1802 err("MiB must be multiple of %lu if :collapse mod set", 1803 hpage_size >> 20); 1804 1805 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); 1806 1807 if (test_collapse) { 1808 /* nr_cpus must divide (bytes / page_size), otherwise, 1809 * area allocations of (nr_pages * paze_size) won't be a 1810 * multiple of hpage_size, even if bytes is a multiple of 1811 * hpage_size. 1812 * 1813 * This means that nr_cpus must divide (N * (2 << (H-P)) 1814 * where: 1815 * bytes = hpage_size * N 1816 * hpage_size = 2 << H 1817 * page_size = 2 << P 1818 * 1819 * And we want to chose nr_cpus to be the largest value 1820 * satisfying this constraint, not larger than the number 1821 * of online CPUs. Unfortunately, prime factorization of 1822 * N and nr_cpus may be arbitrary, so have to search for it. 1823 * Instead, just use the highest power of 2 dividing both 1824 * nr_cpus and (bytes / page_size). 1825 */ 1826 int x = factor_of_2(nr_cpus); 1827 int y = factor_of_2(bytes / page_size); 1828 1829 nr_cpus = x < y ? x : y; 1830 } 1831 nr_pages_per_cpu = bytes / page_size / nr_cpus; 1832 if (!nr_pages_per_cpu) { 1833 _err("invalid MiB"); 1834 usage(); 1835 } 1836 1837 bounces = atoi(argv[3]); 1838 if (bounces <= 0) { 1839 _err("invalid bounces"); 1840 usage(); 1841 } 1842 nr_pages = nr_pages_per_cpu * nr_cpus; 1843 1844 if (test_type == TEST_HUGETLB && map_shared) { 1845 if (argc < 5) 1846 usage(); 1847 huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); 1848 if (huge_fd < 0) 1849 err("Open of %s failed", argv[4]); 1850 if (ftruncate(huge_fd, 0)) 1851 err("ftruncate %s to size 0 failed", argv[4]); 1852 } else if (test_type == TEST_SHMEM) { 1853 shm_fd = memfd_create(argv[0], 0); 1854 if (shm_fd < 0) 1855 err("memfd_create"); 1856 if (ftruncate(shm_fd, nr_pages * page_size * 2)) 1857 err("ftruncate"); 1858 if (fallocate(shm_fd, 1859 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 1860 nr_pages * page_size * 2)) 1861 err("fallocate"); 1862 } 1863 printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", 1864 nr_pages, nr_pages_per_cpu); 1865 return userfaultfd_stress(); 1866} 1867 1868#else /* __NR_userfaultfd */ 1869 1870#warning "missing __NR_userfaultfd definition" 1871 1872int main(void) 1873{ 1874 printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); 1875 return KSFT_SKIP; 1876} 1877 1878#endif /* __NR_userfaultfd */