Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mshv: Add support for movable memory regions

Introduce support for movable memory regions in the Hyper-V root partition
driver to improve memory management flexibility and enable advanced use
cases such as dynamic memory remapping.

Mirror the address space between the Linux root partition and guest VMs
using HMM. The root partition owns the memory, while guest VMs act as
devices with page tables managed via hypercalls. MSHV handles VP intercepts
by invoking hmm_range_fault() and updating SLAT entries. When memory is
reclaimed, HMM invalidates the relevant regions, prompting MSHV to clear
SLAT entries; guest VMs will fault again on access.

Integrate mmu_interval_notifier for movable regions, implement handlers for
HMM faults and memory invalidation, and update memory region mapping logic
to support movable regions.

While MMU notifiers are commonly used in virtualization drivers, this
implementation leverages HMM (Heterogeneous Memory Management) for its
specialized functionality. HMM provides a framework for mirroring,
invalidation, and fault handling, reducing boilerplate and improving
maintainability compared to generic MMU notifiers.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Reviewed-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>

Authored by: Stanislav Kinsburskii
Committed by: Wei Liu
Commit: b9a66cd5 (parent: c39dda08)

+346 -36
+2
drivers/hv/Kconfig
··· 76 76 depends on PAGE_SIZE_4KB 77 77 select EVENTFD 78 78 select VIRT_XFER_TO_GUEST_WORK 79 + select HMM_MIRROR 80 + select MMU_NOTIFIER 79 81 default n 80 82 help 81 83 Select this option to enable support for booting and running as root
+212 -6
drivers/hv/mshv_regions.c
··· 7 7 * Authors: Microsoft Linux virtualization team 8 8 */ 9 9 10 + #include <linux/hmm.h> 11 + #include <linux/hyperv.h> 10 12 #include <linux/kref.h> 11 13 #include <linux/mm.h> 12 14 #include <linux/vmalloc.h> ··· 16 14 #include <asm/mshyperv.h> 17 15 18 16 #include "mshv_root.h" 17 + 18 + #define MSHV_MAP_FAULT_IN_PAGES PTRS_PER_PMD 19 19 20 20 /** 21 21 * mshv_region_process_chunk - Processes a contiguous chunk of memory pages ··· 138 134 } 139 135 140 136 struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, 141 - u64 uaddr, u32 flags, 142 - bool is_mmio) 137 + u64 uaddr, u32 flags) 143 138 { 144 139 struct mshv_mem_region *region; 145 140 ··· 154 151 region->hv_map_flags |= HV_MAP_GPA_WRITABLE; 155 152 if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) 156 153 region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; 157 - 158 - if (!is_mmio) 159 - region->flags.range_pinned = true; 160 154 161 155 kref_init(&region->refcount); 162 156 ··· 245 245 static void mshv_region_invalidate_pages(struct mshv_mem_region *region, 246 246 u64 page_offset, u64 page_count) 247 247 { 248 - if (region->flags.range_pinned) 248 + if (region->type == MSHV_REGION_TYPE_MEM_PINNED) 249 249 unpin_user_pages(region->pages + page_offset, page_count); 250 250 251 251 memset(region->pages + page_offset, 0, ··· 321 321 struct mshv_partition *partition = region->partition; 322 322 int ret; 323 323 324 + if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE) 325 + mshv_region_movable_fini(region); 326 + 324 327 if (mshv_partition_encrypted(partition)) { 325 328 ret = mshv_region_share(region); 326 329 if (ret) { ··· 349 346 int mshv_region_get(struct mshv_mem_region *region) 350 347 { 351 348 return kref_get_unless_zero(&region->refcount); 349 + } 350 + 351 + /** 352 + * mshv_region_hmm_fault_and_lock - Handle HMM faults and lock the memory region 353 + * @region: Pointer to the memory region structure 354 + * @range: Pointer to the HMM range structure 355 + * 356 + * This function 
performs the following steps: 357 + * 1. Reads the notifier sequence for the HMM range. 358 + * 2. Acquires a read lock on the memory map. 359 + * 3. Handles HMM faults for the specified range. 360 + * 4. Releases the read lock on the memory map. 361 + * 5. If successful, locks the memory region mutex. 362 + * 6. Verifies if the notifier sequence has changed during the operation. 363 + * If it has, releases the mutex and returns -EBUSY to match with 364 + * hmm_range_fault() return code for repeating. 365 + * 366 + * Return: 0 on success, a negative error code otherwise. 367 + */ 368 + static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region, 369 + struct hmm_range *range) 370 + { 371 + int ret; 372 + 373 + range->notifier_seq = mmu_interval_read_begin(range->notifier); 374 + mmap_read_lock(region->mni.mm); 375 + ret = hmm_range_fault(range); 376 + mmap_read_unlock(region->mni.mm); 377 + if (ret) 378 + return ret; 379 + 380 + mutex_lock(&region->mutex); 381 + 382 + if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) { 383 + mutex_unlock(&region->mutex); 384 + cond_resched(); 385 + return -EBUSY; 386 + } 387 + 388 + return 0; 389 + } 390 + 391 + /** 392 + * mshv_region_range_fault - Handle memory range faults for a given region. 393 + * @region: Pointer to the memory region structure. 394 + * @page_offset: Offset of the page within the region. 395 + * @page_count: Number of pages to handle. 396 + * 397 + * This function resolves memory faults for a specified range of pages 398 + * within a memory region. It uses HMM (Heterogeneous Memory Management) 399 + * to fault in the required pages and updates the region's page array. 400 + * 401 + * Return: 0 on success, negative error code on failure. 
402 + */ 403 + static int mshv_region_range_fault(struct mshv_mem_region *region, 404 + u64 page_offset, u64 page_count) 405 + { 406 + struct hmm_range range = { 407 + .notifier = &region->mni, 408 + .default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE, 409 + }; 410 + unsigned long *pfns; 411 + int ret; 412 + u64 i; 413 + 414 + pfns = kmalloc_array(page_count, sizeof(*pfns), GFP_KERNEL); 415 + if (!pfns) 416 + return -ENOMEM; 417 + 418 + range.hmm_pfns = pfns; 419 + range.start = region->start_uaddr + page_offset * HV_HYP_PAGE_SIZE; 420 + range.end = range.start + page_count * HV_HYP_PAGE_SIZE; 421 + 422 + do { 423 + ret = mshv_region_hmm_fault_and_lock(region, &range); 424 + } while (ret == -EBUSY); 425 + 426 + if (ret) 427 + goto out; 428 + 429 + for (i = 0; i < page_count; i++) 430 + region->pages[page_offset + i] = hmm_pfn_to_page(pfns[i]); 431 + 432 + ret = mshv_region_remap_pages(region, region->hv_map_flags, 433 + page_offset, page_count); 434 + 435 + mutex_unlock(&region->mutex); 436 + out: 437 + kfree(pfns); 438 + return ret; 439 + } 440 + 441 + bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn) 442 + { 443 + u64 page_offset, page_count; 444 + int ret; 445 + 446 + /* Align the page offset to the nearest MSHV_MAP_FAULT_IN_PAGES. */ 447 + page_offset = ALIGN_DOWN(gfn - region->start_gfn, 448 + MSHV_MAP_FAULT_IN_PAGES); 449 + 450 + /* Map more pages than requested to reduce the number of faults. 
*/ 451 + page_count = min(region->nr_pages - page_offset, 452 + MSHV_MAP_FAULT_IN_PAGES); 453 + 454 + ret = mshv_region_range_fault(region, page_offset, page_count); 455 + 456 + WARN_ONCE(ret, 457 + "p%llu: GPA intercept failed: region %#llx-%#llx, gfn %#llx, page_offset %llu, page_count %llu\n", 458 + region->partition->pt_id, region->start_uaddr, 459 + region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT), 460 + gfn, page_offset, page_count); 461 + 462 + return !ret; 463 + } 464 + 465 + /** 466 + * mshv_region_interval_invalidate - Invalidate a range of memory region 467 + * @mni: Pointer to the mmu_interval_notifier structure 468 + * @range: Pointer to the mmu_notifier_range structure 469 + * @cur_seq: Current sequence number for the interval notifier 470 + * 471 + * This function invalidates a memory region by remapping its pages with 472 + * no access permissions. It locks the region's mutex to ensure thread safety 473 + * and updates the sequence number for the interval notifier. If the range 474 + * is blockable, it uses a blocking lock; otherwise, it attempts a non-blocking 475 + * lock and returns false if unsuccessful. 476 + * 477 + * NOTE: Failure to invalidate a region is a serious error, as the pages will 478 + * be considered freed while they are still mapped by the hypervisor. 479 + * Any attempt to access such pages will likely crash the system. 480 + * 481 + * Return: true if the region was successfully invalidated, false otherwise. 
482 + */ 483 + static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni, 484 + const struct mmu_notifier_range *range, 485 + unsigned long cur_seq) 486 + { 487 + struct mshv_mem_region *region = container_of(mni, 488 + struct mshv_mem_region, 489 + mni); 490 + u64 page_offset, page_count; 491 + unsigned long mstart, mend; 492 + int ret = -EPERM; 493 + 494 + if (mmu_notifier_range_blockable(range)) 495 + mutex_lock(&region->mutex); 496 + else if (!mutex_trylock(&region->mutex)) 497 + goto out_fail; 498 + 499 + mmu_interval_set_seq(mni, cur_seq); 500 + 501 + mstart = max(range->start, region->start_uaddr); 502 + mend = min(range->end, region->start_uaddr + 503 + (region->nr_pages << HV_HYP_PAGE_SHIFT)); 504 + 505 + page_offset = HVPFN_DOWN(mstart - region->start_uaddr); 506 + page_count = HVPFN_DOWN(mend - mstart); 507 + 508 + ret = mshv_region_remap_pages(region, HV_MAP_GPA_NO_ACCESS, 509 + page_offset, page_count); 510 + if (ret) 511 + goto out_fail; 512 + 513 + mshv_region_invalidate_pages(region, page_offset, page_count); 514 + 515 + mutex_unlock(&region->mutex); 516 + 517 + return true; 518 + 519 + out_fail: 520 + WARN_ONCE(ret, 521 + "Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n", 522 + region->start_uaddr, 523 + region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT), 524 + range->start, range->end, range->event, 525 + page_offset, page_offset + page_count - 1, (u64)range->mm, ret); 526 + return false; 527 + } 528 + 529 + static const struct mmu_interval_notifier_ops mshv_region_mni_ops = { 530 + .invalidate = mshv_region_interval_invalidate, 531 + }; 532 + 533 + void mshv_region_movable_fini(struct mshv_mem_region *region) 534 + { 535 + mmu_interval_notifier_remove(&region->mni); 536 + } 537 + 538 + bool mshv_region_movable_init(struct mshv_mem_region *region) 539 + { 540 + int ret; 541 + 542 + ret = mmu_interval_notifier_insert(&region->mni, current->mm, 543 + 
region->start_uaddr, 544 + region->nr_pages << HV_HYP_PAGE_SHIFT, 545 + &mshv_region_mni_ops); 546 + if (ret) 547 + return false; 548 + 549 + mutex_init(&region->mutex); 550 + 551 + return true; 352 552 }
+14 -6
drivers/hv/mshv_root.h
··· 15 15 #include <linux/hashtable.h> 16 16 #include <linux/dev_printk.h> 17 17 #include <linux/build_bug.h> 18 + #include <linux/mmu_notifier.h> 18 19 #include <uapi/linux/mshv.h> 19 20 20 21 /* ··· 71 70 #define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__) 72 71 #define vp_dbg(v, fmt, ...) vp_devprintk(dbg, v, fmt, ##__VA_ARGS__) 73 72 73 + enum mshv_region_type { 74 + MSHV_REGION_TYPE_MEM_PINNED, 75 + MSHV_REGION_TYPE_MEM_MOVABLE, 76 + MSHV_REGION_TYPE_MMIO 77 + }; 78 + 74 79 struct mshv_mem_region { 75 80 struct hlist_node hnode; 76 81 struct kref refcount; ··· 84 77 u64 start_gfn; 85 78 u64 start_uaddr; 86 79 u32 hv_map_flags; 87 - struct { 88 - u64 range_pinned: 1; 89 - u64 reserved: 63; 90 - } flags; 91 80 struct mshv_partition *partition; 81 + enum mshv_region_type type; 82 + struct mmu_interval_notifier mni; 83 + struct mutex mutex; /* protects region pages remapping */ 92 84 struct page *pages[]; 93 85 }; 94 86 ··· 321 315 extern u8 * __percpu *hv_synic_eventring_tail; 322 316 323 317 struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, 324 - u64 uaddr, u32 flags, 325 - bool is_mmio); 318 + u64 uaddr, u32 flags); 326 319 int mshv_region_share(struct mshv_mem_region *region); 327 320 int mshv_region_unshare(struct mshv_mem_region *region); 328 321 int mshv_region_map(struct mshv_mem_region *region); ··· 329 324 int mshv_region_pin(struct mshv_mem_region *region); 330 325 void mshv_region_put(struct mshv_mem_region *region); 331 326 int mshv_region_get(struct mshv_mem_region *region); 327 + bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn); 328 + void mshv_region_movable_fini(struct mshv_mem_region *region); 329 + bool mshv_region_movable_init(struct mshv_mem_region *region); 332 330 333 331 #endif /* _MSHV_ROOT_H_ */
+118 -24
drivers/hv/mshv_root_main.c
··· 594 594 static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, 595 595 "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ"); 596 596 597 + static struct mshv_mem_region * 598 + mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) 599 + { 600 + struct mshv_mem_region *region; 601 + 602 + hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { 603 + if (gfn >= region->start_gfn && 604 + gfn < region->start_gfn + region->nr_pages) 605 + return region; 606 + } 607 + 608 + return NULL; 609 + } 610 + 611 + #ifdef CONFIG_X86_64 612 + static struct mshv_mem_region * 613 + mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn) 614 + { 615 + struct mshv_mem_region *region; 616 + 617 + spin_lock(&p->pt_mem_regions_lock); 618 + region = mshv_partition_region_by_gfn(p, gfn); 619 + if (!region || !mshv_region_get(region)) { 620 + spin_unlock(&p->pt_mem_regions_lock); 621 + return NULL; 622 + } 623 + spin_unlock(&p->pt_mem_regions_lock); 624 + 625 + return region; 626 + } 627 + 628 + /** 629 + * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts. 630 + * @vp: Pointer to the virtual processor structure. 631 + * 632 + * This function processes GPA intercepts by identifying the memory region 633 + * corresponding to the intercepted GPA, aligning the page offset, and 634 + * mapping the required pages. It ensures that the region is valid and 635 + * handles faults efficiently by mapping multiple pages at once. 636 + * 637 + * Return: true if the intercept was handled successfully, false otherwise. 
638 + */ 639 + static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) 640 + { 641 + struct mshv_partition *p = vp->vp_partition; 642 + struct mshv_mem_region *region; 643 + struct hv_x64_memory_intercept_message *msg; 644 + bool ret; 645 + u64 gfn; 646 + 647 + msg = (struct hv_x64_memory_intercept_message *) 648 + vp->vp_intercept_msg_page->u.payload; 649 + 650 + gfn = HVPFN_DOWN(msg->guest_physical_address); 651 + 652 + region = mshv_partition_region_by_gfn_get(p, gfn); 653 + if (!region) 654 + return false; 655 + 656 + /* Only movable memory ranges are supported for GPA intercepts */ 657 + if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE) 658 + ret = mshv_region_handle_gfn_fault(region, gfn); 659 + else 660 + ret = false; 661 + 662 + mshv_region_put(region); 663 + 664 + return ret; 665 + } 666 + #else /* CONFIG_X86_64 */ 667 + static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; } 668 + #endif /* CONFIG_X86_64 */ 669 + 670 + static bool mshv_vp_handle_intercept(struct mshv_vp *vp) 671 + { 672 + switch (vp->vp_intercept_msg_page->header.message_type) { 673 + case HVMSG_GPA_INTERCEPT: 674 + return mshv_handle_gpa_intercept(vp); 675 + } 676 + return false; 677 + } 678 + 597 679 static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg) 598 680 { 599 681 long rc; 600 682 601 - if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) 602 - rc = mshv_run_vp_with_root_scheduler(vp); 603 - else 604 - rc = mshv_run_vp_with_hyp_scheduler(vp); 683 + do { 684 + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) 685 + rc = mshv_run_vp_with_root_scheduler(vp); 686 + else 687 + rc = mshv_run_vp_with_hyp_scheduler(vp); 688 + } while (rc == 0 && mshv_vp_handle_intercept(vp)); 605 689 606 690 if (rc) 607 691 return rc; ··· 1143 1059 *status = partition->async_hypercall_status; 1144 1060 } 1145 1061 1146 - static struct mshv_mem_region * 1147 - mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) 1148 - { 1149 - struct 
mshv_mem_region *region; 1150 - 1151 - hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { 1152 - if (gfn >= region->start_gfn && 1153 - gfn < region->start_gfn + region->nr_pages) 1154 - return region; 1155 - } 1156 - 1157 - return NULL; 1158 - } 1159 - 1160 1062 /* 1161 1063 * NB: caller checks and makes sure mem->size is page aligned 1162 1064 * Returns: 0 with regionpp updated on success, or -errno ··· 1167 1097 spin_unlock(&partition->pt_mem_regions_lock); 1168 1098 1169 1099 rg = mshv_region_create(mem->guest_pfn, nr_pages, 1170 - mem->userspace_addr, mem->flags, 1171 - is_mmio); 1100 + mem->userspace_addr, mem->flags); 1172 1101 if (IS_ERR(rg)) 1173 1102 return PTR_ERR(rg); 1103 + 1104 + if (is_mmio) 1105 + rg->type = MSHV_REGION_TYPE_MMIO; 1106 + else if (mshv_partition_encrypted(partition) || 1107 + !mshv_region_movable_init(rg)) 1108 + rg->type = MSHV_REGION_TYPE_MEM_PINNED; 1109 + else 1110 + rg->type = MSHV_REGION_TYPE_MEM_MOVABLE; 1174 1111 1175 1112 rg->partition = partition; 1176 1113 ··· 1294 1217 if (ret) 1295 1218 return ret; 1296 1219 1297 - if (is_mmio) 1298 - ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn, 1299 - mmio_pfn, HVPFN_DOWN(mem.size)); 1300 - else 1220 + switch (region->type) { 1221 + case MSHV_REGION_TYPE_MEM_PINNED: 1301 1222 ret = mshv_prepare_pinned_region(region); 1223 + break; 1224 + case MSHV_REGION_TYPE_MEM_MOVABLE: 1225 + /* 1226 + * For movable memory regions, remap with no access to let 1227 + * the hypervisor track dirty pages, enabling pre-copy live 1228 + * migration. 1229 + */ 1230 + ret = hv_call_map_gpa_pages(partition->pt_id, 1231 + region->start_gfn, 1232 + region->nr_pages, 1233 + HV_MAP_GPA_NO_ACCESS, NULL); 1234 + break; 1235 + case MSHV_REGION_TYPE_MMIO: 1236 + ret = hv_call_map_mmio_pages(partition->pt_id, 1237 + region->start_gfn, 1238 + mmio_pfn, 1239 + region->nr_pages); 1240 + break; 1241 + } 1302 1242 1303 1243 if (ret) 1304 1244 goto errout;