mshv: Fix huge page handling in memory region traversal

The previous code assumed that if a region's first page was huge, the
entire region consisted of huge pages and stored this in a large_pages
flag. This premise is incorrect not only for movable regions (where
pages can be split and merged on invalidate callbacks or page faults),
but even for pinned regions: THPs can be split and merged during
allocation, so a large, pinned region may contain a mix of huge and
regular pages.

This change removes the large_pages flag and replaces region-wide
assumptions with per-chunk inspection of the actual page size when
mapping, unmapping, sharing, and unsharing. This makes huge page
handling correct for mixed-page regions and avoids relying on stale
metadata that can easily become invalid as memory is remapped.
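
The chunking idea is easy to demonstrate outside the kernel. Below is a minimal toy sketch in plain C: all names are hypothetical, a region is modeled as an array of page orders (standing in for region->pages plus folio_order()), and non-present-page skipping is omitted. The actual implementation is mshv_region_process_chunk()/mshv_region_process_range() in the diff below.

/* Toy model of the per-chunk walk; all names here are illustrative. */
#include <stdio.h>

#define HUGE_ORDER 9	/* one 2M huge page spans 2^9 4K pages */

/* Consume one run of pages that share the same order; return its length. */
static long process_chunk(const int *orders, long off, long left)
{
	int order = orders[off];
	long stride = 1L << order;
	long n;

	for (n = stride; n < left; n += stride)
		if (orders[off + n] != order)	/* page size changed */
			break;
	if (n > left)	/* guard: the range may not be stride-aligned */
		n = left;

	printf("chunk: offset %ld, %ld pages, order %d\n", off, n, order);
	return n;
}

int main(void)
{
	/* One 2M folio (512 entries of order 9) followed by 512 4K pages */
	int orders[1024];
	long i, off = 0, left = 1024;

	for (i = 0; i < 512; i++)
		orders[i] = HUGE_ORDER;
	for (; i < 1024; i++)
		orders[i] = 0;

	while (left > 0) {
		long done = process_chunk(orders, off, left);

		off += done;
		left -= done;
	}
	return 0;
}

Running it prints one 512-page chunk of order 9 followed by one 512-page chunk of order 0: precisely the mixed-page region that a single region-wide large_pages flag cannot describe.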

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
Reviewed-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>

Authored by Stanislav Kinsburskii, committed by Wei Liu (abceb429 e950c30a)

2 files changed: +193 -33

drivers/hv/mshv_regions.c (+192 -31)
···
 
 #include "mshv_root.h"
 
+/**
+ * mshv_region_process_chunk - Processes a contiguous chunk of memory pages
+ *                             in a region.
+ * @region     : Pointer to the memory region structure.
+ * @flags      : Flags to pass to the handler.
+ * @page_offset: Offset into the region's pages array to start processing.
+ * @page_count : Number of pages to process.
+ * @handler    : Callback function to handle the chunk.
+ *
+ * This function scans the region's pages starting from @page_offset,
+ * checking for contiguous present pages of the same size (normal or huge).
+ * It invokes @handler for the chunk of contiguous pages found. Returns the
+ * number of pages handled, or a negative error code if the first page is
+ * not present or the handler fails.
+ *
+ * Note: The @handler callback must be able to handle both normal and huge
+ * pages.
+ *
+ * Return: Number of pages handled, or negative error code.
+ */
+static long mshv_region_process_chunk(struct mshv_mem_region *region,
+				      u32 flags,
+				      u64 page_offset, u64 page_count,
+				      int (*handler)(struct mshv_mem_region *region,
+						     u32 flags,
+						     u64 page_offset,
+						     u64 page_count))
+{
+	u64 count, stride;
+	unsigned int page_order;
+	struct page *page;
+	int ret;
+
+	page = region->pages[page_offset];
+	if (!page)
+		return -EINVAL;
+
+	page_order = folio_order(page_folio(page));
+	/* The hypervisor only supports 4K and 2M page sizes */
+	if (page_order && page_order != HPAGE_PMD_ORDER)
+		return -EINVAL;
+
+	stride = 1 << page_order;
+
+	/* Start at stride since the first page is validated */
+	for (count = stride; count < page_count; count += stride) {
+		page = region->pages[page_offset + count];
+
+		/* Break if current page is not present */
+		if (!page)
+			break;
+
+		/* Break if page size changes */
+		if (page_order != folio_order(page_folio(page)))
+			break;
+	}
+
+	ret = handler(region, flags, page_offset, count);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+/**
+ * mshv_region_process_range - Processes a range of memory pages in a
+ *                             region.
+ * @region     : Pointer to the memory region structure.
+ * @flags      : Flags to pass to the handler.
+ * @page_offset: Offset into the region's pages array to start processing.
+ * @page_count : Number of pages to process.
+ * @handler    : Callback function to handle each chunk of contiguous
+ *               pages.
+ *
+ * Iterates over the specified range of pages in @region, skipping
+ * non-present pages. For each contiguous chunk of present pages, invokes
+ * @handler via mshv_region_process_chunk.
+ *
+ * Note: The @handler callback must be able to handle both normal and huge
+ * pages.
+ *
+ * Returns 0 on success, or a negative error code on failure.
+ */
+static int mshv_region_process_range(struct mshv_mem_region *region,
+				     u32 flags,
+				     u64 page_offset, u64 page_count,
+				     int (*handler)(struct mshv_mem_region *region,
+						    u32 flags,
+						    u64 page_offset,
+						    u64 page_count))
+{
+	long ret;
+
+	if (page_offset + page_count > region->nr_pages)
+		return -EINVAL;
+
+	while (page_count) {
+		/* Skip non-present pages */
+		if (!region->pages[page_offset]) {
+			page_offset++;
+			page_count--;
+			continue;
+		}
+
+		ret = mshv_region_process_chunk(region, flags,
+						page_offset,
+						page_count,
+						handler);
+		if (ret < 0)
+			return ret;
+
+		page_offset += ret;
+		page_count -= ret;
+	}
+
+	return 0;
+}
+
 struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
 					    u64 uaddr, u32 flags,
 					    bool is_mmio)
···
 	if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
 		region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
 
-	/* Note: large_pages flag populated when we pin the pages */
 	if (!is_mmio)
 		region->flags.range_pinned = true;
 
 	return region;
 }
 
+static int mshv_region_chunk_share(struct mshv_mem_region *region,
+				   u32 flags,
+				   u64 page_offset, u64 page_count)
+{
+	struct page *page = region->pages[page_offset];
+
+	if (PageHuge(page) || PageTransCompound(page))
+		flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
+
+	return hv_call_modify_spa_host_access(region->partition->pt_id,
+					      region->pages + page_offset,
+					      page_count,
+					      HV_MAP_GPA_READABLE |
+					      HV_MAP_GPA_WRITABLE,
+					      flags, true);
+}
+
 int mshv_region_share(struct mshv_mem_region *region)
 {
 	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;
 
-	if (region->flags.large_pages)
+	return mshv_region_process_range(region, flags,
+					 0, region->nr_pages,
+					 mshv_region_chunk_share);
+}
+
+static int mshv_region_chunk_unshare(struct mshv_mem_region *region,
+				     u32 flags,
+				     u64 page_offset, u64 page_count)
+{
+	struct page *page = region->pages[page_offset];
+
+	if (PageHuge(page) || PageTransCompound(page))
 		flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
 
 	return hv_call_modify_spa_host_access(region->partition->pt_id,
-					      region->pages, region->nr_pages,
-					      HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE,
-					      flags, true);
+					      region->pages + page_offset,
+					      page_count, 0,
+					      flags, false);
 }
 
 int mshv_region_unshare(struct mshv_mem_region *region)
 {
 	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;
 
-	if (region->flags.large_pages)
-		flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
+	return mshv_region_process_range(region, flags,
+					 0, region->nr_pages,
+					 mshv_region_chunk_unshare);
+}
 
-	return hv_call_modify_spa_host_access(region->partition->pt_id,
-					      region->pages, region->nr_pages,
-					      0,
-					      flags, false);
+static int mshv_region_chunk_remap(struct mshv_mem_region *region,
+				   u32 flags,
+				   u64 page_offset, u64 page_count)
+{
+	struct page *page = region->pages[page_offset];
+
+	if (PageHuge(page) || PageTransCompound(page))
+		flags |= HV_MAP_GPA_LARGE_PAGE;
+
+	return hv_call_map_gpa_pages(region->partition->pt_id,
+				     region->start_gfn + page_offset,
+				     page_count, flags,
+				     region->pages + page_offset);
 }
 
 static int mshv_region_remap_pages(struct mshv_mem_region *region,
 				   u32 map_flags,
 				   u64 page_offset, u64 page_count)
 {
-	if (page_offset + page_count > region->nr_pages)
-		return -EINVAL;
-
-	if (region->flags.large_pages)
-		map_flags |= HV_MAP_GPA_LARGE_PAGE;
-
-	return hv_call_map_gpa_pages(region->partition->pt_id,
-				     region->start_gfn + page_offset,
-				     page_count, map_flags,
-				     region->pages + page_offset);
+	return mshv_region_process_range(region, map_flags,
+					 page_offset, page_count,
+					 mshv_region_chunk_remap);
 }
 
 int mshv_region_map(struct mshv_mem_region *region)
···
 		goto release_pages;
 	}
 
-	if (PageHuge(region->pages[0]))
-		region->flags.large_pages = true;
-
 	return 0;
 
 release_pages:
···
 	return ret;
 }
 
+static int mshv_region_chunk_unmap(struct mshv_mem_region *region,
+				   u32 flags,
+				   u64 page_offset, u64 page_count)
+{
+	struct page *page = region->pages[page_offset];
+
+	if (PageHuge(page) || PageTransCompound(page))
+		flags |= HV_UNMAP_GPA_LARGE_PAGE;
+
+	return hv_call_unmap_gpa_pages(region->partition->pt_id,
+				       region->start_gfn + page_offset,
+				       page_count, flags);
+}
+
+static int mshv_region_unmap(struct mshv_mem_region *region)
+{
+	return mshv_region_process_range(region, 0,
+					 0, region->nr_pages,
+					 mshv_region_chunk_unmap);
+}
+
 void mshv_region_destroy(struct mshv_mem_region *region)
 {
 	struct mshv_partition *partition = region->partition;
-	u32 unmap_flags = 0;
 	int ret;
 
 	hlist_del(&region->hnode);
···
 		}
 	}
 
-	if (region->flags.large_pages)
-		unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
-
-	/* ignore unmap failures and continue as process may be exiting */
-	hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn,
-				region->nr_pages, unmap_flags);
+	mshv_region_unmap(region);
 
 	mshv_region_invalidate(region);
drivers/hv/mshv_root.h (+1 -2)
···
 	u64 start_uaddr;
 	u32 hv_map_flags;
 	struct {
-		u64 large_pages: 1; /* 2MiB */
 		u64 range_pinned: 1;
-		u64 reserved: 62;
+		u64 reserved: 63;
 	} flags;
 	struct mshv_partition *partition;
 	struct page *pages[];