Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm/migrate_device: add THP splitting during migration

Implement migrate_vma_split_pages() to handle THP splitting during the
migration process when destination cannot allocate compound pages.

This addresses the common scenario where migrate_vma_setup() succeeds with
MIGRATE_PFN_COMPOUND pages, but the destination device cannot allocate
large pages during the migration phase.

Key changes:
- migrate_vma_split_pages(): Split already-isolated pages during migration
- Enhanced folio_split() and __split_unmapped_folio() with isolated
parameter to avoid redundant unmap/remap operations

This provides a fallback mechanism to ensure migration succeeds even when
large page allocation fails at the destination.

[matthew.brost@intel.com: add THP splitting during migration]
Link: https://lkml.kernel.org/r/20251120230825.181072-2-matthew.brost@intel.com
Link: https://lkml.kernel.org/r/20251001065707.920170-12-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Balbir Singh and committed by
Andrew Morton
4265d67e 56ef3989

+119 -34
+9 -2
include/linux/huge_mm.h
··· 365 365 vm_flags_t vm_flags); 366 366 367 367 bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); 368 - int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 369 - unsigned int new_order); 368 + int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 369 + unsigned int new_order, bool unmapped); 370 370 int min_order_for_split(struct folio *folio); 371 371 int split_folio_to_list(struct folio *folio, struct list_head *list); 372 372 bool uniform_split_supported(struct folio *folio, unsigned int new_order, ··· 375 375 bool warns); 376 376 int folio_split(struct folio *folio, unsigned int new_order, struct page *page, 377 377 struct list_head *list); 378 + 379 + static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 380 + unsigned int new_order) 381 + { 382 + return __split_huge_page_to_list_to_order(page, list, new_order, false); 383 + } 384 + 378 385 /* 379 386 * try_folio_split_to_order - try to split a @folio at @page to @new_order using 380 387 * non uniform split.
+9
lib/test_hmm.c
··· 1613 1613 nr = 1 << order; 1614 1614 1615 1615 /* 1616 + * When folios are partially mapped, we can't rely on the folio 1617 + * order of vmf->page as the folio might not be fully split yet 1618 + */ 1619 + if (vmf->pte) { 1620 + order = 0; 1621 + nr = 1; 1622 + } 1623 + 1624 + /* 1616 1625 * Consider a per-cpu cache of src and dst pfns, but with 1617 1626 * large number of cpus that might not scale well. 1618 1627 */
+25 -21
mm/huge_memory.c
··· 3452 3452 new_folio->mapping = folio->mapping; 3453 3453 new_folio->index = folio->index + i; 3454 3454 3455 - /* 3456 - * page->private should not be set in tail pages. Fix up and warn once 3457 - * if private is unexpectedly set. 3458 - */ 3459 - if (unlikely(new_folio->private)) { 3460 - VM_WARN_ON_ONCE_PAGE(true, new_head); 3461 - new_folio->private = NULL; 3462 - } 3463 - 3464 3455 if (folio_test_swapcache(folio)) 3465 3456 new_folio->swap.val = folio->swap.val + i; 3466 3457 ··· 3652 3661 * @lock_at: a page within @folio to be left locked to caller 3653 3662 * @list: after-split folios will be put on it if non NULL 3654 3663 * @uniform_split: perform uniform split or not (non-uniform split) 3664 + * @unmapped: The pages are already unmapped, they are migration entries. 3655 3665 * 3656 3666 * It calls __split_unmapped_folio() to perform uniform and non-uniform split. 3657 3667 * It is in charge of checking whether the split is supported or not and ··· 3668 3676 */ 3669 3677 static int __folio_split(struct folio *folio, unsigned int new_order, 3670 3678 struct page *split_at, struct page *lock_at, 3671 - struct list_head *list, bool uniform_split) 3679 + struct list_head *list, bool uniform_split, bool unmapped) 3672 3680 { 3673 3681 struct deferred_split *ds_queue = get_deferred_split_queue(folio); 3674 3682 XA_STATE(xas, &folio->mapping->i_pages, folio->index); ··· 3728 3736 * is taken to serialise against parallel split or collapse 3729 3737 * operations. 
3730 3738 */ 3731 - anon_vma = folio_get_anon_vma(folio); 3732 - if (!anon_vma) { 3733 - ret = -EBUSY; 3734 - goto out; 3739 + if (!unmapped) { 3740 + anon_vma = folio_get_anon_vma(folio); 3741 + if (!anon_vma) { 3742 + ret = -EBUSY; 3743 + goto out; 3744 + } 3745 + anon_vma_lock_write(anon_vma); 3735 3746 } 3736 3747 mapping = NULL; 3737 - anon_vma_lock_write(anon_vma); 3738 3748 } else { 3739 3749 unsigned int min_order; 3740 3750 gfp_t gfp; ··· 3789 3795 goto out_unlock; 3790 3796 } 3791 3797 3792 - unmap_folio(folio); 3798 + if (!unmapped) 3799 + unmap_folio(folio); 3793 3800 3794 3801 /* block interrupt reentry in xa_lock and spinlock */ 3795 3802 local_irq_disable(); ··· 3877 3882 3878 3883 next = folio_next(new_folio); 3879 3884 3885 + zone_device_private_split_cb(folio, new_folio); 3886 + 3880 3887 expected_refs = folio_expected_ref_count(new_folio) + 1; 3881 3888 folio_ref_unfreeze(new_folio, expected_refs); 3882 3889 3883 - lru_add_split_folio(folio, new_folio, lruvec, list); 3890 + if (!unmapped) 3891 + lru_add_split_folio(folio, new_folio, lruvec, list); 3884 3892 3885 3893 /* 3886 3894 * Anonymous folio with swap cache. ··· 3914 3916 __filemap_remove_folio(new_folio, NULL); 3915 3917 folio_put_refs(new_folio, nr_pages); 3916 3918 } 3919 + 3920 + zone_device_private_split_cb(folio, NULL); 3917 3921 /* 3918 3922 * Unfreeze @folio only after all page cache entries, which 3919 3923 * used to point to it, have been updated with new folios. ··· 3938 3938 xas_unlock(&xas); 3939 3939 3940 3940 local_irq_enable(); 3941 + 3942 + if (unmapped) 3943 + return ret; 3941 3944 3942 3945 if (nr_shmem_dropped) 3943 3946 shmem_uncharge(mapping->host, nr_shmem_dropped); ··· 4032 4029 * Returns -EINVAL when trying to split to an order that is incompatible 4033 4030 * with the folio. Splitting to order 0 is compatible with all folios. 
4034 4031 */ 4035 - int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 4036 - unsigned int new_order) 4032 + int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 4033 + unsigned int new_order, bool unmapped) 4037 4034 { 4038 4035 struct folio *folio = page_folio(page); 4039 4036 4040 - return __folio_split(folio, new_order, &folio->page, page, list, true); 4037 + return __folio_split(folio, new_order, &folio->page, page, list, true, 4038 + unmapped); 4041 4039 } 4042 4040 4043 4041 /* ··· 4067 4063 struct page *split_at, struct list_head *list) 4068 4064 { 4069 4065 return __folio_split(folio, new_order, split_at, &folio->page, list, 4070 - false); 4066 + false, false); 4071 4067 } 4072 4068 4073 4069 int min_order_for_split(struct folio *folio)
+76 -11
mm/migrate_device.c
··· 309 309 pgmap->owner != migrate->pgmap_owner) 310 310 goto next; 311 311 312 + folio = page_folio(page); 313 + if (folio_test_large(folio)) { 314 + int ret; 315 + 316 + arch_leave_lazy_mmu_mode(); 317 + pte_unmap_unlock(ptep, ptl); 318 + ret = migrate_vma_split_folio(folio, 319 + migrate->fault_page); 320 + 321 + if (ret) { 322 + if (unmapped) 323 + flush_tlb_range(walk->vma, start, end); 324 + 325 + return migrate_vma_collect_skip(addr, end, walk); 326 + } 327 + 328 + goto again; 329 + } 330 + 312 331 mpfn = migrate_pfn(page_to_pfn(page)) | 313 332 MIGRATE_PFN_MIGRATE; 314 333 if (is_writable_device_private_entry(entry)) ··· 904 885 src[i] &= ~MIGRATE_PFN_MIGRATE; 905 886 return 0; 906 887 } 888 + 889 + static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate, 890 + unsigned long idx, unsigned long addr, 891 + struct folio *folio) 892 + { 893 + unsigned long i; 894 + unsigned long pfn; 895 + unsigned long flags; 896 + int ret = 0; 897 + 898 + folio_get(folio); 899 + split_huge_pmd_address(migrate->vma, addr, true); 900 + ret = __split_huge_page_to_list_to_order(folio_page(folio, 0), NULL, 901 + 0, true); 902 + if (ret) 903 + return ret; 904 + migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND; 905 + flags = migrate->src[idx] & ((1UL << MIGRATE_PFN_SHIFT) - 1); 906 + pfn = migrate->src[idx] >> MIGRATE_PFN_SHIFT; 907 + for (i = 1; i < HPAGE_PMD_NR; i++) 908 + migrate->src[i+idx] = migrate_pfn(pfn + i) | flags; 909 + return ret; 910 + } 907 911 #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ 908 912 static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, 909 913 unsigned long addr, 910 914 struct page *page, 911 915 unsigned long *src, 912 916 pmd_t *pmdp) 917 + { 918 + return 0; 919 + } 920 + 921 + static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate, 922 + unsigned long idx, unsigned long addr, 923 + struct folio *folio) 913 924 { 914 925 return 0; 915 926 } ··· 1104 1055 struct migrate_vma *migrate) 1105 1056 { 1106 
1057 struct mmu_notifier_range range; 1107 - unsigned long i; 1058 + unsigned long i, j; 1108 1059 bool notified = false; 1060 + unsigned long addr; 1109 1061 1110 1062 for (i = 0; i < npages; ) { 1111 1063 struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); ··· 1148 1098 (!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) { 1149 1099 nr = migrate_vma_nr_pages(&src_pfns[i]); 1150 1100 src_pfns[i] &= ~MIGRATE_PFN_COMPOUND; 1151 - src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; 1152 - goto next; 1101 + } else { 1102 + nr = 1; 1153 1103 } 1154 1104 1155 - migrate_vma_insert_page(migrate, addr, &dst_pfns[i], 1156 - &src_pfns[i]); 1105 + for (j = 0; j < nr && i + j < npages; j++) { 1106 + src_pfns[i+j] |= MIGRATE_PFN_MIGRATE; 1107 + migrate_vma_insert_page(migrate, 1108 + addr + j * PAGE_SIZE, 1109 + &dst_pfns[i+j], &src_pfns[i+j]); 1110 + } 1157 1111 goto next; 1158 1112 } 1159 1113 ··· 1179 1125 MIGRATE_PFN_COMPOUND); 1180 1126 goto next; 1181 1127 } 1182 - src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; 1128 + nr = 1 << folio_order(folio); 1129 + addr = migrate->start + i * PAGE_SIZE; 1130 + if (migrate_vma_split_unmapped_folio(migrate, i, addr, folio)) { 1131 + src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE | 1132 + MIGRATE_PFN_COMPOUND); 1133 + goto next; 1134 + } 1183 1135 } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) && 1184 1136 (dst_pfns[i] & MIGRATE_PFN_COMPOUND) && 1185 1137 !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) { ··· 1221 1161 1222 1162 if (migrate && migrate->fault_page == page) 1223 1163 extra_cnt = 1; 1224 - r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); 1225 - if (r) 1226 - src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; 1227 - else 1228 - folio_migrate_flags(newfolio, folio); 1164 + for (j = 0; j < nr && i + j < npages; j++) { 1165 + folio = page_folio(migrate_pfn_to_page(src_pfns[i+j])); 1166 + newfolio = page_folio(migrate_pfn_to_page(dst_pfns[i+j])); 1167 + 1168 + r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); 1169 + if (r) 1170 + src_pfns[i+j] &= 
~MIGRATE_PFN_MIGRATE; 1171 + else 1172 + folio_migrate_flags(newfolio, folio); 1173 + } 1229 1174 next: 1230 1175 i += nr; 1231 1176 }