
mm: page_isolation: prepare for hygienic freelists

Page isolation currently sets MIGRATE_ISOLATE on a block, then drops
zone->lock and scans the block for straddling buddies to split up.
Because this happens non-atomically wrt the page allocator, it's possible
for allocations to get a buddy whose first block is a regular pcp
migratetype but whose tail is isolated. This means that in certain cases
memory can still be allocated after isolation. It will also trigger the
freelist type hygiene warnings in subsequent patches.

start_isolate_page_range()
  isolate_single_pageblock()
    set_migratetype_isolate(tail)
      lock zone->lock
      move_freepages_block(tail) // nop
      set_pageblock_migratetype(tail)
      unlock zone->lock
                                      __rmqueue_smallest()
                                        del_page_from_freelist(head)
                                        expand(head, head_mt)
                                          WARN(head_mt != tail_mt)
    start_pfn = ALIGN_DOWN(MAX_ORDER_NR_PAGES)
    for (pfn = start_pfn, pfn < end_pfn)
      if (PageBuddy())
        split_free_page(head)
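To make the window concrete, here is a toy userspace model of the trace
above. This is a hedged sketch, not kernel code: the MIGRATE_* values,
the pageblock constants and the freelist bookkeeping are illustrative
stand-ins.

	#include <stdio.h>

	#define PAGEBLOCK_ORDER	9
	#define MIGRATE_MOVABLE	0
	#define MIGRATE_ISOLATE	1

	int main(void)
	{
		/* two pageblocks backing one order-10 buddy at pfn 0 */
		int block_mt[2] = { MIGRATE_MOVABLE, MIGRATE_MOVABLE };
		unsigned long buddy_pfn = 0, buddy_order = 10;

		/* set_migratetype_isolate(tail): type flipped, buddy not split */
		block_mt[1] = MIGRATE_ISOLATE;

		/*
		 * __rmqueue_smallest() dequeues the buddy by its *head* block's
		 * type, and expand() requeues the unused halves under that same
		 * type -- the isolated tail reappears on the MOVABLE freelist.
		 */
		int head_mt = block_mt[buddy_pfn >> PAGEBLOCK_ORDER];
		unsigned long tail_pfn = buddy_pfn + (1UL << (buddy_order - 1));
		int tail_mt = block_mt[tail_pfn >> PAGEBLOCK_ORDER];

		if (head_mt != tail_mt)
			printf("WARN: head_mt != tail_mt -- tail escaped isolation\n");
		return 0;
	}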

Introduce a variant of move_freepages_block() provided by the allocator
specifically for page isolation; it moves free pages, converts the block,
and handles the splitting of straddling buddies while holding zone->lock.
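The decision tree inside the new helper can be modeled in a few lines of
plain C. This is a hedged userspace sketch, not the kernel
implementation: classify() is a hypothetical name and the
freelist/counter bookkeeping under zone->lock is omitted (the real code
is in the mm/page_alloc.c hunks below).

	#include <stdio.h>

	#define PAGEBLOCK_ORDER	9	/* illustrative */

	static const char *classify(unsigned long buddy_pfn, int buddy_order,
				    unsigned long block_pfn)
	{
		unsigned long buddy_end = buddy_pfn + (1UL << buddy_order);

		if (buddy_order <= PAGEBLOCK_ORDER)
			return "plain move: buddy cannot span multiple blocks";
		if (buddy_pfn < block_pfn && buddy_end > block_pfn)
			return "tail case: unlink buddy, split into pageblocks";
		if (buddy_pfn == block_pfn)
			return "head case: unlink buddy, split into pageblocks";
		return "plain move: buddy does not touch this block";
	}

	int main(void)
	{
		printf("%s\n", classify(0, 10, 512));	  /* [0,1024) straddles block 512 */
		printf("%s\n", classify(1024, 10, 1024)); /* we are the buddy's head block */
		printf("%s\n", classify(512, 9, 512));	  /* single-block buddy */
		return 0;
	}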

The allocator knows that pageblocks and buddies are always naturally
aligned, which means that buddies can only straddle blocks if they're
actually >pageblock_order. This means the search-and-split part can be
simplified compared to what page isolation used to do.
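That invariant is easy to check mechanically: a buddy of order o starts
on a multiple of 2^o pages, so for o <= pageblock_order its first and
last pfn always share a pageblock. A standalone sketch, using an
illustrative pageblock order rather than the kernel's config-dependent
value:

	#include <assert.h>

	#define PAGEBLOCK_ORDER	9	/* illustrative */

	/* first pfn of the pageblock containing @pfn */
	static unsigned long block_start(unsigned long pfn)
	{
		return pfn & ~((1UL << PAGEBLOCK_ORDER) - 1);
	}

	int main(void)
	{
		/*
		 * Every naturally aligned buddy of order <= PAGEBLOCK_ORDER
		 * lies entirely within one pageblock; only larger buddies
		 * can straddle a block boundary.
		 */
		for (unsigned int order = 0; order <= PAGEBLOCK_ORDER; order++)
			for (unsigned long k = 0; k < 64; k++) {
				unsigned long first = k << order;
				unsigned long last = first + (1UL << order) - 1;

				assert(block_start(first) == block_start(last));
			}
		return 0;
	}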

Also tighten up the page isolation code around the expectations of which
pages can be large, and how they are freed.

Based on extensive discussions with and invaluable input from Zi Yan.

[hannes@cmpxchg.org: work around older gcc warning]
Link: https://lkml.kernel.org/r/20240321142426.GB777580@cmpxchg.org
Link: https://lkml.kernel.org/r/20240320180429.678181-10-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by Johannes Weiner, committed by Andrew Morton
fd919a85 f37c0f68

+155 -163

+3 -1	include/linux/page-isolation.h

···
 #define REPORT_FAILURE	0x2
 
 void set_pageblock_migratetype(struct page *page, int migratetype);
-int move_freepages_block(struct zone *zone, struct page *page, int migratetype);
+
+bool move_freepages_block_isolate(struct zone *zone, struct page *page,
+				  int migratetype);
 
 int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			     int migratetype, int flags, gfp_t gfp_flags);

-4	mm/internal.h

···
 void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
 		unsigned long, enum meminit_context, struct vmem_altmap *, int);
 
-
-int split_free_page(struct page *free_page,
-			unsigned int order, unsigned long split_pfn_offset);
-
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
 
 /*

+120 -84	mm/page_alloc.c

···
 	page_reporting_notify_free(order);
 }
 
-/**
- * split_free_page() -- split a free page at split_pfn_offset
- * @free_page:		the original free page
- * @order:		the order of the page
- * @split_pfn_offset:	split offset within the page
- *
- * Return -ENOENT if the free page is changed, otherwise 0
- *
- * It is used when the free page crosses two pageblocks with different migratetypes
- * at split_pfn_offset within the page. The split free page will be put into
- * separate migratetype lists afterwards. Otherwise, the function achieves
- * nothing.
- */
-int split_free_page(struct page *free_page,
-			unsigned int order, unsigned long split_pfn_offset)
-{
-	struct zone *zone = page_zone(free_page);
-	unsigned long free_page_pfn = page_to_pfn(free_page);
-	unsigned long pfn;
-	unsigned long flags;
-	int free_page_order;
-	int mt;
-	int ret = 0;
-
-	if (split_pfn_offset == 0)
-		return ret;
-
-	spin_lock_irqsave(&zone->lock, flags);
-
-	if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
-		ret = -ENOENT;
-		goto out;
-	}
-
-	mt = get_pfnblock_migratetype(free_page, free_page_pfn);
-	if (likely(!is_migrate_isolate(mt)))
-		__mod_zone_freepage_state(zone, -(1UL << order), mt);
-
-	del_page_from_free_list(free_page, zone, order);
-	for (pfn = free_page_pfn;
-	     pfn < free_page_pfn + (1UL << order);) {
-		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
-
-		free_page_order = min_t(unsigned int,
-					pfn ? __ffs(pfn) : order,
-					__fls(split_pfn_offset));
-		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
-				mt, FPI_NONE);
-		pfn += 1UL << free_page_order;
-		split_pfn_offset -= (1UL << free_page_order);
-		/* we have done the first part, now switch to second part */
-		if (split_pfn_offset == 0)
-			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
-	}
-out:
-	spin_unlock_irqrestore(&zone->lock, flags);
-	return ret;
-}
 /*
  * A bad page could be due to a number of fields. Instead of multiple branches,
  * try and check multiple fields with one check. The caller must do a detailed
···
 	return true;
 }
 
-int move_freepages_block(struct zone *zone, struct page *page,
-			 int migratetype)
+static int move_freepages_block(struct zone *zone, struct page *page,
+				int migratetype)
 {
 	unsigned long start_pfn, end_pfn;
 
···
 
 	return move_freepages(zone, start_pfn, end_pfn, migratetype);
 }
+
+#ifdef CONFIG_MEMORY_ISOLATION
+/* Look for a buddy that straddles start_pfn */
+static unsigned long find_large_buddy(unsigned long start_pfn)
+{
+	int order = 0;
+	struct page *page;
+	unsigned long pfn = start_pfn;
+
+	while (!PageBuddy(page = pfn_to_page(pfn))) {
+		/* Nothing found */
+		if (++order > MAX_PAGE_ORDER)
+			return start_pfn;
+		pfn &= ~0UL << order;
+	}
+
+	/* Found a preceding buddy, but does it straddle? */
+	if (pfn + (1 << buddy_order(page)) > start_pfn)
+		return pfn;
+
+	/* Nothing found */
+	return start_pfn;
+}
+
+/* Split a multi-block free page into its individual pageblocks */
+static void split_large_buddy(struct zone *zone, struct page *page,
+			      unsigned long pfn, int order)
+{
+	unsigned long end_pfn = pfn + (1 << order);
+
+	VM_WARN_ON_ONCE(order <= pageblock_order);
+	VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1));
+
+	/* Caller removed page from freelist, buddy info cleared! */
+	VM_WARN_ON_ONCE(PageBuddy(page));
+
+	while (pfn != end_pfn) {
+		int mt = get_pfnblock_migratetype(page, pfn);
+
+		__free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE);
+		pfn += pageblock_nr_pages;
+		page = pfn_to_page(pfn);
+	}
+}
+
+/**
+ * move_freepages_block_isolate - move free pages in block for page isolation
+ * @zone: the zone
+ * @page: the pageblock page
+ * @migratetype: migratetype to set on the pageblock
+ *
+ * This is similar to move_freepages_block(), but handles the special
+ * case encountered in page isolation, where the block of interest
+ * might be part of a larger buddy spanning multiple pageblocks.
+ *
+ * Unlike the regular page allocator path, which moves pages while
+ * stealing buddies off the freelist, page isolation is interested in
+ * arbitrary pfn ranges that may have overlapping buddies on both ends.
+ *
+ * This function handles that. Straddling buddies are split into
+ * individual pageblocks. Only the block of interest is moved.
+ *
+ * Returns %true if pages could be moved, %false otherwise.
+ */
+bool move_freepages_block_isolate(struct zone *zone, struct page *page,
+				  int migratetype)
+{
+	unsigned long start_pfn, end_pfn, pfn;
+	int nr_moved, mt;
+
+	if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn,
+				       NULL, NULL))
+		return false;
+
+	/* No splits needed if buddies can't span multiple blocks */
+	if (pageblock_order == MAX_PAGE_ORDER)
+		goto move;
+
+	/* We're a tail block in a larger buddy */
+	pfn = find_large_buddy(start_pfn);
+	if (pfn != start_pfn) {
+		struct page *buddy = pfn_to_page(pfn);
+		int order = buddy_order(buddy);
+		int mt = get_pfnblock_migratetype(buddy, pfn);
+
+		if (!is_migrate_isolate(mt))
+			__mod_zone_freepage_state(zone, -(1UL << order), mt);
+		del_page_from_free_list(buddy, zone, order);
+		set_pageblock_migratetype(page, migratetype);
+		split_large_buddy(zone, buddy, pfn, order);
+		return true;
+	}
+
+	/* We're the starting block of a larger buddy */
+	if (PageBuddy(page) && buddy_order(page) > pageblock_order) {
+		int mt = get_pfnblock_migratetype(page, pfn);
+		int order = buddy_order(page);
+
+		if (!is_migrate_isolate(mt))
+			__mod_zone_freepage_state(zone, -(1UL << order), mt);
+		del_page_from_free_list(page, zone, order);
+		set_pageblock_migratetype(page, migratetype);
+		split_large_buddy(zone, page, pfn, order);
+		return true;
+	}
+move:
+	mt = get_pfnblock_migratetype(page, start_pfn);
+	nr_moved = move_freepages(zone, start_pfn, end_pfn, migratetype);
+	if (!is_migrate_isolate(mt))
+		__mod_zone_freepage_state(zone, -nr_moved, mt);
+	else if (!is_migrate_isolate(migratetype))
+		__mod_zone_freepage_state(zone, nr_moved, migratetype);
+	return true;
+}
+#endif /* CONFIG_MEMORY_ISOLATION */
 
 static void change_pageblock_range(struct page *pageblock_page,
 				   int start_order, int migratetype)
···
 		       unsigned migratetype, gfp_t gfp_mask)
 {
 	unsigned long outer_start, outer_end;
-	int order;
 	int ret = 0;
 
 	struct compact_control cc = {
···
 	 * We don't have to hold zone->lock here because the pages are
 	 * isolated thus they won't get removed from buddy.
 	 */
-
-	order = 0;
-	outer_start = start;
-	while (!PageBuddy(pfn_to_page(outer_start))) {
-		if (++order > MAX_PAGE_ORDER) {
-			outer_start = start;
-			break;
-		}
-		outer_start &= ~0UL << order;
-	}
-
-	if (outer_start != start) {
-		order = buddy_order(pfn_to_page(outer_start));
-
-		/*
-		 * outer_start page could be small order buddy page and
-		 * it doesn't include start page. Adjust outer_start
-		 * in this case to report failed page properly
-		 * on tracepoint in test_pages_isolated()
-		 */
-		if (outer_start + (1UL << order) <= start)
-			outer_start = start;
-	}
+	outer_start = find_large_buddy(start);
 
 	/* Make sure the range is really isolated. */
 	if (test_pages_isolated(outer_start, end, 0)) {

+32 -74	mm/page_isolation.c

···
 	unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
 			migratetype, isol_flags);
 	if (!unmovable) {
-		int nr_pages;
-		int mt = get_pageblock_migratetype(page);
-
-		nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
-		/* Block spans zone boundaries? */
-		if (nr_pages == -1) {
+		if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) {
 			spin_unlock_irqrestore(&zone->lock, flags);
 			return -EBUSY;
 		}
-		__mod_zone_freepage_state(zone, -nr_pages, mt);
 		zone->nr_isolate_pageblock++;
 		spin_unlock_irqrestore(&zone->lock, flags);
 		return 0;
···
 	 * allocation.
 	 */
 	if (!isolated_page) {
-		int nr_pages = move_freepages_block(zone, page, migratetype);
 		/*
 		 * Isolating this block already succeeded, so this
 		 * should not fail on zone boundaries.
 		 */
-		WARN_ON_ONCE(nr_pages == -1);
-		__mod_zone_freepage_state(zone, nr_pages, migratetype);
+		WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype));
 	} else {
 		set_pageblock_migratetype(page, migratetype);
 		__putback_isolated_page(page, order, migratetype);
···
 
 		VM_BUG_ON(!page);
 		pfn = page_to_pfn(page);
-		/*
-		 * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any
-		 * free pages in [start_pfn, boundary_pfn), its head page will
-		 * always be in the range.
-		 */
+
 		if (PageBuddy(page)) {
 			int order = buddy_order(page);
 
-			if (pfn + (1UL << order) > boundary_pfn) {
-				/* free page changed before split, check it again */
-				if (split_free_page(page, order, boundary_pfn - pfn))
-					continue;
-			}
+			/* move_freepages_block_isolate() handled this */
+			VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn);
 
 			pfn += 1UL << order;
 			continue;
 		}
+
 		/*
-		 * migrate compound pages then let the free page handling code
-		 * above do the rest. If migration is not possible, just fail.
+		 * If a compound page is straddling our block, attempt
+		 * to migrate it out of the way.
+		 *
+		 * We don't have to worry about this creating a large
+		 * free page that straddles into our block: gigantic
+		 * pages are freed as order-0 chunks, and LRU pages
+		 * (currently) do not exceed pageblock_order.
+		 *
+		 * The block of interest has already been marked
+		 * MIGRATE_ISOLATE above, so when migration is done it
+		 * will free its pages onto the correct freelists.
 		 */
 		if (PageCompound(page)) {
 			struct page *head = compound_head(page);
···
 				pfn = head_pfn + nr_pages;
 				continue;
 			}
+
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
-			/*
-			 * hugetlb, lru compound (THP), and movable compound pages
-			 * can be migrated. Otherwise, fail the isolation.
-			 */
-			if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
-				int order;
-				unsigned long outer_pfn;
+			if (PageHuge(page)) {
 				int page_mt = get_pageblock_migratetype(page);
-				bool isolate_page = !is_migrate_isolate_page(page);
 				struct compact_control cc = {
 					.nr_migratepages = 0,
 					.order = -1,
···
 				};
 				INIT_LIST_HEAD(&cc.migratepages);
 
-				/*
-				 * XXX: mark the page as MIGRATE_ISOLATE so that
-				 * no one else can grab the freed page after migration.
-				 * Ideally, the page should be freed as two separate
-				 * pages to be added into separate migratetype free
-				 * lists.
-				 */
-				if (isolate_page) {
-					ret = set_migratetype_isolate(page, page_mt,
-						flags, head_pfn, head_pfn + nr_pages);
-					if (ret)
-						goto failed;
-				}
-
 				ret = __alloc_contig_migrate_range(&cc, head_pfn,
 							head_pfn + nr_pages, page_mt);
-
-				/*
-				 * restore the page's migratetype so that it can
-				 * be split into separate migratetype free lists
-				 * later.
-				 */
-				if (isolate_page)
-					unset_migratetype_isolate(page, page_mt);
-
 				if (ret)
 					goto failed;
-				/*
-				 * reset pfn to the head of the free page, so
-				 * that the free page handling code above can split
-				 * the free page to the right migratetype list.
-				 *
-				 * head_pfn is not used here as a hugetlb page order
-				 * can be bigger than MAX_PAGE_ORDER, but after it is
-				 * freed, the free page order is not. Use pfn within
-				 * the range to find the head of the free page.
-				 */
-				order = 0;
-				outer_pfn = pfn;
-				while (!PageBuddy(pfn_to_page(outer_pfn))) {
-					/* stop if we cannot find the free page */
-					if (++order > MAX_PAGE_ORDER)
-						goto failed;
-					outer_pfn &= ~0UL << order;
-				}
-				pfn = outer_pfn;
+				pfn = head_pfn + nr_pages;
 				continue;
-			} else
+			}
+
+			/*
+			 * These pages are movable too, but they're
+			 * not expected to exceed pageblock_order.
+			 *
+			 * Let us know when they do, so we can add
+			 * proper free and split handling for them.
+			 */
+			VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
+			VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page);
 #endif
-				goto failed;
+			goto failed;
 		}
 
 		pfn++;