Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull more MM updates from Andrew Morton:

- Two follow-on fixes for the post-5.19 series "Use pageblock_order for
cma and alloc_contig_range alignment", from Zi Yan.

- A series of z3fold cleanups and fixes from Miaohe Lin.

- Some memcg selftests work from Michal Koutný <mkoutny@suse.com>.

- Some swap fixes and cleanups from Miaohe Lin.

- Several individual minor fixups.

* tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (25 commits)
mm/shmem.c: suppress shift warning
mm: Kconfig: reorganize misplaced mm options
mm: kasan: fix input of vmalloc_to_page()
mm: fix is_pinnable_page against a cma page
mm: filter out swapin error entry in shmem mapping
mm/shmem: fix infinite loop when swap in shmem error at swapoff time
mm/madvise: free hwpoison and swapin error entry in madvise_free_pte_range
mm/swapfile: fix lost swap bits in unuse_pte()
mm/swapfile: unuse_pte can map random data if swap read fails
selftests: memcg: factor out common parts of memory.{low,min} tests
selftests: memcg: remove protection from top level memcg
selftests: memcg: adjust expected reclaim values of protected cgroups
selftests: memcg: expect no low events in unprotected sibling
selftests: memcg: fix compilation
mm/z3fold: fix z3fold_page_migrate races with z3fold_map
mm/z3fold: fix z3fold_reclaim_page races with z3fold_free
mm/z3fold: always clear PAGE_CLAIMED under z3fold page lock
mm/z3fold: put z3fold page back into unbuddied list when reclaim or migration fails
revert "mm/z3fold.c: allow __GFP_HIGHMEM in z3fold_alloc"
mm/z3fold: throw warning on failure of trylock_page in z3fold_alloc
...

+441 -369
+1
MAINTAINERS
··· 5062 5062 S: Maintained 5063 5063 F: mm/memcontrol.c 5064 5064 F: mm/swap_cgroup.c 5065 + F: tools/testing/selftests/cgroup/memcg_protection.m 5065 5066 F: tools/testing/selftests/cgroup/test_kmem.c 5066 5067 F: tools/testing/selftests/cgroup/test_memcontrol.c 5067 5068
+7 -2
include/linux/mm.h
··· 1594 1594 #ifdef CONFIG_MIGRATION 1595 1595 static inline bool is_pinnable_page(struct page *page) 1596 1596 { 1597 - return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) || 1598 - is_zero_pfn(page_to_pfn(page)); 1597 + #ifdef CONFIG_CMA 1598 + int mt = get_pageblock_migratetype(page); 1599 + 1600 + if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) 1601 + return false; 1602 + #endif 1603 + return !(is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page))); 1599 1604 } 1600 1605 #else 1601 1606 static inline bool is_pinnable_page(struct page *page)
+6 -1
include/linux/swap.h
··· 55 55 * actions on faults. 56 56 */ 57 57 58 + #define SWP_SWAPIN_ERROR_NUM 1 59 + #define SWP_SWAPIN_ERROR (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ 60 + SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \ 61 + SWP_PTE_MARKER_NUM) 58 62 /* 59 63 * PTE markers are used to persist information onto PTEs that are mapped with 60 64 * file-backed memories. As its name "PTE" hints, it should only be applied to ··· 124 120 125 121 #define MAX_SWAPFILES \ 126 122 ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ 127 - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM) 123 + SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ 124 + SWP_PTE_MARKER_NUM - SWP_SWAPIN_ERROR_NUM) 128 125 129 126 /* 130 127 * Magic header for a swap area. The first part of the union is
+10
include/linux/swapops.h
··· 108 108 return xa_mk_value(entry.val); 109 109 } 110 110 111 + static inline swp_entry_t make_swapin_error_entry(struct page *page) 112 + { 113 + return swp_entry(SWP_SWAPIN_ERROR, page_to_pfn(page)); 114 + } 115 + 116 + static inline int is_swapin_error_entry(swp_entry_t entry) 117 + { 118 + return swp_type(entry) == SWP_SWAPIN_ERROR; 119 + } 120 + 111 121 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) 112 122 static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) 113 123 {
-54
init/Kconfig
··· 1842 1842 1843 1843 endmenu 1844 1844 1845 - config VM_EVENT_COUNTERS 1846 - default y 1847 - bool "Enable VM event counters for /proc/vmstat" if EXPERT 1848 - help 1849 - VM event counters are needed for event counts to be shown. 1850 - This option allows the disabling of the VM event counters 1851 - on EXPERT systems. /proc/vmstat will only show page counts 1852 - if VM event counters are disabled. 1853 - 1854 - config SLUB_DEBUG 1855 - default y 1856 - bool "Enable SLUB debugging support" if EXPERT 1857 - depends on SLUB && SYSFS 1858 - select STACKDEPOT if STACKTRACE_SUPPORT 1859 - help 1860 - SLUB has extensive debug support features. Disabling these can 1861 - result in significant savings in code size. This also disables 1862 - SLUB sysfs support. /sys/slab will not exist and there will be 1863 - no support for cache validation etc. 1864 - 1865 - config COMPAT_BRK 1866 - bool "Disable heap randomization" 1867 - default y 1868 - help 1869 - Randomizing heap placement makes heap exploits harder, but it 1870 - also breaks ancient binaries (including anything libc5 based). 1871 - This option changes the bootup default to heap randomization 1872 - disabled, and can be overridden at runtime by setting 1873 - /proc/sys/kernel/randomize_va_space to 2. 1874 - 1875 - On non-ancient distros (post-2000 ones) N is usually a safe choice. 1876 - 1877 - config MMAP_ALLOW_UNINITIALIZED 1878 - bool "Allow mmapped anonymous memory to be uninitialized" 1879 - depends on EXPERT && !MMU 1880 - default n 1881 - help 1882 - Normally, and according to the Linux spec, anonymous memory obtained 1883 - from mmap() has its contents cleared before it is passed to 1884 - userspace. Enabling this config option allows you to request that 1885 - mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus 1886 - providing a huge performance boost. If this option is not enabled, 1887 - then the flag will be ignored. 
1888 - 1889 - This is taken advantage of by uClibc's malloc(), and also by 1890 - ELF-FDPIC binfmt's brk and stack allocator. 1891 - 1892 - Because of the obvious security issues, this option should only be 1893 - enabled on embedded devices where you control what is run in 1894 - userspace. Since that isn't generally a problem on no-MMU systems, 1895 - it is normally safe to say Y here. 1896 - 1897 - See Documentation/admin-guide/mm/nommu-mmap.rst for more information. 1898 - 1899 1845 config SYSTEM_DATA_VERIFICATION 1900 1846 def_bool n 1901 1847 select SYSTEM_TRUSTED_KEYRING
-35
lib/Kconfig.debug
··· 699 699 help 700 700 Debug objects boot parameter default value 701 701 702 - config DEBUG_SLAB 703 - bool "Debug slab memory allocations" 704 - depends on DEBUG_KERNEL && SLAB 705 - help 706 - Say Y here to have the kernel do limited verification on memory 707 - allocation as well as poisoning memory on free to catch use of freed 708 - memory. This can make kmalloc/kfree-intensive workloads much slower. 709 - 710 - config SLUB_DEBUG_ON 711 - bool "SLUB debugging on by default" 712 - depends on SLUB && SLUB_DEBUG 713 - select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT 714 - default n 715 - help 716 - Boot with debugging on by default. SLUB boots by default with 717 - the runtime debug capabilities switched off. Enabling this is 718 - equivalent to specifying the "slub_debug" parameter on boot. 719 - There is no support for more fine grained debug control like 720 - possible with slub_debug=xxx. SLUB debugging may be switched 721 - off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying 722 - "slub_debug=-". 723 - 724 - config SLUB_STATS 725 - default n 726 - bool "Enable SLUB performance statistics" 727 - depends on SLUB && SYSFS 728 - help 729 - SLUB statistics are useful to debug SLUBs allocation behavior in 730 - order find ways to optimize the allocator. This should never be 731 - enabled for production use since keeping statistics slows down 732 - the allocator by a few percentage points. The slabinfo command 733 - supports the determination of the most active slabs to figure 734 - out which slabs are relevant to a particular load. 735 - Try running: slabinfo -DA 736 - 737 702 config HAVE_DEBUG_KMEMLEAK 738 703 bool 739 704
+56
mm/Kconfig
··· 270 270 sanity-checking than others. This option is most effective with 271 271 CONFIG_SLUB. 272 272 273 + config SLUB_STATS 274 + default n 275 + bool "Enable SLUB performance statistics" 276 + depends on SLUB && SYSFS 277 + help 278 + SLUB statistics are useful to debug SLUBs allocation behavior in 279 + order find ways to optimize the allocator. This should never be 280 + enabled for production use since keeping statistics slows down 281 + the allocator by a few percentage points. The slabinfo command 282 + supports the determination of the most active slabs to figure 283 + out which slabs are relevant to a particular load. 284 + Try running: slabinfo -DA 285 + 273 286 config SLUB_CPU_PARTIAL 274 287 default y 275 288 depends on SLUB && SMP ··· 319 306 'page_alloc.shuffle' kernel command line parameter. 320 307 321 308 Say Y if unsure. 309 + 310 + config COMPAT_BRK 311 + bool "Disable heap randomization" 312 + default y 313 + help 314 + Randomizing heap placement makes heap exploits harder, but it 315 + also breaks ancient binaries (including anything libc5 based). 316 + This option changes the bootup default to heap randomization 317 + disabled, and can be overridden at runtime by setting 318 + /proc/sys/kernel/randomize_va_space to 2. 319 + 320 + On non-ancient distros (post-2000 ones) N is usually a safe choice. 321 + 322 + config MMAP_ALLOW_UNINITIALIZED 323 + bool "Allow mmapped anonymous memory to be uninitialized" 324 + depends on EXPERT && !MMU 325 + default n 326 + help 327 + Normally, and according to the Linux spec, anonymous memory obtained 328 + from mmap() has its contents cleared before it is passed to 329 + userspace. Enabling this config option allows you to request that 330 + mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus 331 + providing a huge performance boost. If this option is not enabled, 332 + then the flag will be ignored. 
333 + 334 + This is taken advantage of by uClibc's malloc(), and also by 335 + ELF-FDPIC binfmt's brk and stack allocator. 336 + 337 + Because of the obvious security issues, this option should only be 338 + enabled on embedded devices where you control what is run in 339 + userspace. Since that isn't generally a problem on no-MMU systems, 340 + it is normally safe to say Y here. 341 + 342 + See Documentation/admin-guide/mm/nommu-mmap.rst for more information. 322 343 323 344 config SELECT_MEMORY_MODEL 324 345 def_bool y ··· 1010 963 bool 1011 964 config ARCH_HAS_PKEYS 1012 965 bool 966 + 967 + config VM_EVENT_COUNTERS 968 + default y 969 + bool "Enable VM event counters for /proc/vmstat" if EXPERT 970 + help 971 + VM event counters are needed for event counts to be shown. 972 + This option allows the disabling of the VM event counters 973 + on EXPERT systems. /proc/vmstat will only show page counts 974 + if VM event counters are disabled. 1013 975 1014 976 config PERCPU_STATS 1015 977 bool "Collect percpu memory statistics"
+33
mm/Kconfig.debug
··· 45 45 Enable debug page memory allocations by default? This value 46 46 can be overridden by debug_pagealloc=off|on. 47 47 48 + config DEBUG_SLAB 49 + bool "Debug slab memory allocations" 50 + depends on DEBUG_KERNEL && SLAB 51 + help 52 + Say Y here to have the kernel do limited verification on memory 53 + allocation as well as poisoning memory on free to catch use of freed 54 + memory. This can make kmalloc/kfree-intensive workloads much slower. 55 + 56 + config SLUB_DEBUG 57 + default y 58 + bool "Enable SLUB debugging support" if EXPERT 59 + depends on SLUB && SYSFS 60 + select STACKDEPOT if STACKTRACE_SUPPORT 61 + help 62 + SLUB has extensive debug support features. Disabling these can 63 + result in significant savings in code size. This also disables 64 + SLUB sysfs support. /sys/slab will not exist and there will be 65 + no support for cache validation etc. 66 + 67 + config SLUB_DEBUG_ON 68 + bool "SLUB debugging on by default" 69 + depends on SLUB && SLUB_DEBUG 70 + select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT 71 + default n 72 + help 73 + Boot with debugging on by default. SLUB boots by default with 74 + the runtime debug capabilities switched off. Enabling this is 75 + equivalent to specifying the "slub_debug" parameter on boot. 76 + There is no support for more fine grained debug control like 77 + possible with slub_debug=xxx. SLUB debugging may be switched 78 + off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying 79 + "slub_debug=-". 80 + 48 81 config PAGE_OWNER 49 82 bool "Track page owner" 50 83 depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
+2 -2
mm/internal.h
··· 374 374 phys_addr_t min_addr, 375 375 int nid, bool exact_nid); 376 376 377 - void split_free_page(struct page *free_page, 378 - int order, unsigned long split_pfn_offset); 377 + int split_free_page(struct page *free_page, 378 + unsigned int order, unsigned long split_pfn_offset); 379 379 380 380 #if defined CONFIG_COMPACTION || defined CONFIG_CMA 381 381
+1 -1
mm/kasan/report.c
··· 347 347 va->addr, va->addr + va->size, va->caller); 348 348 pr_err("\n"); 349 349 350 - page = vmalloc_to_page(page); 350 + page = vmalloc_to_page(addr); 351 351 } 352 352 } 353 353
+12 -6
mm/madvise.c
··· 248 248 249 249 if (!xa_is_value(page)) 250 250 continue; 251 + swap = radix_to_swp_entry(page); 252 + /* There might be swapin error entries in shmem mapping. */ 253 + if (non_swap_entry(swap)) 254 + continue; 251 255 xas_pause(&xas); 252 256 rcu_read_unlock(); 253 257 254 - swap = radix_to_swp_entry(page); 255 258 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, 256 259 NULL, 0, false, &splug); 257 260 if (page) ··· 627 624 swp_entry_t entry; 628 625 629 626 entry = pte_to_swp_entry(ptent); 630 - if (non_swap_entry(entry)) 631 - continue; 632 - nr_swap--; 633 - free_swap_and_cache(entry); 634 - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 627 + if (!non_swap_entry(entry)) { 628 + nr_swap--; 629 + free_swap_and_cache(entry); 630 + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 631 + } else if (is_hwpoison_entry(entry) || 632 + is_swapin_error_entry(entry)) { 633 + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 634 + } 635 635 continue; 636 636 } 637 637
+4 -1
mm/memory.c
··· 1487 1487 /* Only drop the uffd-wp marker if explicitly requested */ 1488 1488 if (!zap_drop_file_uffd_wp(details)) 1489 1489 continue; 1490 - } else if (is_hwpoison_entry(entry)) { 1490 + } else if (is_hwpoison_entry(entry) || 1491 + is_swapin_error_entry(entry)) { 1491 1492 if (!should_zap_cows(details)) 1492 1493 continue; 1493 1494 } else { ··· 3728 3727 ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); 3729 3728 } else if (is_hwpoison_entry(entry)) { 3730 3729 ret = VM_FAULT_HWPOISON; 3730 + } else if (is_swapin_error_entry(entry)) { 3731 + ret = VM_FAULT_SIGBUS; 3731 3732 } else if (is_pte_marker_entry(entry)) { 3732 3733 ret = handle_pte_marker(vmf); 3733 3734 } else {
+26 -6
mm/page_alloc.c
··· 482 482 bitidx = pfn_to_bitidx(page, pfn); 483 483 word_bitidx = bitidx / BITS_PER_LONG; 484 484 bitidx &= (BITS_PER_LONG-1); 485 - 486 - word = bitmap[word_bitidx]; 485 + /* 486 + * This races, without locks, with set_pfnblock_flags_mask(). Ensure 487 + * a consistent read of the memory array, so that results, even though 488 + * racy, are not corrupted. 489 + */ 490 + word = READ_ONCE(bitmap[word_bitidx]); 487 491 return (word >> bitidx) & mask; 488 492 } 489 493 ··· 1104 1100 * @order: the order of the page 1105 1101 * @split_pfn_offset: split offset within the page 1106 1102 * 1103 + * Return -ENOENT if the free page is changed, otherwise 0 1104 + * 1107 1105 * It is used when the free page crosses two pageblocks with different migratetypes 1108 1106 * at split_pfn_offset within the page. The split free page will be put into 1109 1107 * separate migratetype lists afterwards. Otherwise, the function achieves 1110 1108 * nothing. 1111 1109 */ 1112 - void split_free_page(struct page *free_page, 1113 - int order, unsigned long split_pfn_offset) 1110 + int split_free_page(struct page *free_page, 1111 + unsigned int order, unsigned long split_pfn_offset) 1114 1112 { 1115 1113 struct zone *zone = page_zone(free_page); 1116 1114 unsigned long free_page_pfn = page_to_pfn(free_page); 1117 1115 unsigned long pfn; 1118 1116 unsigned long flags; 1119 1117 int free_page_order; 1118 + int mt; 1119 + int ret = 0; 1120 1120 1121 1121 if (split_pfn_offset == 0) 1122 - return; 1122 + return ret; 1123 1123 1124 1124 spin_lock_irqsave(&zone->lock, flags); 1125 + 1126 + if (!PageBuddy(free_page) || buddy_order(free_page) != order) { 1127 + ret = -ENOENT; 1128 + goto out; 1129 + } 1130 + 1131 + mt = get_pageblock_migratetype(free_page); 1132 + if (likely(!is_migrate_isolate(mt))) 1133 + __mod_zone_freepage_state(zone, -(1UL << order), mt); 1134 + 1125 1135 del_page_from_free_list(free_page, zone, order); 1126 1136 for (pfn = free_page_pfn; 1127 1137 pfn < free_page_pfn + (1UL << 
order);) { 1128 1138 int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); 1129 1139 1130 - free_page_order = min_t(int, 1140 + free_page_order = min_t(unsigned int, 1131 1141 pfn ? __ffs(pfn) : order, 1132 1142 __fls(split_pfn_offset)); 1133 1143 __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, ··· 1152 1134 if (split_pfn_offset == 0) 1153 1135 split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); 1154 1136 } 1137 + out: 1155 1138 spin_unlock_irqrestore(&zone->lock, flags); 1139 + return ret; 1156 1140 } 1157 1141 /* 1158 1142 * A bad page could be due to a number of fields. Instead of multiple branches,
+25 -11
mm/page_isolation.c
··· 300 300 * the in-use page then splitting the free page. 301 301 */ 302 302 static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, 303 - gfp_t gfp_flags, bool isolate_before) 303 + gfp_t gfp_flags, bool isolate_before, bool skip_isolation) 304 304 { 305 305 unsigned char saved_mt; 306 306 unsigned long start_pfn; ··· 327 327 zone->zone_start_pfn); 328 328 329 329 saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); 330 - ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags, 331 - isolate_pageblock, isolate_pageblock + pageblock_nr_pages); 332 330 333 - if (ret) 334 - return ret; 331 + if (skip_isolation) 332 + VM_BUG_ON(!is_migrate_isolate(saved_mt)); 333 + else { 334 + ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags, 335 + isolate_pageblock, isolate_pageblock + pageblock_nr_pages); 336 + 337 + if (ret) 338 + return ret; 339 + } 335 340 336 341 /* 337 342 * Bail out early when the to-be-isolated pageblock does not form ··· 371 366 if (PageBuddy(page)) { 372 367 int order = buddy_order(page); 373 368 374 - if (pfn + (1UL << order) > boundary_pfn) 375 - split_free_page(page, order, boundary_pfn - pfn); 376 - pfn += (1UL << order); 369 + if (pfn + (1UL << order) > boundary_pfn) { 370 + /* free page changed before split, check it again */ 371 + if (split_free_page(page, order, boundary_pfn - pfn)) 372 + continue; 373 + } 374 + 375 + pfn += 1UL << order; 377 376 continue; 378 377 } 379 378 /* ··· 472 463 return 0; 473 464 failed: 474 465 /* restore the original migratetype */ 475 - unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt); 466 + if (!skip_isolation) 467 + unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt); 476 468 return -EBUSY; 477 469 } 478 470 ··· 532 522 unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); 533 523 unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); 534 524 int ret; 525 + bool 
skip_isolation = false; 535 526 536 527 /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ 537 - ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false); 528 + ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation); 538 529 if (ret) 539 530 return ret; 540 531 532 + if (isolate_start == isolate_end - pageblock_nr_pages) 533 + skip_isolation = true; 534 + 541 535 /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ 542 - ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true); 536 + ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation); 543 537 if (ret) { 544 538 unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); 545 539 return ret;
+40 -1
mm/shmem.c
··· 1174 1174 continue; 1175 1175 1176 1176 entry = radix_to_swp_entry(folio); 1177 + /* 1178 + * swapin error entries can be found in the mapping. But they're 1179 + * deliberately ignored here as we've done everything we can do. 1180 + */ 1177 1181 if (swp_type(entry) != type) 1178 1182 continue; 1179 1183 ··· 1675 1671 return error; 1676 1672 } 1677 1673 1674 + static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, 1675 + struct folio *folio, swp_entry_t swap) 1676 + { 1677 + struct address_space *mapping = inode->i_mapping; 1678 + struct shmem_inode_info *info = SHMEM_I(inode); 1679 + swp_entry_t swapin_error; 1680 + void *old; 1681 + 1682 + swapin_error = make_swapin_error_entry(&folio->page); 1683 + old = xa_cmpxchg_irq(&mapping->i_pages, index, 1684 + swp_to_radix_entry(swap), 1685 + swp_to_radix_entry(swapin_error), 0); 1686 + if (old != swp_to_radix_entry(swap)) 1687 + return; 1688 + 1689 + folio_wait_writeback(folio); 1690 + delete_from_swap_cache(&folio->page); 1691 + spin_lock_irq(&info->lock); 1692 + /* 1693 + * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't 1694 + * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in 1695 + * shmem_evict_inode. 1696 + */ 1697 + info->alloced--; 1698 + info->swapped--; 1699 + shmem_recalc_inode(inode); 1700 + spin_unlock_irq(&info->lock); 1701 + swap_free(swap); 1702 + } 1703 + 1678 1704 /* 1679 1705 * Swap in the page pointed to by *pagep. 1680 1706 * Caller has to make sure that *pagep contains a valid swapped page. ··· 1727 1693 VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); 1728 1694 swap = radix_to_swp_entry(*foliop); 1729 1695 *foliop = NULL; 1696 + 1697 + if (is_swapin_error_entry(swap)) 1698 + return -EIO; 1730 1699 1731 1700 /* Look it up and read it in.. 
*/ 1732 1701 page = lookup_swap_cache(swap, NULL, 0); ··· 1798 1761 failed: 1799 1762 if (!shmem_confirm_swap(mapping, index, swap)) 1800 1763 error = -EEXIST; 1764 + if (error == -EIO) 1765 + shmem_set_folio_swapin_error(inode, index, folio, swap); 1801 1766 unlock: 1802 1767 if (folio) { 1803 1768 folio_unlock(folio); ··· 1945 1906 1946 1907 spin_lock_irq(&info->lock); 1947 1908 info->alloced += folio_nr_pages(folio); 1948 - inode->i_blocks += BLOCKS_PER_PAGE << folio_order(folio); 1909 + inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio); 1949 1910 shmem_recalc_inode(inode); 1950 1911 spin_unlock_irq(&info->lock); 1951 1912 alloced = true;
+3
mm/swap_state.c
··· 410 410 return NULL; 411 411 412 412 swp = radix_to_swp_entry(page); 413 + /* There might be swapin error entries in shmem mapping. */ 414 + if (non_swap_entry(swp)) 415 + return NULL; 413 416 /* Prevent swapoff from happening to us */ 414 417 si = get_swap_device(swp); 415 418 if (!si)
+18 -3
mm/swapfile.c
··· 1775 1775 { 1776 1776 struct page *swapcache; 1777 1777 spinlock_t *ptl; 1778 - pte_t *pte; 1778 + pte_t *pte, new_pte; 1779 1779 int ret = 1; 1780 1780 1781 1781 swapcache = page; ··· 1785 1785 1786 1786 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 1787 1787 if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { 1788 + ret = 0; 1789 + goto out; 1790 + } 1791 + 1792 + if (unlikely(!PageUptodate(page))) { 1793 + pte_t pteval; 1794 + 1795 + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); 1796 + pteval = swp_entry_to_pte(make_swapin_error_entry(page)); 1797 + set_pte_at(vma->vm_mm, addr, pte, pteval); 1798 + swap_free(entry); 1788 1799 ret = 0; 1789 1800 goto out; 1790 1801 } ··· 1824 1813 page_add_new_anon_rmap(page, vma, addr); 1825 1814 lru_cache_add_inactive_or_unevictable(page, vma); 1826 1815 } 1827 - set_pte_at(vma->vm_mm, addr, pte, 1828 - pte_mkold(mk_pte(page, vma->vm_page_prot))); 1816 + new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); 1817 + if (pte_swp_soft_dirty(*pte)) 1818 + new_pte = pte_mksoft_dirty(new_pte); 1819 + if (pte_swp_uffd_wp(*pte)) 1820 + new_pte = pte_mkuffd_wp(new_pte); 1821 + set_pte_at(vma->vm_mm, addr, pte, new_pte); 1829 1822 swap_free(entry); 1830 1823 out: 1831 1824 pte_unmap_unlock(pte, ptl);
+41 -56
mm/z3fold.c
··· 181 181 NEEDS_COMPACTING, 182 182 PAGE_STALE, 183 183 PAGE_CLAIMED, /* by either reclaim or free */ 184 + PAGE_MIGRATED, /* page is migrated and soon to be released */ 184 185 }; 185 186 186 187 /* ··· 213 212 static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, 214 213 gfp_t gfp) 215 214 { 216 - struct z3fold_buddy_slots *slots; 217 - 218 - slots = kmem_cache_zalloc(pool->c_handle, 219 - (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE))); 215 + struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle, 216 + gfp); 220 217 221 218 if (slots) { 222 219 /* It will be freed separately in free_handle(). */ ··· 271 272 zhdr = (struct z3fold_header *)(addr & PAGE_MASK); 272 273 locked = z3fold_page_trylock(zhdr); 273 274 read_unlock(&slots->lock); 274 - if (locked) 275 - break; 275 + if (locked) { 276 + struct page *page = virt_to_page(zhdr); 277 + 278 + if (!test_bit(PAGE_MIGRATED, &page->private)) 279 + break; 280 + z3fold_page_unlock(zhdr); 281 + } 276 282 cpu_relax(); 277 283 } while (true); 278 284 } else { ··· 395 391 clear_bit(NEEDS_COMPACTING, &page->private); 396 392 clear_bit(PAGE_STALE, &page->private); 397 393 clear_bit(PAGE_CLAIMED, &page->private); 394 + clear_bit(PAGE_MIGRATED, &page->private); 398 395 if (headless) 399 396 return zhdr; 400 397 ··· 524 519 spin_unlock(&pool->stale_lock); 525 520 526 521 atomic64_dec(&pool->pages_nr); 527 - } 528 - 529 - static void release_z3fold_page(struct kref *ref) 530 - { 531 - struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, 532 - refcount); 533 - __release_z3fold_page(zhdr, false); 534 522 } 535 523 536 524 static void release_z3fold_page_locked(struct kref *ref) ··· 938 940 } 939 941 } 940 942 941 - if (zhdr && !zhdr->slots) 942 - zhdr->slots = alloc_slots(pool, 943 - can_sleep ? 
GFP_NOIO : GFP_ATOMIC); 943 + if (zhdr && !zhdr->slots) { 944 + zhdr->slots = alloc_slots(pool, GFP_ATOMIC); 945 + if (!zhdr->slots) 946 + goto out_fail; 947 + } 944 948 return zhdr; 949 + 950 + out_fail: 951 + if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) { 952 + add_to_unbuddied(pool, zhdr); 953 + z3fold_page_unlock(zhdr); 954 + } 955 + return NULL; 945 956 } 946 957 947 958 /* ··· 1073 1066 enum buddy bud; 1074 1067 bool can_sleep = gfpflags_allow_blocking(gfp); 1075 1068 1076 - if (!size) 1069 + if (!size || (gfp & __GFP_HIGHMEM)) 1077 1070 return -EINVAL; 1078 1071 1079 1072 if (size > PAGE_SIZE) ··· 1100 1093 bud = FIRST; 1101 1094 } 1102 1095 1103 - page = NULL; 1104 - if (can_sleep) { 1105 - spin_lock(&pool->stale_lock); 1106 - zhdr = list_first_entry_or_null(&pool->stale, 1107 - struct z3fold_header, buddy); 1108 - /* 1109 - * Before allocating a page, let's see if we can take one from 1110 - * the stale pages list. cancel_work_sync() can sleep so we 1111 - * limit this case to the contexts where we can sleep 1112 - */ 1113 - if (zhdr) { 1114 - list_del(&zhdr->buddy); 1115 - spin_unlock(&pool->stale_lock); 1116 - cancel_work_sync(&zhdr->work); 1117 - page = virt_to_page(zhdr); 1118 - } else { 1119 - spin_unlock(&pool->stale_lock); 1120 - } 1121 - } 1122 - if (!page) 1123 - page = alloc_page(gfp); 1124 - 1096 + page = alloc_page(gfp); 1125 1097 if (!page) 1126 1098 return -ENOMEM; 1127 1099 ··· 1120 1134 __SetPageMovable(page, pool->inode->i_mapping); 1121 1135 unlock_page(page); 1122 1136 } else { 1123 - if (trylock_page(page)) { 1124 - __SetPageMovable(page, pool->inode->i_mapping); 1125 - unlock_page(page); 1126 - } 1137 + WARN_ON(!trylock_page(page)); 1138 + __SetPageMovable(page, pool->inode->i_mapping); 1139 + unlock_page(page); 1127 1140 } 1128 1141 z3fold_page_lock(zhdr); 1129 1142 ··· 1221 1236 return; 1222 1237 } 1223 1238 if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { 1224 - put_z3fold_header(zhdr); 1225 1239 
clear_bit(PAGE_CLAIMED, &page->private); 1240 + put_z3fold_header(zhdr); 1226 1241 return; 1227 1242 } 1228 1243 if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) { ··· 1317 1332 break; 1318 1333 } 1319 1334 1320 - if (kref_get_unless_zero(&zhdr->refcount) == 0) { 1321 - zhdr = NULL; 1322 - break; 1323 - } 1324 1335 if (!z3fold_page_trylock(zhdr)) { 1325 - kref_put(&zhdr->refcount, release_z3fold_page); 1326 1336 zhdr = NULL; 1327 1337 continue; /* can't evict at this point */ 1328 1338 } ··· 1328 1348 */ 1329 1349 if (zhdr->foreign_handles || 1330 1350 test_and_set_bit(PAGE_CLAIMED, &page->private)) { 1331 - if (!kref_put(&zhdr->refcount, 1332 - release_z3fold_page_locked)) 1333 - z3fold_page_unlock(zhdr); 1351 + z3fold_page_unlock(zhdr); 1334 1352 zhdr = NULL; 1335 1353 continue; /* can't evict such page */ 1336 1354 } 1337 1355 list_del_init(&zhdr->buddy); 1338 1356 zhdr->cpu = -1; 1357 + /* See comment in __z3fold_alloc. */ 1358 + kref_get(&zhdr->refcount); 1339 1359 break; 1340 1360 } 1341 1361 ··· 1417 1437 spin_lock(&pool->lock); 1418 1438 list_add(&page->lru, &pool->lru); 1419 1439 spin_unlock(&pool->lock); 1420 - z3fold_page_unlock(zhdr); 1440 + if (list_empty(&zhdr->buddy)) 1441 + add_to_unbuddied(pool, zhdr); 1421 1442 clear_bit(PAGE_CLAIMED, &page->private); 1443 + z3fold_page_unlock(zhdr); 1422 1444 } 1423 1445 1424 1446 /* We started off locked to we need to lock the pool back */ ··· 1572 1590 if (!z3fold_page_trylock(zhdr)) 1573 1591 return -EAGAIN; 1574 1592 if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) { 1575 - z3fold_page_unlock(zhdr); 1576 1593 clear_bit(PAGE_CLAIMED, &page->private); 1594 + z3fold_page_unlock(zhdr); 1577 1595 return -EBUSY; 1578 1596 } 1579 1597 if (work_pending(&zhdr->work)) { ··· 1583 1601 new_zhdr = page_address(newpage); 1584 1602 memcpy(new_zhdr, zhdr, PAGE_SIZE); 1585 1603 newpage->private = page->private; 1586 - page->private = 0; 1604 + set_bit(PAGE_MIGRATED, &page->private); 1587 1605 z3fold_page_unlock(zhdr); 
1588 1606 spin_lock_init(&new_zhdr->page_lock); 1589 1607 INIT_WORK(&new_zhdr->work, compact_page_work); ··· 1613 1631 1614 1632 queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); 1615 1633 1616 - clear_bit(PAGE_CLAIMED, &page->private); 1634 + /* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */ 1635 + page->private = 0; 1617 1636 put_page(page); 1618 1637 return 0; 1619 1638 } ··· 1636 1653 spin_lock(&pool->lock); 1637 1654 list_add(&page->lru, &pool->lru); 1638 1655 spin_unlock(&pool->lock); 1656 + if (list_empty(&zhdr->buddy)) 1657 + add_to_unbuddied(pool, zhdr); 1639 1658 clear_bit(PAGE_CLAIMED, &page->private); 1640 1659 z3fold_page_unlock(zhdr); 1641 1660 }
+89
tools/testing/selftests/cgroup/memcg_protection.m
% SPDX-License-Identifier: GPL-2.0
%
% run as: octave-cli memcg_protection.m
%
% Model of reclaim protection on a single level of the memcg hierarchy. It
% illustrates how overcommitted protection is spread among siblings, which
% depends on their current consumption as well as on their nominal protection.
%
% The model assumes that the siblings first consume their initial amount of
% memory without any reclaim and only then reclaim starts. All memory is
% reclaimable and treated the same. Only non-low reclaim is simulated and all
% memory.min values are assumed to be 0.
%
% Input configuration
% -------------------
% E	number	parent effective protection
% n	vector	nominal protection of the siblings at this level (memory.low)
% c	vector	current consumption of the siblings (memory.current)
%
% Output
% ------
% alpha is echoed at startup; at the end the script prints the index of the
% last simulated step (t), the final consumption (c) and the effective
% protection computed in that last step (e). ch, eh and rh hold one row per
% completed step with the history of c, e and r.

% example from testcase (values in GB)
E = 50 / 1024;
n = [75 25 0 500] / 1024;
c = [50 50 50 0] / 1024;

% Reclaim parameters
% ------------------

% Minimal reclaim amount (GB), used for the SWAP_CLUSTER_MAX roundup below
min_reclaim = (32 * 4) / 2^20;

% Reclaim coefficient (think as 0.5^sc->priority)
% (no trailing semicolon, so the value is echoed)
alpha = .1

% Simulation parameters
% ---------------------
tol = 1e-7;		% reclaim below this amount counts as "nothing"
max_steps = 1000;	% upper bound on the number of simulated steps

% Simulation loop
% ---------------

% per-step history of consumption, effective protection and reclaim
ch = [];
eh = [];
rh = [];

for t = 1:max_steps
	% low_usage: the part of the consumption covered by nominal
	% protection, and its sum over all siblings
	covered = min(n, c);
	siblings_usage = sum(covered);

	% effective_protection(): start with nominal, then scale down when
	% the siblings together claim more than the parent can hand out
	scale = min(1, E / siblings_usage);
	e = covered * scale;

	% recursive protection: parent protection that nobody claimed is
	% distributed proportionally to each sibling's unprotected usage
	spare = max(0, E - siblings_usage);
	total_overuse = sum(c) - siblings_usage;
	if (spare > 0 && total_overuse > 0)
		overuse = max(0, c - covered);
		e = e + spare * (overuse / total_overuse);
	end

	% get_scan_count(): assume all memory is in a single LRU list
	r = alpha * c;

	% commit 1bc63fb1272b ("mm, memcg: make scan aggression always
	% exclude protection")
	lru_size = max(e, c);
	r = r .* (1 - (e + tol) ./ (lru_size + tol));

	% uncomment to debug prints
	% e, c, r

	% nothing to reclaim, reached equilibrium
	if (max(r) < tol)
		break;
	end

	% SWAP_CLUSTER_MAX roundup: any non-negligible reclaim takes at least
	% the minimal amount
	r_floor = min_reclaim * (r > tol);
	r = max(r, r_floor);

	% XXX all siblings are reclaimed in parallel here; in reality reclaim
	% is serialized and each sibling recalculates its own residual
	c = max(c - r, 0);

	ch = vertcat(ch, c);
	eh = vertcat(eh, e);
	rh = vertcat(rh, r);
end

t
c, e
+67 -190
tools/testing/selftests/cgroup/test_memcontrol.c
··· 190 190 return ret; 191 191 } 192 192 193 - static int alloc_pagecache_50M(const char *cgroup, void *arg) 194 - { 195 - int fd = (long)arg; 196 - 197 - return alloc_pagecache(fd, MB(50)); 198 - } 199 - 200 193 static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg) 201 194 { 202 195 int fd = (long)arg; ··· 240 247 241 248 /* 242 249 * First, this test creates the following hierarchy: 243 - * A memory.min = 50M, memory.max = 200M 244 - * A/B memory.min = 50M, memory.current = 50M 250 + * A memory.min = 0, memory.max = 200M 251 + * A/B memory.min = 50M 245 252 * A/B/C memory.min = 75M, memory.current = 50M 246 253 * A/B/D memory.min = 25M, memory.current = 50M 247 254 * A/B/E memory.min = 0, memory.current = 50M 248 255 * A/B/F memory.min = 500M, memory.current = 0 249 256 * 250 - * Usages are pagecache, but the test keeps a running 257 + * (or memory.low if we test soft protection) 258 + * 259 + * Usages are pagecache and the test keeps a running 251 260 * process in every leaf cgroup. 252 261 * Then it creates A/G and creates a significant 253 - * memory pressure in it. 262 + * memory pressure in A. 254 263 * 264 + * Then it checks actual memory usages and expects that: 255 265 * A/B memory.current ~= 50M 256 - * A/B/C memory.current ~= 33M 257 - * A/B/D memory.current ~= 17M 258 - * A/B/F memory.current ~= 0 266 + * A/B/C memory.current ~= 29M 267 + * A/B/D memory.current ~= 21M 268 + * A/B/E memory.current ~= 0 269 + * A/B/F memory.current = 0 270 + * (for origin of the numbers, see model in memcg_protection.m.) 259 271 * 260 272 * After that it tries to allocate more than there is 261 - * unprotected memory in A available, and checks 262 - * checks that memory.min protects pagecache even 263 - * in this case. 273 + * unprotected memory in A available, and checks that: 274 + * a) memory.min protects pagecache even in this case, 275 + * b) memory.low allows reclaiming page cache with low events. 
264 276 */ 265 - static int test_memcg_min(const char *root) 277 + static int test_memcg_protection(const char *root, bool min) 266 278 { 267 - int ret = KSFT_FAIL; 279 + int ret = KSFT_FAIL, rc; 268 280 char *parent[3] = {NULL}; 269 281 char *children[4] = {NULL}; 282 + const char *attribute = min ? "memory.min" : "memory.low"; 270 283 long c[4]; 271 284 int i, attempts; 272 285 int fd; ··· 296 297 if (cg_create(parent[0])) 297 298 goto cleanup; 298 299 299 - if (cg_read_long(parent[0], "memory.min")) { 300 - ret = KSFT_SKIP; 300 + if (cg_read_long(parent[0], attribute)) { 301 + /* No memory.min on older kernels is fine */ 302 + if (min) 303 + ret = KSFT_SKIP; 301 304 goto cleanup; 302 305 } 303 306 ··· 336 335 (void *)(long)fd); 337 336 } 338 337 339 - if (cg_write(parent[0], "memory.min", "50M")) 338 + if (cg_write(parent[1], attribute, "50M")) 340 339 goto cleanup; 341 - if (cg_write(parent[1], "memory.min", "50M")) 340 + if (cg_write(children[0], attribute, "75M")) 342 341 goto cleanup; 343 - if (cg_write(children[0], "memory.min", "75M")) 342 + if (cg_write(children[1], attribute, "25M")) 344 343 goto cleanup; 345 - if (cg_write(children[1], "memory.min", "25M")) 344 + if (cg_write(children[2], attribute, "0")) 346 345 goto cleanup; 347 - if (cg_write(children[2], "memory.min", "0")) 348 - goto cleanup; 349 - if (cg_write(children[3], "memory.min", "500M")) 346 + if (cg_write(children[3], attribute, "500M")) 350 347 goto cleanup; 351 348 352 349 attempts = 0; ··· 364 365 for (i = 0; i < ARRAY_SIZE(children); i++) 365 366 c[i] = cg_read_long(children[i], "memory.current"); 366 367 367 - if (!values_close(c[0], MB(33), 10)) 368 + if (!values_close(c[0], MB(29), 10)) 368 369 goto cleanup; 369 370 370 - if (!values_close(c[1], MB(17), 10)) 371 - goto cleanup; 372 - 373 - if (c[3] != 0) 374 - goto cleanup; 375 - 376 - if (!cg_run(parent[2], alloc_anon, (void *)MB(170))) 377 - goto cleanup; 378 - 379 - if (!values_close(cg_read_long(parent[1], "memory.current"), 
MB(50), 3)) 380 - goto cleanup; 381 - 382 - ret = KSFT_PASS; 383 - 384 - cleanup: 385 - for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) { 386 - if (!children[i]) 387 - continue; 388 - 389 - cg_destroy(children[i]); 390 - free(children[i]); 391 - } 392 - 393 - for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) { 394 - if (!parent[i]) 395 - continue; 396 - 397 - cg_destroy(parent[i]); 398 - free(parent[i]); 399 - } 400 - close(fd); 401 - return ret; 402 - } 403 - 404 - /* 405 - * First, this test creates the following hierarchy: 406 - * A memory.low = 50M, memory.max = 200M 407 - * A/B memory.low = 50M, memory.current = 50M 408 - * A/B/C memory.low = 75M, memory.current = 50M 409 - * A/B/D memory.low = 25M, memory.current = 50M 410 - * A/B/E memory.low = 0, memory.current = 50M 411 - * A/B/F memory.low = 500M, memory.current = 0 412 - * 413 - * Usages are pagecache. 414 - * Then it creates A/G an creates a significant 415 - * memory pressure in it. 416 - * 417 - * Then it checks actual memory usages and expects that: 418 - * A/B memory.current ~= 50M 419 - * A/B/ memory.current ~= 33M 420 - * A/B/D memory.current ~= 17M 421 - * A/B/F memory.current ~= 0 422 - * 423 - * After that it tries to allocate more than there is 424 - * unprotected memory in A available, 425 - * and checks low and oom events in memory.events. 
426 - */ 427 - static int test_memcg_low(const char *root) 428 - { 429 - int ret = KSFT_FAIL; 430 - char *parent[3] = {NULL}; 431 - char *children[4] = {NULL}; 432 - long low, oom; 433 - long c[4]; 434 - int i; 435 - int fd; 436 - 437 - fd = get_temp_fd(); 438 - if (fd < 0) 439 - goto cleanup; 440 - 441 - parent[0] = cg_name(root, "memcg_test_0"); 442 - if (!parent[0]) 443 - goto cleanup; 444 - 445 - parent[1] = cg_name(parent[0], "memcg_test_1"); 446 - if (!parent[1]) 447 - goto cleanup; 448 - 449 - parent[2] = cg_name(parent[0], "memcg_test_2"); 450 - if (!parent[2]) 451 - goto cleanup; 452 - 453 - if (cg_create(parent[0])) 454 - goto cleanup; 455 - 456 - if (cg_read_long(parent[0], "memory.low")) 457 - goto cleanup; 458 - 459 - if (cg_write(parent[0], "cgroup.subtree_control", "+memory")) 460 - goto cleanup; 461 - 462 - if (cg_write(parent[0], "memory.max", "200M")) 463 - goto cleanup; 464 - 465 - if (cg_write(parent[0], "memory.swap.max", "0")) 466 - goto cleanup; 467 - 468 - if (cg_create(parent[1])) 469 - goto cleanup; 470 - 471 - if (cg_write(parent[1], "cgroup.subtree_control", "+memory")) 472 - goto cleanup; 473 - 474 - if (cg_create(parent[2])) 475 - goto cleanup; 476 - 477 - for (i = 0; i < ARRAY_SIZE(children); i++) { 478 - children[i] = cg_name_indexed(parent[1], "child_memcg", i); 479 - if (!children[i]) 480 - goto cleanup; 481 - 482 - if (cg_create(children[i])) 483 - goto cleanup; 484 - 485 - if (i > 2) 486 - continue; 487 - 488 - if (cg_run(children[i], alloc_pagecache_50M, (void *)(long)fd)) 489 - goto cleanup; 490 - } 491 - 492 - if (cg_write(parent[0], "memory.low", "50M")) 493 - goto cleanup; 494 - if (cg_write(parent[1], "memory.low", "50M")) 495 - goto cleanup; 496 - if (cg_write(children[0], "memory.low", "75M")) 497 - goto cleanup; 498 - if (cg_write(children[1], "memory.low", "25M")) 499 - goto cleanup; 500 - if (cg_write(children[2], "memory.low", "0")) 501 - goto cleanup; 502 - if (cg_write(children[3], "memory.low", "500M")) 503 - goto 
cleanup; 504 - 505 - if (cg_run(parent[2], alloc_anon, (void *)MB(148))) 506 - goto cleanup; 507 - 508 - if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3)) 509 - goto cleanup; 510 - 511 - for (i = 0; i < ARRAY_SIZE(children); i++) 512 - c[i] = cg_read_long(children[i], "memory.current"); 513 - 514 - if (!values_close(c[0], MB(33), 10)) 515 - goto cleanup; 516 - 517 - if (!values_close(c[1], MB(17), 10)) 371 + if (!values_close(c[1], MB(21), 10)) 518 372 goto cleanup; 519 373 520 374 if (c[3] != 0) 521 375 goto cleanup; 522 376 523 - if (cg_run(parent[2], alloc_anon, (void *)MB(166))) { 377 + rc = cg_run(parent[2], alloc_anon, (void *)MB(170)); 378 + if (min && !rc) 379 + goto cleanup; 380 + else if (!min && rc) { 524 381 fprintf(stderr, 525 382 "memory.low prevents from allocating anon memory\n"); 526 383 goto cleanup; 527 384 } 528 385 386 + if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3)) 387 + goto cleanup; 388 + 389 + if (min) { 390 + ret = KSFT_PASS; 391 + goto cleanup; 392 + } 393 + 529 394 for (i = 0; i < ARRAY_SIZE(children); i++) { 530 - int no_low_events_index = has_recursiveprot ? 
2 : 1; 395 + int no_low_events_index = 1; 396 + long low, oom; 531 397 532 398 oom = cg_read_key_long(children[i], "memory.events", "oom "); 533 399 low = cg_read_key_long(children[i], "memory.events", "low "); ··· 426 562 } 427 563 close(fd); 428 564 return ret; 565 + } 566 + 567 + static int test_memcg_min(const char *root) 568 + { 569 + return test_memcg_protection(root, true); 570 + } 571 + 572 + static int test_memcg_low(const char *root) 573 + { 574 + return test_memcg_protection(root, false); 429 575 } 430 576 431 577 static int alloc_pagecache_max_30M(const char *cgroup, void *arg) ··· 1115 1241 if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0) 1116 1242 goto cleanup; 1117 1243 1118 - if (cg_read_key_long(parent, "memory.events", "oom_kill ") <= 0) 1244 + parent_oom_events = cg_read_key_long( 1245 + parent, "memory.events", "oom_kill "); 1246 + /* 1247 + * If memory_localevents is not enabled (the default), the parent should 1248 + * count OOM events in its children groups. Otherwise, it should not 1249 + * have observed any events. 1250 + */ 1251 + if (has_localevents && parent_oom_events != 0) 1252 + goto cleanup; 1253 + else if (!has_localevents && parent_oom_events <= 0) 1119 1254 goto cleanup; 1120 1255 1121 1256 ret = KSFT_PASS; ··· 1232 1349 if (!cg_run(memcg, alloc_anon, (void *)MB(100))) 1233 1350 goto cleanup; 1234 1351 1235 - parent_oom_events = cg_read_key_long( 1236 - parent, "memory.events", "oom_kill "); 1237 - /* 1238 - * If memory_localevents is not enabled (the default), the parent should 1239 - * count OOM events in its children groups. Otherwise, it should not 1240 - * have observed any events. 
1241 - */ 1242 - if ((has_localevents && parent_oom_events == 0) || 1243 - parent_oom_events > 0) 1244 - ret = KSFT_PASS; 1352 + if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3) 1353 + goto cleanup; 1245 1354 1246 1355 if (kill(safe_pid, SIGKILL)) 1247 1356 goto cleanup; 1357 + 1358 + ret = KSFT_PASS; 1248 1359 1249 1360 cleanup: 1250 1361 if (memcg)