Sync mm-stable with mm-hotfixes-stable to pick up dependent patches

+1

.mailmap

··· 130 130 Douglas Gilbert <dougg@torque.net> 131 131 Ed L. Cashin <ecashin@coraid.com> 132 132 Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com> 133 + Eugen Hristev <eugen.hristev@collabora.com> <eugen.hristev@microchip.com> 133 134 Evgeniy Polyakov <johnpol@2ka.mipt.ru> 134 135 Ezequiel Garcia <ezequiel@vanguardiasur.com.ar> <ezequiel@collabora.com> 135 136 Felipe W Damasio <felipewd@terra.com.br>

+6 -9

Documentation/admin-guide/cgroup-v2.rst

··· 1245 1245 This is a simple interface to trigger memory reclaim in the 1246 1246 target cgroup. 1247 1247 1248 - This file accepts a string which contains the number of bytes to 1249 - reclaim. 1248 + This file accepts a single key, the number of bytes to reclaim. 1249 + No nested keys are currently supported. 1250 1250 1251 1251 Example:: 1252 1252 1253 1253 echo "1G" > memory.reclaim 1254 + 1255 + The interface can be later extended with nested keys to 1256 + configure the reclaim behavior. For example, specify the 1257 + type of memory to reclaim from (anon, file, ..). 1254 1258 1255 1259 Please note that the kernel can over or under reclaim from 1256 1260 the target cgroup. If less bytes are reclaimed than the ··· 1266 1262 the memory reclaim normally is not exercised in this case. 1267 1263 This means that the networking layer will not adapt based on 1268 1264 reclaim induced by memory.reclaim. 1269 - 1270 - This file also allows the user to specify the nodes to reclaim from, 1271 - via the 'nodes=' key, for example:: 1272 - 1273 - echo "1G nodes=0,1" > memory.reclaim 1274 - 1275 - The above instructs the kernel to reclaim memory from nodes 0,1. 1276 1265 1277 1266 memory.peak 1278 1267 A read-only single value file which exists on non-root

+5 -2

arch/ia64/kernel/sys_ia64.c

··· 170 170 asmlinkage long 171 171 ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp) 172 172 { 173 + struct timespec64 rtn_tp; 174 + s64 tick_ns; 175 + 173 176 /* 174 177 * ia64's clock_gettime() syscall is implemented as a vdso call 175 178 * fsys_clock_gettime(). Currently it handles only ··· 188 185 switch (which_clock) { 189 186 case CLOCK_REALTIME: 190 187 case CLOCK_MONOTONIC: 191 - s64 tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); 192 - struct timespec64 rtn_tp = ns_to_timespec64(tick_ns); 188 + tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); 189 + rtn_tp = ns_to_timespec64(tick_ns); 193 190 return put_timespec64(&rtn_tp, tp); 194 191 } 195 192

+1

arch/sh/kernel/vmlinux.lds.S

··· 4 4 * Written by Niibe Yutaka and Paul Mundt 5 5 */ 6 6 OUTPUT_ARCH(sh) 7 + #define RUNTIME_DISCARD_EXIT 7 8 #include <asm/thread_info.h> 8 9 #include <asm/cache.h> 9 10 #include <asm/vmlinux.lds.h>

+1 -5

drivers/of/fdt.c

··· 26 26 #include <linux/serial_core.h> 27 27 #include <linux/sysfs.h> 28 28 #include <linux/random.h> 29 - #include <linux/kmemleak.h> 30 29 31 30 #include <asm/setup.h> /* for COMMAND_LINE_SIZE */ 32 31 #include <asm/page.h> ··· 524 525 size = dt_mem_next_cell(dt_root_size_cells, &prop); 525 526 526 527 if (size && 527 - early_init_dt_reserve_memory(base, size, nomap) == 0) { 528 + early_init_dt_reserve_memory(base, size, nomap) == 0) 528 529 pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", 529 530 uname, &base, (unsigned long)(size / SZ_1M)); 530 - if (!nomap) 531 - kmemleak_alloc_phys(base, size, 0); 532 - } 533 531 else 534 532 pr_err("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", 535 533 uname, &base, (unsigned long)(size / SZ_1M));

+1 -1

fs/freevxfs/Kconfig

··· 8 8 of SCO UnixWare (and possibly others) and optionally available 9 9 for Sunsoft Solaris, HP-UX and many other operating systems. However 10 10 these particular OS implementations of vxfs may differ in on-disk 11 - data endianess and/or superblock offset. The vxfs module has been 11 + data endianness and/or superblock offset. The vxfs module has been 12 12 tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.) 13 13 Currently only readonly access is supported and VxFX versions 14 14 2, 3 and 4. Tests were performed with HP-UX VxFS version 3.

+1 -3

fs/proc/task_mmu.c

··· 745 745 page = pfn_swap_entry_to_page(swpent); 746 746 } 747 747 if (page) { 748 - int mapcount = page_mapcount(page); 749 - 750 - if (mapcount >= 2) 748 + if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte)) 751 749 mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); 752 750 else 753 751 mss->private_hugetlb += huge_page_size(hstate_vma(vma));

+1 -1

fs/squashfs/squashfs_fs.h

··· 183 183 #define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ 184 184 sizeof(u64)) 185 185 /* xattr id lookup table defines */ 186 - #define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id)) 186 + #define SQUASHFS_XATTR_BYTES(A) (((u64) (A)) * sizeof(struct squashfs_xattr_id)) 187 187 188 188 #define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \ 189 189 SQUASHFS_METADATA_SIZE)

+1 -1

fs/squashfs/squashfs_fs_sb.h

··· 63 63 long long bytes_used; 64 64 unsigned int inodes; 65 65 unsigned int fragments; 66 - int xattr_ids; 66 + unsigned int xattr_ids; 67 67 unsigned int ids; 68 68 bool panic_on_errors; 69 69 const struct squashfs_decompressor_thread_ops *thread_ops;

+2 -2

fs/squashfs/xattr.h

··· 10 10 11 11 #ifdef CONFIG_SQUASHFS_XATTR 12 12 extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, 13 - u64 *, int *); 13 + u64 *, unsigned int *); 14 14 extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, 15 15 unsigned int *, unsigned long long *); 16 16 #else 17 17 static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, 18 - u64 start, u64 *xattr_table_start, int *xattr_ids) 18 + u64 start, u64 *xattr_table_start, unsigned int *xattr_ids) 19 19 { 20 20 struct squashfs_xattr_id_table *id_table; 21 21

+2 -2

fs/squashfs/xattr_id.c

··· 56 56 * Read uncompressed xattr id lookup table indexes from disk into memory 57 57 */ 58 58 __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, 59 - u64 *xattr_table_start, int *xattr_ids) 59 + u64 *xattr_table_start, unsigned int *xattr_ids) 60 60 { 61 61 struct squashfs_sb_info *msblk = sb->s_fs_info; 62 62 unsigned int len, indexes; ··· 76 76 /* Sanity check values */ 77 77 78 78 /* there is always at least one xattr id */ 79 - if (*xattr_ids == 0) 79 + if (*xattr_ids <= 0) 80 80 return ERR_PTR(-EINVAL); 81 81 82 82 len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);

+2 -2

include/linux/highmem-internal.h

··· 200 200 static inline void __kunmap_local(const void *addr) 201 201 { 202 202 #ifdef ARCH_HAS_FLUSH_ON_KUNMAP 203 - kunmap_flush_on_unmap(addr); 203 + kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); 204 204 #endif 205 205 } 206 206 ··· 227 227 static inline void __kunmap_atomic(const void *addr) 228 228 { 229 229 #ifdef ARCH_HAS_FLUSH_ON_KUNMAP 230 - kunmap_flush_on_unmap(addr); 230 + kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); 231 231 #endif 232 232 pagefault_enable(); 233 233 if (IS_ENABLED(CONFIG_PREEMPT_RT))

+13

include/linux/hugetlb.h

··· 8 8 #include <linux/fs.h> 9 9 #include <linux/hugetlb_inline.h> 10 10 #include <linux/cgroup.h> 11 + #include <linux/page_ref.h> 11 12 #include <linux/list.h> 12 13 #include <linux/kref.h> 13 14 #include <linux/pgtable.h> ··· 1223 1222 #else 1224 1223 static inline __init void hugetlb_cma_reserve(int order) 1225 1224 { 1225 + } 1226 + #endif 1227 + 1228 + #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 1229 + static inline bool hugetlb_pmd_shared(pte_t *pte) 1230 + { 1231 + return page_count(virt_to_page(pte)) > 1; 1232 + } 1233 + #else 1234 + static inline bool hugetlb_pmd_shared(pte_t *pte) 1235 + { 1236 + return false; 1226 1237 } 1227 1238 #endif 1228 1239

+4 -1

include/linux/memcontrol.h

··· 1688 1688 static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, 1689 1689 struct bdi_writeback *wb) 1690 1690 { 1691 + struct mem_cgroup *memcg; 1692 + 1691 1693 if (mem_cgroup_disabled()) 1692 1694 return; 1693 1695 1694 - if (unlikely(&folio_memcg(folio)->css != wb->memcg_css)) 1696 + memcg = folio_memcg(folio); 1697 + if (unlikely(memcg && &memcg->css != wb->memcg_css)) 1695 1698 mem_cgroup_track_foreign_dirty_slowpath(folio, wb); 1696 1699 } 1697 1700

+1 -2

include/linux/swap.h

··· 418 418 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, 419 419 unsigned long nr_pages, 420 420 gfp_t gfp_mask, 421 - unsigned int reclaim_options, 422 - nodemask_t *nodemask); 421 + unsigned int reclaim_options); 423 422 extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, 424 423 gfp_t gfp_mask, bool noswap, 425 424 pg_data_t *pgdat,

+2 -1

lib/Kconfig.debug

··· 754 754 select KALLSYMS 755 755 select CRC32 756 756 select STACKDEPOT 757 + select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF 757 758 help 758 759 Say Y here if you want to enable the memory leak 759 760 detector. The memory allocation/freeing is traced in a way ··· 1208 1207 depends on DEBUG_KERNEL && PROC_FS 1209 1208 default y 1210 1209 help 1211 - If you say Y here, the /proc/sched_debug file will be provided 1210 + If you say Y here, the /sys/kernel/debug/sched file will be provided 1212 1211 that can help debug the scheduler. The runtime overhead of this 1213 1212 option is minimal. 1214 1213

+11 -11

lib/maple_tree.c

··· 667 667 unsigned char piv) 668 668 { 669 669 struct maple_node *node = mte_to_node(mn); 670 + enum maple_type type = mte_node_type(mn); 670 671 671 - if (piv >= mt_pivots[piv]) { 672 + if (piv >= mt_pivots[type]) { 672 673 WARN_ON(1); 673 674 return 0; 674 675 } 675 - switch (mte_node_type(mn)) { 676 + switch (type) { 676 677 case maple_arange_64: 677 678 return node->ma64.pivot[piv]; 678 679 case maple_range_64: ··· 4877 4876 unsigned long *pivots, *gaps; 4878 4877 void __rcu **slots; 4879 4878 unsigned long gap = 0; 4880 - unsigned long max, min, index; 4879 + unsigned long max, min; 4881 4880 unsigned char offset; 4882 4881 4883 4882 if (unlikely(mas_is_err(mas))) ··· 4899 4898 min = mas_safe_min(mas, pivots, --offset); 4900 4899 4901 4900 max = mas_safe_pivot(mas, pivots, offset, type); 4902 - index = mas->index; 4903 - while (index <= max) { 4901 + while (mas->index <= max) { 4904 4902 gap = 0; 4905 4903 if (gaps) 4906 4904 gap = gaps[offset]; ··· 4930 4930 min = mas_safe_min(mas, pivots, offset); 4931 4931 } 4932 4932 4933 - if (unlikely(index > max)) { 4934 - mas_set_err(mas, -EBUSY); 4935 - return false; 4936 - } 4933 + if (unlikely((mas->index > max) || (size - 1 > max - mas->index))) 4934 + goto no_space; 4937 4935 4938 4936 if (unlikely(ma_is_leaf(type))) { 4939 4937 mas->offset = offset; ··· 4948 4950 return false; 4949 4951 4950 4952 ascend: 4951 - if (mte_is_root(mas->node)) 4952 - mas_set_err(mas, -EBUSY); 4953 + if (!mte_is_root(mas->node)) 4954 + return false; 4953 4955 4956 + no_space: 4957 + mas_set_err(mas, -EBUSY); 4954 4958 return false; 4955 4959 } 4956 4960

+89

lib/test_maple_tree.c

··· 2517 2517 mt_set_non_kernel(0); 2518 2518 } 2519 2519 2520 + static noinline void check_empty_area_window(struct maple_tree *mt) 2521 + { 2522 + unsigned long i, nr_entries = 20; 2523 + MA_STATE(mas, mt, 0, 0); 2524 + 2525 + for (i = 1; i <= nr_entries; i++) 2526 + mtree_store_range(mt, i*10, i*10 + 9, 2527 + xa_mk_value(i), GFP_KERNEL); 2528 + 2529 + /* Create another hole besides the one at 0 */ 2530 + mtree_store_range(mt, 160, 169, NULL, GFP_KERNEL); 2531 + 2532 + /* Check lower bounds that don't fit */ 2533 + rcu_read_lock(); 2534 + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 10) != -EBUSY); 2535 + 2536 + mas_reset(&mas); 2537 + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 6, 90, 5) != -EBUSY); 2538 + 2539 + /* Check lower bound that does fit */ 2540 + mas_reset(&mas); 2541 + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 5) != 0); 2542 + MT_BUG_ON(mt, mas.index != 5); 2543 + MT_BUG_ON(mt, mas.last != 9); 2544 + rcu_read_unlock(); 2545 + 2546 + /* Check one gap that doesn't fit and one that does */ 2547 + rcu_read_lock(); 2548 + mas_reset(&mas); 2549 + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 217, 9) != 0); 2550 + MT_BUG_ON(mt, mas.index != 161); 2551 + MT_BUG_ON(mt, mas.last != 169); 2552 + 2553 + /* Check one gap that does fit above the min */ 2554 + mas_reset(&mas); 2555 + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 3) != 0); 2556 + MT_BUG_ON(mt, mas.index != 216); 2557 + MT_BUG_ON(mt, mas.last != 218); 2558 + 2559 + /* Check size that doesn't fit any gap */ 2560 + mas_reset(&mas); 2561 + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 16) != -EBUSY); 2562 + 2563 + /* 2564 + * Check size that doesn't fit the lower end of the window but 2565 + * does fit the gap 2566 + */ 2567 + mas_reset(&mas); 2568 + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 167, 200, 4) != -EBUSY); 2569 + 2570 + /* 2571 + * Check size that doesn't fit the upper end of the window but 2572 + * does fit the gap 2573 + */ 2574 + mas_reset(&mas); 2575 + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 162, 4) != -EBUSY); 2576 + 2577 + /* Check mas_empty_area forward */ 2578 + mas_reset(&mas); 2579 + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 9) != 0); 2580 + MT_BUG_ON(mt, mas.index != 0); 2581 + MT_BUG_ON(mt, mas.last != 8); 2582 + 2583 + mas_reset(&mas); 2584 + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 4) != 0); 2585 + MT_BUG_ON(mt, mas.index != 0); 2586 + MT_BUG_ON(mt, mas.last != 3); 2587 + 2588 + mas_reset(&mas); 2589 + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 11) != -EBUSY); 2590 + 2591 + mas_reset(&mas); 2592 + MT_BUG_ON(mt, mas_empty_area(&mas, 5, 100, 6) != -EBUSY); 2593 + 2594 + mas_reset(&mas); 2595 + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 8, 10) != -EBUSY); 2596 + 2597 + mas_reset(&mas); 2598 + mas_empty_area(&mas, 100, 165, 3); 2599 + 2600 + mas_reset(&mas); 2601 + MT_BUG_ON(mt, mas_empty_area(&mas, 100, 163, 6) != -EBUSY); 2602 + rcu_read_unlock(); 2603 + } 2604 + 2520 2605 static DEFINE_MTREE(tree); 2521 2606 static int maple_tree_seed(void) 2522 2607 { ··· 2848 2763 2849 2764 mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); 2850 2765 check_bnode_min_spanning(&tree); 2766 + mtree_destroy(&tree); 2767 + 2768 + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); 2769 + check_empty_area_window(&tree); 2851 2770 mtree_destroy(&tree); 2852 2771 2853 2772 #if defined(BENCH)

+21 -1

mm/khugepaged.c

··· 847 847 return SCAN_SUCCEED; 848 848 } 849 849 850 + /* 851 + * See pmd_trans_unstable() for how the result may change out from 852 + * underneath us, even if we hold mmap_lock in read. 853 + */ 850 854 static int find_pmd_or_thp_or_none(struct mm_struct *mm, 851 855 unsigned long address, 852 856 pmd_t **pmd) ··· 869 865 #endif 870 866 if (pmd_none(pmde)) 871 867 return SCAN_PMD_NONE; 868 + if (!pmd_present(pmde)) 869 + return SCAN_PMD_NULL; 872 870 if (pmd_trans_huge(pmde)) 873 871 return SCAN_PMD_MAPPED; 872 + if (pmd_devmap(pmde)) 873 + return SCAN_PMD_NULL; 874 874 if (pmd_bad(pmde)) 875 875 return SCAN_PMD_NULL; 876 876 return SCAN_SUCCEED; ··· 1650 1642 * has higher cost too. It would also probably require locking 1651 1643 * the anon_vma. 1652 1644 */ 1653 - if (vma->anon_vma) { 1645 + if (READ_ONCE(vma->anon_vma)) { 1654 1646 result = SCAN_PAGE_ANON; 1655 1647 goto next; 1656 1648 } ··· 1678 1670 result = SCAN_PTE_MAPPED_HUGEPAGE; 1679 1671 if ((cc->is_khugepaged || is_target) && 1680 1672 mmap_write_trylock(mm)) { 1673 + /* 1674 + * Re-check whether we have an ->anon_vma, because 1675 + * collapse_and_free_pmd() requires that either no 1676 + * ->anon_vma exists or the anon_vma is locked. 1677 + * We already checked ->anon_vma above, but that check 1678 + * is racy because ->anon_vma can be populated under the 1679 + * mmap lock in read mode. 1680 + */ 1681 + if (vma->anon_vma) { 1682 + result = SCAN_PAGE_ANON; 1683 + goto unlock_next; 1684 + } 1681 1685 /* 1682 1686 * When a vma is registered with uffd-wp, we can't 1683 1687 * recycle the pmd pgtable because there can be pte

+3 -2

mm/kmemleak.c

··· 2070 2070 return -EINVAL; 2071 2071 if (strcmp(str, "off") == 0) 2072 2072 kmemleak_disable(); 2073 - else if (strcmp(str, "on") == 0) 2073 + else if (strcmp(str, "on") == 0) { 2074 2074 kmemleak_skip_disable = 1; 2075 + stack_depot_want_early_init(); 2076 + } 2075 2077 else 2076 2078 return -EINVAL; 2077 2079 return 0; ··· 2095 2093 if (kmemleak_error) 2096 2094 return; 2097 2095 2098 - stack_depot_init(); 2099 2096 jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); 2100 2097 jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); 2101 2098

+13 -54

mm/memcontrol.c

··· 63 63 #include <linux/resume_user_mode.h> 64 64 #include <linux/psi.h> 65 65 #include <linux/seq_buf.h> 66 - #include <linux/parser.h> 67 66 #include "internal.h" 68 67 #include <net/sock.h> 69 68 #include <net/ip.h> ··· 2402 2403 psi_memstall_enter(&pflags); 2403 2404 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, 2404 2405 gfp_mask, 2405 - MEMCG_RECLAIM_MAY_SWAP, 2406 - NULL); 2406 + MEMCG_RECLAIM_MAY_SWAP); 2407 2407 psi_memstall_leave(&pflags); 2408 2408 } while ((memcg = parent_mem_cgroup(memcg)) && 2409 2409 !mem_cgroup_is_root(memcg)); ··· 2693 2695 2694 2696 psi_memstall_enter(&pflags); 2695 2697 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2696 - gfp_mask, reclaim_options, 2697 - NULL); 2698 + gfp_mask, reclaim_options); 2698 2699 psi_memstall_leave(&pflags); 2699 2700 2700 2701 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) ··· 3513 3516 } 3514 3517 3515 3518 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 3516 - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, 3517 - NULL)) { 3519 + memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) { 3518 3520 ret = -EBUSY; 3519 3521 break; 3520 3522 } ··· 3627 3631 return -EINTR; 3628 3632 3629 3633 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 3630 - MEMCG_RECLAIM_MAY_SWAP, 3631 - NULL)) 3634 + MEMCG_RECLAIM_MAY_SWAP)) 3632 3635 nr_retries--; 3633 3636 } 3634 3637 ··· 6468 6473 } 6469 6474 6470 6475 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 6471 - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, 6472 - NULL); 6476 + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); 6473 6477 6474 6478 if (!reclaimed && !nr_retries--) 6475 6479 break; ··· 6517 6523 6518 6524 if (nr_reclaims) { 6519 6525 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 6520 - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, 6521 - NULL)) 6526 + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) 6522 6527 nr_reclaims--; 6523 6528 continue; 6524 6529 } ··· 6640 6647 return nbytes; 6641 6648 } 6642 6649 6643 - enum { 6644 - MEMORY_RECLAIM_NODES = 0, 6645 - MEMORY_RECLAIM_NULL, 6646 - }; 6647 - 6648 - static const match_table_t if_tokens = { 6649 - { MEMORY_RECLAIM_NODES, "nodes=%s" }, 6650 - { MEMORY_RECLAIM_NULL, NULL }, 6651 - }; 6652 - 6653 6650 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, 6654 6651 size_t nbytes, loff_t off) 6655 6652 { 6656 6653 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6657 6654 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 6658 6655 unsigned long nr_to_reclaim, nr_reclaimed = 0; 6659 - unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP | 6660 - MEMCG_RECLAIM_PROACTIVE; 6661 - char *old_buf, *start; 6662 - substring_t args[MAX_OPT_ARGS]; 6663 - int token; 6664 - char value[256]; 6665 - nodemask_t nodemask = NODE_MASK_ALL; 6656 + unsigned int reclaim_options; 6657 + int err; 6666 6658 6667 6659 buf = strstrip(buf); 6660 + err = page_counter_memparse(buf, "", &nr_to_reclaim); 6661 + if (err) 6662 + return err; 6668 6663 6669 - old_buf = buf; 6670 - nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; 6671 - if (buf == old_buf) 6672 - return -EINVAL; 6673 - 6674 - buf = strstrip(buf); 6675 - 6676 - while ((start = strsep(&buf, " ")) != NULL) { 6677 - if (!strlen(start)) 6678 - continue; 6679 - token = match_token(start, if_tokens, args); 6680 - match_strlcpy(value, args, sizeof(value)); 6681 - switch (token) { 6682 - case MEMORY_RECLAIM_NODES: 6683 - if (nodelist_parse(value, nodemask) < 0) 6684 - return -EINVAL; 6685 - break; 6686 - default: 6687 - return -EINVAL; 6688 - } 6689 - } 6690 - 6664 + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; 6691 6665 while (nr_reclaimed < nr_to_reclaim) { 6692 6666 unsigned long reclaimed; 6693 6667 ··· 6671 6711 6672 6712 reclaimed = try_to_free_mem_cgroup_pages(memcg, 6673 6713 nr_to_reclaim - nr_reclaimed, 6674 - GFP_KERNEL, reclaim_options, 6675 - &nodemask); 6714 + GFP_KERNEL, reclaim_options); 6676 6715 6677 6716 if (!reclaimed && !nr_retries--) 6678 6717 return -EAGAIN;

+2 -1

mm/mempolicy.c

··· 600 600 601 601 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ 602 602 if (flags & (MPOL_MF_MOVE_ALL) || 603 - (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) { 603 + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && 604 + !hugetlb_pmd_shared(pte))) { 604 605 if (isolate_hugetlb(page, qp->pagelist) && 605 606 (flags & MPOL_MF_STRICT)) 606 607 /*

+19 -6

mm/mremap.c

··· 1027 1027 } 1028 1028 1029 1029 /* 1030 - * Function vma_merge() is called on the extension we are adding to 1031 - * the already existing vma, vma_merge() will merge this extension with 1032 - * the already existing vma (expand operation itself) and possibly also 1033 - * with the next vma if it becomes adjacent to the expanded vma and 1034 - * otherwise compatible. 1030 + * Function vma_merge() is called on the extension we 1031 + * are adding to the already existing vma, vma_merge() 1032 + * will merge this extension with the already existing 1033 + * vma (expand operation itself) and possibly also with 1034 + * the next vma if it becomes adjacent to the expanded 1035 + * vma and otherwise compatible. 1036 + * 1037 + * However, vma_merge() can currently fail due to 1038 + * is_mergeable_vma() check for vm_ops->close (see the 1039 + * comment there). Yet this should not prevent vma 1040 + * expanding, so perform a simple expand for such vma. 1041 + * Ideally the check for close op should be only done 1042 + * when a vma would be actually removed due to a merge. 1035 1043 */ 1036 - vma = vma_merge(mm, vma, extension_start, extension_end, 1044 + if (!vma->vm_ops || !vma->vm_ops->close) { 1045 + vma = vma_merge(mm, vma, extension_start, extension_end, 1037 1046 vma->vm_flags, vma->anon_vma, vma->vm_file, 1038 1047 extension_pgoff, vma_policy(vma), 1039 1048 vma->vm_userfaultfd_ctx, anon_vma_name(vma)); 1049 + } else if (vma_adjust(vma, vma->vm_start, addr + new_len, 1050 + vma->vm_pgoff, NULL)) { 1051 + vma = NULL; 1052 + } 1040 1053 if (!vma) { 1041 1054 vm_unacct_memory(pages); 1042 1055 ret = -ENOMEM;

+1

mm/swapfile.c

··· 1100 1100 goto check_out; 1101 1101 pr_debug("scan_swap_map of si %d failed to find offset\n", 1102 1102 si->type); 1103 + cond_resched(); 1103 1104 1104 1105 spin_lock(&swap_avail_lock); 1105 1106 nextsi:

+5 -4

mm/vmscan.c

··· 3335 3335 if (mem_cgroup_disabled()) 3336 3336 return; 3337 3337 3338 + /* migration can happen before addition */ 3339 + if (!mm->lru_gen.memcg) 3340 + return; 3341 + 3338 3342 rcu_read_lock(); 3339 3343 memcg = mem_cgroup_from_task(task); 3340 3344 rcu_read_unlock(); 3341 3345 if (memcg == mm->lru_gen.memcg) 3342 3346 return; 3343 3347 3344 - VM_WARN_ON_ONCE(!mm->lru_gen.memcg); 3345 3348 VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); 3346 3349 3347 3350 lru_gen_del_mm(mm); ··· 7025 7022 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, 7026 7023 unsigned long nr_pages, 7027 7024 gfp_t gfp_mask, 7028 - unsigned int reclaim_options, 7029 - nodemask_t *nodemask) 7025 + unsigned int reclaim_options) 7030 7026 { 7031 7027 unsigned long nr_reclaimed; 7032 7028 unsigned int noreclaim_flag; ··· 7040 7038 .may_unmap = 1, 7041 7039 .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), 7042 7040 .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), 7043 - .nodemask = nodemask, 7044 7041 }; 7045 7042 /* 7046 7043 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put

+205 -32

mm/zsmalloc.c

··· 113 113 * have room for two bit at least. 114 114 */ 115 115 #define OBJ_ALLOCATED_TAG 1 116 - #define OBJ_TAG_BITS 1 116 + 117 + #ifdef CONFIG_ZPOOL 118 + /* 119 + * The second least-significant bit in the object's header identifies if the 120 + * value stored at the header is a deferred handle from the last reclaim 121 + * attempt. 122 + * 123 + * As noted above, this is valid because we have room for two bits. 124 + */ 125 + #define OBJ_DEFERRED_HANDLE_TAG 2 126 + #define OBJ_TAG_BITS 2 127 + #define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG) 128 + #else 129 + #define OBJ_TAG_BITS 1 130 + #define OBJ_TAG_MASK OBJ_ALLOCATED_TAG 131 + #endif /* CONFIG_ZPOOL */ 132 + 117 133 #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) 118 134 #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) 119 135 ··· 238 222 * Handle of allocated object. 239 223 */ 240 224 unsigned long handle; 225 + #ifdef CONFIG_ZPOOL 226 + /* 227 + * Deferred handle of a reclaimed object. 228 + */ 229 + unsigned long deferred_handle; 230 + #endif 241 231 }; 242 232 }; 243 233 ··· 294 272 /* links the zspage to the lru list in the pool */ 295 273 struct list_head lru; 296 274 bool under_reclaim; 297 - /* list of unfreed handles whose objects have been reclaimed */ 298 - unsigned long *deferred_handles; 299 275 #endif 300 276 301 277 struct zs_pool *pool; ··· 917 897 return *(unsigned long *)handle; 918 898 } 919 899 920 - static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) 900 + static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle, 901 + int tag) 921 902 { 922 903 unsigned long handle; 923 904 struct zspage *zspage = get_zspage(page); ··· 929 908 } else 930 909 handle = *(unsigned long *)obj; 931 910 932 - if (!(handle & OBJ_ALLOCATED_TAG)) 911 + if (!(handle & tag)) 933 912 return false; 934 913 935 - *phandle = handle & ~OBJ_ALLOCATED_TAG; 914 + /* Clear all tags before returning the handle */ 915 + *phandle = handle & ~OBJ_TAG_MASK; 936 916 return true; 937 917 } 918 + 919 + static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) 920 + { 921 + return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG); 922 + } 923 + 924 + #ifdef CONFIG_ZPOOL 925 + static bool obj_stores_deferred_handle(struct page *page, void *obj, 926 + unsigned long *phandle) 927 + { 928 + return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG); 929 + } 930 + #endif 938 931 939 932 static void reset_page(struct page *page) 940 933 { ··· 981 946 } 982 947 983 948 #ifdef CONFIG_ZPOOL 949 + static unsigned long find_deferred_handle_obj(struct size_class *class, 950 + struct page *page, int *obj_idx); 951 + 984 952 /* 985 953 * Free all the deferred handles whose objects are freed in zs_free. 986 954 */ 987 - static void free_handles(struct zs_pool *pool, struct zspage *zspage) 955 + static void free_handles(struct zs_pool *pool, struct size_class *class, 956 + struct zspage *zspage) 988 957 { 989 - unsigned long handle = (unsigned long)zspage->deferred_handles; 958 + int obj_idx = 0; 959 + struct page *page = get_first_page(zspage); 960 + unsigned long handle; 990 961 991 - while (handle) { 992 - unsigned long nxt_handle = handle_to_obj(handle); 962 + while (1) { 963 + handle = find_deferred_handle_obj(class, page, &obj_idx); 964 + if (!handle) { 965 + page = get_next_page(page); 966 + if (!page) 967 + break; 968 + obj_idx = 0; 969 + continue; 970 + } 993 971 994 972 cache_free_handle(pool, handle); 995 - handle = nxt_handle; 973 + obj_idx++; 996 974 } 997 975 } 998 976 #else 999 - static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {} 977 + static inline void free_handles(struct zs_pool *pool, struct size_class *class, 978 + struct zspage *zspage) {} 1000 979 #endif 1001 980 1002 981 static void __free_zspage(struct zs_pool *pool, struct size_class *class, ··· 1028 979 VM_BUG_ON(fg != ZS_EMPTY); 1029 980 1030 981 /* Free all deferred handles from zs_free */ 1031 - free_handles(pool, zspage); 982 + free_handles(pool, class, zspage); 1032 983 1033 984 next = page = get_first_page(zspage); 1034 985 do { ··· 1116 1067 #ifdef CONFIG_ZPOOL 1117 1068 INIT_LIST_HEAD(&zspage->lru); 1118 1069 zspage->under_reclaim = false; 1119 - zspage->deferred_handles = NULL; 1120 1070 #endif 1121 1071 1122 1072 set_freeobj(zspage, 0); ··· 1616 1568 } 1617 1569 EXPORT_SYMBOL_GPL(zs_malloc); 1618 1570 1619 - static void obj_free(int class_size, unsigned long obj) 1571 + static void obj_free(int class_size, unsigned long obj, unsigned long *handle) 1620 1572 { 1621 1573 struct link_free *link; 1622 1574 struct zspage *zspage; ··· 1630 1582 zspage = get_zspage(f_page); 1631 1583 1632 1584 vaddr = kmap_atomic(f_page); 1633 - 1634 - /* Insert this object in containing zspage's freelist */ 1635 1585 link = (struct link_free *)(vaddr + f_offset); 1636 - if (likely(!ZsHugePage(zspage))) 1637 - link->next = get_freeobj(zspage) << OBJ_TAG_BITS; 1638 - else 1639 - f_page->index = 0; 1586 + 1587 + if (handle) { 1588 + #ifdef CONFIG_ZPOOL 1589 + /* Stores the (deferred) handle in the object's header */ 1590 + *handle |= OBJ_DEFERRED_HANDLE_TAG; 1591 + *handle &= ~OBJ_ALLOCATED_TAG; 1592 + 1593 + if (likely(!ZsHugePage(zspage))) 1594 + link->deferred_handle = *handle; 1595 + else 1596 + f_page->index = *handle; 1597 + #endif 1598 + } else { 1599 + /* Insert this object in containing zspage's freelist */ 1600 + if (likely(!ZsHugePage(zspage))) 1601 + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; 1602 + else 1603 + f_page->index = 0; 1604 + set_freeobj(zspage, f_objidx); 1605 + } 1606 + 1640 1607 kunmap_atomic(vaddr); 1641 - set_freeobj(zspage, f_objidx); 1642 1608 mod_zspage_inuse(zspage, -1); 1643 1609 } 1644 1610 ··· 1677 1615 zspage = get_zspage(f_page); 1678 1616 class = zspage_class(pool, zspage); 1679 1617 1680 - obj_free(class->size, obj); 1681 1618 class_stat_dec(class, OBJ_USED, 1); 1682 1619 1683 1620 #ifdef CONFIG_ZPOOL ··· 1685 1624 * Reclaim needs the handles during writeback. It'll free 1686 1625 * them along with the zspage when it's done with them. 1687 1626 * 1688 - * Record current deferred handle at the memory location 1689 - * whose address is given by handle. 1627 + * Record current deferred handle in the object's header. 1690 1628 */ 1691 - record_obj(handle, (unsigned long)zspage->deferred_handles); 1692 - zspage->deferred_handles = (unsigned long *)handle; 1629 + obj_free(class->size, obj, &handle); 1693 1630 spin_unlock(&pool->lock); 1694 1631 return; 1695 1632 } 1696 1633 #endif 1634 + obj_free(class->size, obj, NULL); 1635 + 1697 1636 fullness = fix_fullness_group(class, zspage); 1698 1637 if (fullness == ZS_EMPTY) 1699 1638 free_zspage(pool, class, zspage); ··· 1774 1713 } 1775 1714 1776 1715 /* 1777 - * Find alloced object in zspage from index object and 1716 + * Find object with a certain tag in zspage from index object and 1778 1717 * return handle. 1779 1718 */ 1780 - static unsigned long find_alloced_obj(struct size_class *class, 1781 - struct page *page, int *obj_idx) 1719 + static unsigned long find_tagged_obj(struct size_class *class, 1720 + struct page *page, int *obj_idx, int tag) 1782 1721 { 1783 1722 unsigned int offset; 1784 1723 int index = *obj_idx; ··· 1789 1728 offset += class->size * index; 1790 1729 1791 1730 while (offset < PAGE_SIZE) { 1792 - if (obj_allocated(page, addr + offset, &handle)) 1731 + if (obj_tagged(page, addr + offset, &handle, tag)) 1793 1732 break; 1794 1733 1795 1734 offset += class->size; ··· 1802 1741 1803 1742 return handle; 1804 1743 } 1744 + 1745 + /* 1746 + * Find alloced object in zspage from index object and 1747 + * return handle. 1748 + */ 1749 + static unsigned long find_alloced_obj(struct size_class *class, 1750 + struct page *page, int *obj_idx) 1751 + { 1752 + return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG); 1753 + } 1754 + 1755 + #ifdef CONFIG_ZPOOL 1756 + /* 1757 + * Find object storing a deferred handle in header in zspage from index object 1758 + * and return handle. 1759 + */ 1760 + static unsigned long find_deferred_handle_obj(struct size_class *class, 1761 + struct page *page, int *obj_idx) 1762 + { 1763 + return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG); 1764 + } 1765 + #endif 1805 1766 1806 1767 struct zs_compact_control { 1807 1768 /* Source spage for migration which could be a subpage of zspage */ ··· 1867 1784 zs_object_copy(class, free_obj, used_obj); 1868 1785 obj_idx++; 1869 1786 record_obj(handle, free_obj); 1870 - obj_free(class->size, used_obj); 1787 + obj_free(class->size, used_obj, NULL); 1871 1788 } 1872 1789 1873 1790 /* Remember last position in this iteration */ ··· 2558 2475 EXPORT_SYMBOL_GPL(zs_destroy_pool); 2559 2476 2560 2477 #ifdef CONFIG_ZPOOL 2478 + static void restore_freelist(struct zs_pool *pool, struct size_class *class, 2479 + struct zspage *zspage) 2480 + { 2481 + unsigned int obj_idx = 0; 2482 + unsigned long handle, off = 0; /* off is within-page offset */ 2483 + struct page *page = get_first_page(zspage); 2484 + struct link_free *prev_free = NULL; 2485 + void *prev_page_vaddr = NULL; 2486 + 2487 + /* in case no free object found */ 2488 + set_freeobj(zspage, (unsigned int)(-1UL)); 2489 + 2490 + while (page) { 2491 + void *vaddr = kmap_atomic(page); 2492 + struct page *next_page; 2493 + 2494 + while (off < PAGE_SIZE) { 2495 + void *obj_addr = vaddr + off; 2496 + 2497 + /* skip allocated object */ 2498 + if (obj_allocated(page, obj_addr, &handle)) { 2499 + obj_idx++; 2500 + off += class->size; 2501 + continue; 2502 + } 2503 + 2504 + /* free deferred handle from reclaim attempt */ 2505 + if (obj_stores_deferred_handle(page, obj_addr, &handle)) 2506 + cache_free_handle(pool, handle); 2507 + 2508 + if (prev_free) 2509 + prev_free->next = obj_idx << OBJ_TAG_BITS; 2510 + else /* first free object found */ 2511 + set_freeobj(zspage, obj_idx); 2512 + 2513 + prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free); 2514 + /* if last free object in a previous page, need to unmap */ 2515 + if (prev_page_vaddr) { 2516 + kunmap_atomic(prev_page_vaddr); 2517 + prev_page_vaddr = NULL; 2518 + } 2519 + 2520 + obj_idx++; 2521 + off += class->size; 2522 + } 2523 + 2524 + /* 2525 + * Handle the last (full or partial) object on this page. 2526 + */ 2527 + next_page = get_next_page(page); 2528 + if (next_page) { 2529 + if (!prev_free || prev_page_vaddr) { 2530 + /* 2531 + * There is no free object in this page, so we can safely 2532 + * unmap it. 2533 + */ 2534 + kunmap_atomic(vaddr); 2535 + } else { 2536 + /* update prev_page_vaddr since prev_free is on this page */ 2537 + prev_page_vaddr = vaddr; 2538 + } 2539 + } else { /* this is the last page */ 2540 + if (prev_free) { 2541 + /* 2542 + * Reset OBJ_TAG_BITS bit to last link to tell 2543 + * whether it's allocated object or not. 2544 + */ 2545 + prev_free->next = -1UL << OBJ_TAG_BITS; 2546 + } 2547 + 2548 + /* unmap previous page (if not done yet) */ 2549 + if (prev_page_vaddr) { 2550 + kunmap_atomic(prev_page_vaddr); 2551 + prev_page_vaddr = NULL; 2552 + } 2553 + 2554 + kunmap_atomic(vaddr); 2555 + } 2556 + 2557 + page = next_page; 2558 + off %= PAGE_SIZE; 2559 + } 2560 + } 2561 + 2561 2562 static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries) 2562 2563 { 2563 2564 int i, obj_idx, ret = 0; ··· 2725 2558 return 0; 2726 2559 } 2727 2560 2561 + /* 2562 + * Eviction fails on one of the handles, so we need to restore zspage. 2563 + * We need to rebuild its freelist (and free stored deferred handles), 2564 + * put it back to the correct size class, and add it to the LRU list. 2565 + */ 2566 + restore_freelist(pool, class, zspage); 2728 2567 putback_zspage(class, zspage); 2729 2568 list_add(&zspage->lru, &pool->lru); 2730 2569 unlock_zspage(zspage);

tools/testing/selftests/filesystems/fat/run_fat_tests.sh

-1

tools/testing/selftests/mm/hugetlb-madvise.c

··· 17 17 #include <stdio.h> 18 18 #include <unistd.h> 19 19 #include <sys/mman.h> 20 - #define __USE_GNU 21 20 #include <fcntl.h> 22 21 23 22 #define MIN_FREE_PAGES 20