Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'akpm' (patches from Andrew)

Merge updates from Andrew Morton:

- a few hot fixes

- ocfs2 updates

- almost all of -mm (slab-generic, slab, slub, kmemleak, kasan,
cleanups, debug, pagecache, memcg, gup, pagemap, memory-hotplug,
sparsemem, vmalloc, initialization, z3fold, compaction, mempolicy,
oom-kill, hugetlb, migration, thp, mmap, madvise, shmem, zswap,
zsmalloc)

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (132 commits)
mm/zsmalloc.c: fix a -Wunused-function warning
zswap: do not map same object twice
zswap: use movable memory if zpool support allocate movable memory
zpool: add malloc_support_movable to zpool_driver
shmem: fix obsolete comment in shmem_getpage_gfp()
mm/madvise: reduce code duplication in error handling paths
mm: mmap: increase sockets maximum memory size pgoff for 32bits
mm/mmap.c: refine find_vma_prev() with rb_last()
riscv: make mmap allocation top-down by default
mips: use generic mmap top-down layout and brk randomization
mips: replace arch specific way to determine 32bit task with generic version
mips: adjust brk randomization offset to fit generic version
mips: use STACK_TOP when computing mmap base address
mips: properly account for stack randomization and stack guard gap
arm: use generic mmap top-down layout and brk randomization
arm: use STACK_TOP when computing mmap base address
arm: properly account for stack randomization and stack guard gap
arm64, mm: make randomization selected by generic topdown mmap layout
arm64, mm: move generic mmap layout functions to mm
arm64: consider stack randomization for mmap base only when necessary
...

+2275 -2446
+9 -4
Documentation/ABI/testing/sysfs-kernel-slab
··· 429 429 Contact: Pekka Enberg <penberg@cs.helsinki.fi>, 430 430 Christoph Lameter <cl@linux-foundation.org> 431 431 Description: 432 - The shrink file is written when memory should be reclaimed from 433 - a cache. Empty partial slabs are freed and the partial list is 434 - sorted so the slabs with the fewest available objects are used 435 - first. 432 + The shrink file is used to reclaim unused slab cache 433 + memory from a cache. Empty per-cpu or partial slabs 434 + are freed and the partial list is sorted so the slabs 435 + with the fewest available objects are used first. 436 + It only accepts a value of "1" on write for shrinking 437 + the cache. Other input values are considered invalid. 438 + Shrinking slab caches might be expensive and can 439 + adversely impact other running applications. So it 440 + should be used with care. 436 441 437 442 What: /sys/kernel/slab/cache/slab_size 438 443 Date: May 2007
+3 -1
Documentation/admin-guide/cgroup-v1/memory.rst
··· 85 85 memory.oom_control set/show oom controls. 86 86 memory.numa_stat show the number of memory usage per numa 87 87 node 88 - 89 88 memory.kmem.limit_in_bytes set/show hard limit for kernel memory 89 + This knob is deprecated and shouldn't be 90 + used. It is planned that this be removed in 91 + the foreseeable future. 90 92 memory.kmem.usage_in_bytes show current kernel memory allocation 91 93 memory.kmem.failcnt show the number of kernel memory usage 92 94 hits limits
+2
Documentation/admin-guide/kernel-parameters.txt
··· 809 809 enables the feature at boot time. By default, it is 810 810 disabled and the system will work mostly the same as a 811 811 kernel built without CONFIG_DEBUG_PAGEALLOC. 812 + Note: to get most of debug_pagealloc error reports, it's 813 + useful to also enable the page_owner functionality. 812 814 on: enable the feature 813 815 814 816 debugpat [X86] Enable PAT debugging
+11
arch/Kconfig
··· 706 706 and vice-versa 32-bit applications to call 64-bit mmap(). 707 707 Required for applications doing different bitness syscalls. 708 708 709 + # This allows to use a set of generic functions to determine mmap base 710 + # address by giving priority to top-down scheme only if the process 711 + # is not in legacy mode (compat task, unlimited stack size or 712 + # sysctl_legacy_va_layout). 713 + # Architecture that selects this option can provide its own version of: 714 + # - STACK_RND_MASK 715 + config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT 716 + bool 717 + depends on MMU 718 + select ARCH_HAS_ELF_RANDOMIZE 719 + 709 720 config HAVE_COPY_THREAD_TLS 710 721 bool 711 722 help
-2
arch/alpha/include/asm/pgalloc.h
··· 53 53 free_page((unsigned long)pmd); 54 54 } 55 55 56 - #define check_pgt_cache() do { } while (0) 57 - 58 56 #endif /* _ALPHA_PGALLOC_H */
-5
arch/alpha/include/asm/pgtable.h
··· 359 359 360 360 #include <asm-generic/pgtable.h> 361 361 362 - /* 363 - * No page table caches to initialise 364 - */ 365 - #define pgtable_cache_init() do { } while (0) 366 - 367 362 /* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */ 368 363 #define HAVE_ARCH_UNMAPPED_AREA 369 364
-1
arch/arc/include/asm/pgalloc.h
··· 129 129 130 130 #define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte) 131 131 132 - #define check_pgt_cache() do { } while (0) 133 132 #define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd)) 134 133 135 134 #endif /* _ASM_ARC_PGALLOC_H */
-5
arch/arc/include/asm/pgtable.h
··· 395 395 /* to cope with aliasing VIPT cache */ 396 396 #define HAVE_ARCH_UNMAPPED_AREA 397 397 398 - /* 399 - * No page table caches to initialise 400 - */ 401 - #define pgtable_cache_init() do { } while (0) 402 - 403 398 #endif /* __ASSEMBLY__ */ 404 399 405 400 #endif
+1
arch/arm/Kconfig
··· 34 34 select ARCH_SUPPORTS_ATOMIC_RMW 35 35 select ARCH_USE_BUILTIN_BSWAP 36 36 select ARCH_USE_CMPXCHG_LOCKREF 37 + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU 37 38 select ARCH_WANT_IPC_PARSE_VERSION 38 39 select BINFMT_FLAT_ARGVP_ENVP_ON_STACK 39 40 select BUILDTIME_EXTABLE_SORT if MMU
-2
arch/arm/include/asm/pgalloc.h
··· 15 15 #include <asm/cacheflush.h> 16 16 #include <asm/tlbflush.h> 17 17 18 - #define check_pgt_cache() do { } while (0) 19 - 20 18 #ifdef CONFIG_MMU 21 19 22 20 #define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_USER))
-5
arch/arm/include/asm/pgtable-nommu.h
··· 71 71 extern unsigned int kobjsize(const void *objp); 72 72 73 73 /* 74 - * No page table caches to initialise. 75 - */ 76 - #define pgtable_cache_init() do { } while (0) 77 - 78 - /* 79 74 * All 32bit addresses are effectively valid for vmalloc... 80 75 * Sort of meaningless for non-VM targets. 81 76 */
-2
arch/arm/include/asm/pgtable.h
··· 368 368 #define HAVE_ARCH_UNMAPPED_AREA 369 369 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 370 370 371 - #define pgtable_cache_init() do { } while (0) 372 - 373 371 #endif /* !__ASSEMBLY__ */ 374 372 375 373 #endif /* CONFIG_MMU */
-2
arch/arm/include/asm/processor.h
··· 140 140 #endif 141 141 #endif 142 142 143 - #define HAVE_ARCH_PICK_MMAP_LAYOUT 144 - 145 143 #endif 146 144 147 145 #endif /* __ASM_ARM_PROCESSOR_H */
-5
arch/arm/kernel/process.c
··· 319 319 return 0; 320 320 } 321 321 322 - unsigned long arch_randomize_brk(struct mm_struct *mm) 323 - { 324 - return randomize_page(mm->brk, 0x02000000); 325 - } 326 - 327 322 #ifdef CONFIG_MMU 328 323 #ifdef CONFIG_KUSER_HELPERS 329 324 /*
+3 -4
arch/arm/mm/flush.c
··· 204 204 * coherent with the kernels mapping. 205 205 */ 206 206 if (!PageHighMem(page)) { 207 - size_t page_size = PAGE_SIZE << compound_order(page); 208 - __cpuc_flush_dcache_area(page_address(page), page_size); 207 + __cpuc_flush_dcache_area(page_address(page), page_size(page)); 209 208 } else { 210 209 unsigned long i; 211 210 if (cache_is_vipt_nonaliasing()) { 212 - for (i = 0; i < (1 << compound_order(page)); i++) { 211 + for (i = 0; i < compound_nr(page); i++) { 213 212 void *addr = kmap_atomic(page + i); 214 213 __cpuc_flush_dcache_area(addr, PAGE_SIZE); 215 214 kunmap_atomic(addr); 216 215 } 217 216 } else { 218 - for (i = 0; i < (1 << compound_order(page)); i++) { 217 + for (i = 0; i < compound_nr(page); i++) { 219 218 void *addr = kmap_high_get(page + i); 220 219 if (addr) { 221 220 __cpuc_flush_dcache_area(addr, PAGE_SIZE);
-52
arch/arm/mm/mmap.c
··· 17 17 ((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \ 18 18 (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1))) 19 19 20 - /* gap between mmap and stack */ 21 - #define MIN_GAP (128*1024*1024UL) 22 - #define MAX_GAP ((TASK_SIZE)/6*5) 23 - 24 - static int mmap_is_legacy(struct rlimit *rlim_stack) 25 - { 26 - if (current->personality & ADDR_COMPAT_LAYOUT) 27 - return 1; 28 - 29 - if (rlim_stack->rlim_cur == RLIM_INFINITY) 30 - return 1; 31 - 32 - return sysctl_legacy_va_layout; 33 - } 34 - 35 - static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) 36 - { 37 - unsigned long gap = rlim_stack->rlim_cur; 38 - 39 - if (gap < MIN_GAP) 40 - gap = MIN_GAP; 41 - else if (gap > MAX_GAP) 42 - gap = MAX_GAP; 43 - 44 - return PAGE_ALIGN(TASK_SIZE - gap - rnd); 45 - } 46 - 47 20 /* 48 21 * We need to ensure that shared mappings are correctly aligned to 49 22 * avoid aliasing issues with VIPT caches. We need to ensure that ··· 142 169 } 143 170 144 171 return addr; 145 - } 146 - 147 - unsigned long arch_mmap_rnd(void) 148 - { 149 - unsigned long rnd; 150 - 151 - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); 152 - 153 - return rnd << PAGE_SHIFT; 154 - } 155 - 156 - void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) 157 - { 158 - unsigned long random_factor = 0UL; 159 - 160 - if (current->flags & PF_RANDOMIZE) 161 - random_factor = arch_mmap_rnd(); 162 - 163 - if (mmap_is_legacy(rlim_stack)) { 164 - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; 165 - mm->get_unmapped_area = arch_get_unmapped_area; 166 - } else { 167 - mm->mmap_base = mmap_base(random_factor, rlim_stack); 168 - mm->get_unmapped_area = arch_get_unmapped_area_topdown; 169 - } 170 172 } 171 173 172 174 /*
+1 -1
arch/arm64/Kconfig
··· 15 15 select ARCH_HAS_DMA_COHERENT_TO_PFN 16 16 select ARCH_HAS_DMA_PREP_COHERENT 17 17 select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI 18 - select ARCH_HAS_ELF_RANDOMIZE 19 18 select ARCH_HAS_FAST_MULTIPLIER 20 19 select ARCH_HAS_FORTIFY_SOURCE 21 20 select ARCH_HAS_GCOV_PROFILE_ALL ··· 70 71 select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG 71 72 select ARCH_SUPPORTS_NUMA_BALANCING 72 73 select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT 74 + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT 73 75 select ARCH_WANT_FRAME_POINTERS 74 76 select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) 75 77 select ARCH_HAS_UBSAN_SANITIZE_ALL
-2
arch/arm64/include/asm/pgalloc.h
··· 15 15 16 16 #include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ 17 17 18 - #define check_pgt_cache() do { } while (0) 19 - 20 18 #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) 21 19 22 20 #if CONFIG_PGTABLE_LEVELS > 2
-2
arch/arm64/include/asm/pgtable.h
··· 861 861 862 862 #include <asm-generic/pgtable.h> 863 863 864 - static inline void pgtable_cache_init(void) { } 865 - 866 864 /* 867 865 * On AArch64, the cache coherency is handled via the set_pte_at() function. 868 866 */
-2
arch/arm64/include/asm/processor.h
··· 280 280 "nop") : : "p" (ptr)); 281 281 } 282 282 283 - #define HAVE_ARCH_PICK_MMAP_LAYOUT 284 - 285 283 extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */ 286 284 extern void __init minsigstksz_setup(void); 287 285
-8
arch/arm64/kernel/process.c
··· 557 557 return sp & ~0xf; 558 558 } 559 559 560 - unsigned long arch_randomize_brk(struct mm_struct *mm) 561 - { 562 - if (is_compat_task()) 563 - return randomize_page(mm->brk, SZ_32M); 564 - else 565 - return randomize_page(mm->brk, SZ_1G); 566 - } 567 - 568 560 /* 569 561 * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY. 570 562 */
+1 -2
arch/arm64/mm/flush.c
··· 56 56 struct page *page = pte_page(pte); 57 57 58 58 if (!test_and_set_bit(PG_dcache_clean, &page->flags)) 59 - sync_icache_aliases(page_address(page), 60 - PAGE_SIZE << compound_order(page)); 59 + sync_icache_aliases(page_address(page), page_size(page)); 61 60 } 62 61 EXPORT_SYMBOL_GPL(__sync_icache_dcache); 63 62
-72
arch/arm64/mm/mmap.c
··· 21 21 #include <asm/cputype.h> 22 22 23 23 /* 24 - * Leave enough space between the mmap area and the stack to honour ulimit in 25 - * the face of randomisation. 26 - */ 27 - #define MIN_GAP (SZ_128M) 28 - #define MAX_GAP (STACK_TOP/6*5) 29 - 30 - static int mmap_is_legacy(struct rlimit *rlim_stack) 31 - { 32 - if (current->personality & ADDR_COMPAT_LAYOUT) 33 - return 1; 34 - 35 - if (rlim_stack->rlim_cur == RLIM_INFINITY) 36 - return 1; 37 - 38 - return sysctl_legacy_va_layout; 39 - } 40 - 41 - unsigned long arch_mmap_rnd(void) 42 - { 43 - unsigned long rnd; 44 - 45 - #ifdef CONFIG_COMPAT 46 - if (test_thread_flag(TIF_32BIT)) 47 - rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); 48 - else 49 - #endif 50 - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); 51 - return rnd << PAGE_SHIFT; 52 - } 53 - 54 - static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) 55 - { 56 - unsigned long gap = rlim_stack->rlim_cur; 57 - unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap; 58 - 59 - /* Values close to RLIM_INFINITY can overflow. */ 60 - if (gap + pad > gap) 61 - gap += pad; 62 - 63 - if (gap < MIN_GAP) 64 - gap = MIN_GAP; 65 - else if (gap > MAX_GAP) 66 - gap = MAX_GAP; 67 - 68 - return PAGE_ALIGN(STACK_TOP - gap - rnd); 69 - } 70 - 71 - /* 72 - * This function, called very early during the creation of a new process VM 73 - * image, sets up which VM layout function to use: 74 - */ 75 - void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) 76 - { 77 - unsigned long random_factor = 0UL; 78 - 79 - if (current->flags & PF_RANDOMIZE) 80 - random_factor = arch_mmap_rnd(); 81 - 82 - /* 83 - * Fall back to the standard layout if the personality bit is set, or 84 - * if the expected stack growth is unlimited: 85 - */ 86 - if (mmap_is_legacy(rlim_stack)) { 87 - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; 88 - mm->get_unmapped_area = arch_get_unmapped_area; 89 - } else { 90 - mm->mmap_base = mmap_base(random_factor, rlim_stack); 91 - mm->get_unmapped_area = arch_get_unmapped_area_topdown; 92 - } 93 - } 94 - 95 - /* 96 24 * You really shouldn't be using read() or write() on /dev/mem. This might go 97 25 away in the future. 98 26 */
+1 -1
arch/arm64/mm/pgd.c
··· 35 35 kmem_cache_free(pgd_cache, pgd); 36 36 } 37 37 38 - void __init pgd_cache_init(void) 38 + void __init pgtable_cache_init(void) 39 39 { 40 40 if (PGD_SIZE == PAGE_SIZE) 41 41 return;
-5
arch/c6x/include/asm/pgtable.h
··· 60 60 #define swapper_pg_dir ((pgd_t *) 0) 61 61 62 62 /* 63 - * No page table caches to initialise 64 - */ 65 - #define pgtable_cache_init() do { } while (0) 66 - 67 - /* 68 63 * c6x is !MMU, so define the simpliest implementation 69 64 */ 70 65 #define pgprot_writecombine pgprot_noncached
-2
arch/csky/include/asm/pgalloc.h
··· 75 75 tlb_remove_page(tlb, pte); \ 76 76 } while (0) 77 77 78 - #define check_pgt_cache() do {} while (0) 79 - 80 78 extern void pagetable_init(void); 81 79 extern void pre_mmu_init(void); 82 80 extern void pre_trap_init(void);
-5
arch/csky/include/asm/pgtable.h
··· 296 296 /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ 297 297 #define kern_addr_valid(addr) (1) 298 298 299 - /* 300 - * No page table caches to initialise 301 - */ 302 - #define pgtable_cache_init() do {} while (0) 303 - 304 299 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ 305 300 remap_pfn_range(vma, vaddr, pfn, size, prot) 306 301
-6
arch/h8300/include/asm/pgtable.h
··· 4 4 #define __ARCH_USE_5LEVEL_HACK 5 5 #include <asm-generic/pgtable-nopud.h> 6 6 #include <asm-generic/pgtable.h> 7 - #define pgtable_cache_init() do { } while (0) 8 7 extern void paging_init(void); 9 8 #define PAGE_NONE __pgprot(0) /* these mean nothing to NO_MM */ 10 9 #define PAGE_SHARED __pgprot(0) /* these mean nothing to NO_MM */ ··· 32 33 */ 33 34 extern unsigned int kobjsize(const void *objp); 34 35 extern int is_in_rom(unsigned long); 35 - 36 - /* 37 - * No page table caches to initialise 38 - */ 39 - #define pgtable_cache_init() do { } while (0) 40 36 41 37 /* 42 38 * All 32bit addresses are effectively valid for vmalloc...
-2
arch/hexagon/include/asm/pgalloc.h
··· 13 13 14 14 #include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ 15 15 16 - #define check_pgt_cache() do {} while (0) 17 - 18 16 extern unsigned long long kmap_generation; 19 17 20 18 /*
-3
arch/hexagon/include/asm/pgtable.h
··· 431 431 432 432 #define __pte_offset(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) 433 433 434 - /* I think this is in case we have page table caches; needed by init/main.c */ 435 - #define pgtable_cache_init() do { } while (0) 436 - 437 434 /* 438 435 * Swap/file PTE definitions. If _PAGE_PRESENT is zero, the rest of the PTE is 439 436 * interpreted as swap information. The remaining free bits are interpreted as
+1 -1
arch/hexagon/mm/Makefile
··· 3 3 # Makefile for Hexagon memory management subsystem 4 4 # 5 5 6 - obj-y := init.o pgalloc.o ioremap.o uaccess.o vm_fault.o cache.o 6 + obj-y := init.o ioremap.o uaccess.o vm_fault.o cache.o 7 7 obj-y += copy_to_user.o copy_from_user.o strnlen_user.o vm_tlb.o
-10
arch/hexagon/mm/pgalloc.c
··· 1 - // SPDX-License-Identifier: GPL-2.0-only 2 - /* 3 - * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved. 4 - */ 5 - 6 - #include <linux/init.h> 7 - 8 - void __init pgtable_cache_init(void) 9 - { 10 - }
-4
arch/ia64/Kconfig
··· 72 72 config ZONE_DMA32 73 73 def_bool y 74 74 75 - config QUICKLIST 76 - bool 77 - default y 78 - 79 75 config MMU 80 76 bool 81 77 default y
+8 -44
arch/ia64/include/asm/pgalloc.h
··· 19 19 #include <linux/mm.h> 20 20 #include <linux/page-flags.h> 21 21 #include <linux/threads.h> 22 - #include <linux/quicklist.h> 22 + 23 + #include <asm-generic/pgalloc.h> 23 24 24 25 #include <asm/mmu_context.h> 25 26 26 27 static inline pgd_t *pgd_alloc(struct mm_struct *mm) 27 28 { 28 - return quicklist_alloc(0, GFP_KERNEL, NULL); 29 + return (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 29 30 } 30 31 31 32 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) 32 33 { 33 - quicklist_free(0, NULL, pgd); 34 + free_page((unsigned long)pgd); 34 35 } 35 36 36 37 #if CONFIG_PGTABLE_LEVELS == 4 ··· 43 42 44 43 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) 45 44 { 46 - return quicklist_alloc(0, GFP_KERNEL, NULL); 45 + return (pud_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 47 46 } 48 47 49 48 static inline void pud_free(struct mm_struct *mm, pud_t *pud) 50 49 { 51 - quicklist_free(0, NULL, pud); 50 + free_page((unsigned long)pud); 52 51 } 53 52 #define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud) 54 53 #endif /* CONFIG_PGTABLE_LEVELS == 4 */ ··· 61 60 62 61 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) 63 62 { 64 - return quicklist_alloc(0, GFP_KERNEL, NULL); 63 + return (pmd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 65 64 } 66 65 67 66 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) 68 67 { 69 - quicklist_free(0, NULL, pmd); 68 + free_page((unsigned long)pmd); 70 69 } 71 70 72 71 #define __pmd_free_tlb(tlb, pmd, address) pmd_free((tlb)->mm, pmd) ··· 82 81 pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte) 83 82 { 84 83 pmd_val(*pmd_entry) = __pa(pte); 85 - } 86 - 87 - static inline pgtable_t pte_alloc_one(struct mm_struct *mm) 88 - { 89 - struct page *page; 90 - void *pg; 91 - 92 - pg = quicklist_alloc(0, GFP_KERNEL, NULL); 93 - if (!pg) 94 - return NULL; 95 - page = virt_to_page(pg); 96 - if (!pgtable_page_ctor(page)) { 97 - quicklist_free(0, NULL, pg); 98 - return NULL; 99 - } 100 - return page; 101 - } 102 - 103 - static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) 104 - { 105 - return quicklist_alloc(0, GFP_KERNEL, NULL); 106 - } 107 - 108 - static inline void pte_free(struct mm_struct *mm, pgtable_t pte) 109 - { 110 - pgtable_page_dtor(pte); 111 - quicklist_free_page(0, NULL, pte); 112 - } 113 - 114 - static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) 115 - { 116 - quicklist_free(0, NULL, pte); 117 - } 118 - 119 - static inline void check_pgt_cache(void) 120 - { 121 - quicklist_trim(0, NULL, 25, 16); 122 84 } 123 85 124 86 #define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte)
-5
arch/ia64/include/asm/pgtable.h
··· 566 566 #define KERNEL_TR_PAGE_SHIFT _PAGE_SIZE_64M 567 567 #define KERNEL_TR_PAGE_SIZE (1 << KERNEL_TR_PAGE_SHIFT) 568 568 569 - /* 570 - * No page table caches to initialise 571 - */ 572 - #define pgtable_cache_init() do { } while (0) 573 - 574 569 /* These tell get_user_pages() that the first gate page is accessible from user-level. */ 575 570 #define FIXADDR_USER_START GATE_ADDR 576 571 #ifdef HAVE_BUGGY_SEGREL
+1 -1
arch/ia64/mm/init.c
··· 64 64 if (test_bit(PG_arch_1, &page->flags)) 65 65 return; /* i-cache is already coherent with d-cache */ 66 66 67 - flush_icache_range(addr, addr + (PAGE_SIZE << compound_order(page))); 67 + flush_icache_range(addr, addr + page_size(page)); 68 68 set_bit(PG_arch_1, &page->flags); /* mark page as clean */ 69 69 } 70 70
-7
arch/m68k/include/asm/pgtable_mm.h
··· 176 176 #include <asm-generic/pgtable.h> 177 177 #endif /* !__ASSEMBLY__ */ 178 178 179 - /* 180 - * No page table caches to initialise 181 - */ 182 - #define pgtable_cache_init() do { } while (0) 183 - 184 - #define check_pgt_cache() do { } while (0) 185 - 186 179 #endif /* _M68K_PGTABLE_H */
-7
arch/m68k/include/asm/pgtable_no.h
··· 45 45 #define ZERO_PAGE(vaddr) (virt_to_page(0)) 46 46 47 47 /* 48 - * No page table caches to initialise. 49 - */ 50 - #define pgtable_cache_init() do { } while (0) 51 - 52 - /* 53 48 * All 32bit addresses are effectively valid for vmalloc... 54 49 * Sort of meaningless for non-VM targets. 55 50 */ ··· 54 59 #define KMAP_END 0xffffffff 55 60 56 61 #include <asm-generic/pgtable.h> 57 - 58 - #define check_pgt_cache() do { } while (0) 59 62 60 63 #endif /* _M68KNOMMU_PGTABLE_H */
+7 -115
arch/microblaze/include/asm/pgalloc.h
··· 21 21 #include <asm/cache.h> 22 22 #include <asm/pgtable.h> 23 23 24 - #define PGDIR_ORDER 0 25 - 26 - /* 27 - * This is handled very differently on MicroBlaze since out page tables 28 - * are all 0's and I want to be able to use these zero'd pages elsewhere 29 - * as well - it gives us quite a speedup. 30 - * -- Cort 31 - */ 32 - extern struct pgtable_cache_struct { 33 - unsigned long *pgd_cache; 34 - unsigned long *pte_cache; 35 - unsigned long pgtable_cache_sz; 36 - } quicklists; 37 - 38 - #define pgd_quicklist (quicklists.pgd_cache) 39 - #define pmd_quicklist ((unsigned long *)0) 40 - #define pte_quicklist (quicklists.pte_cache) 41 - #define pgtable_cache_size (quicklists.pgtable_cache_sz) 42 - 43 - extern unsigned long *zero_cache; /* head linked list of pre-zero'd pages */ 44 - extern atomic_t zero_sz; /* # currently pre-zero'd pages */ 45 - extern atomic_t zeropage_hits; /* # zero'd pages request that we've done */ 46 - extern atomic_t zeropage_calls; /* # zero'd pages request that've been made */ 47 - extern atomic_t zerototal; /* # pages zero'd over time */ 48 - 49 - #define zero_quicklist (zero_cache) 50 - #define zero_cache_sz (zero_sz) 51 - #define zero_cache_calls (zeropage_calls) 52 - #define zero_cache_hits (zeropage_hits) 53 - #define zero_cache_total (zerototal) 54 - 55 - /* 56 - * return a pre-zero'd page from the list, 57 - * return NULL if none available -- Cort 58 - */ 59 - extern unsigned long get_zero_page_fast(void); 24 + #define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL 25 + #include <asm-generic/pgalloc.h> 60 26 61 27 extern void __bad_pte(pmd_t *pmd); 62 28 63 - static inline pgd_t *get_pgd_slow(void) 29 + static inline pgd_t *get_pgd(void) 64 30 { 65 - pgd_t *ret; 66 - 67 - ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGDIR_ORDER); 68 - if (ret != NULL) 69 - clear_page(ret); 70 - return ret; 31 + return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0); 71 32 } 72 33 73 - static inline pgd_t *get_pgd_fast(void) 74 - { 75 - unsigned long *ret; 76 - 77 - ret = pgd_quicklist; 78 - if (ret != NULL) { 79 - pgd_quicklist = (unsigned long *)(*ret); 80 - ret[0] = 0; 81 - pgtable_cache_size--; 82 - } else 83 - ret = (unsigned long *)get_pgd_slow(); 84 - return (pgd_t *)ret; 85 - } 86 - 87 - static inline void free_pgd_fast(pgd_t *pgd) 88 - { 89 - *(unsigned long **)pgd = pgd_quicklist; 90 - pgd_quicklist = (unsigned long *) pgd; 91 - pgtable_cache_size++; 92 - } 93 - 94 - static inline void free_pgd_slow(pgd_t *pgd) 34 + static inline void free_pgd(pgd_t *pgd) 95 35 { 96 36 free_page((unsigned long)pgd); 97 37 } 98 38 99 - #define pgd_free(mm, pgd) free_pgd_fast(pgd) 100 - #define pgd_alloc(mm) get_pgd_fast() 39 + #define pgd_free(mm, pgd) free_pgd(pgd) 40 + #define pgd_alloc(mm) get_pgd() 101 41 102 42 #define pmd_pgtable(pmd) pmd_page(pmd) 103 43 ··· 49 109 #define pmd_alloc_one(mm, address) ({ BUG(); ((pmd_t *)2); }) 50 110 51 111 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); 52 - 53 - static inline struct page *pte_alloc_one(struct mm_struct *mm) 54 - { 55 - struct page *ptepage; 56 - 57 - #ifdef CONFIG_HIGHPTE 58 - int flags = GFP_KERNEL | __GFP_HIGHMEM; 59 - #else 60 - int flags = GFP_KERNEL; 61 - #endif 62 - 63 - ptepage = alloc_pages(flags, 0); 64 - if (!ptepage) 65 - return NULL; 66 - clear_highpage(ptepage); 67 - if (!pgtable_page_ctor(ptepage)) { 68 - __free_page(ptepage); 69 - return NULL; 70 - } 71 - return ptepage; 72 - } 73 - 74 - static inline void pte_free_fast(pte_t *pte) 75 - { 76 - *(unsigned long **)pte = pte_quicklist; 77 - pte_quicklist = (unsigned long *) pte; 78 - pgtable_cache_size++; 79 - } 80 - 81 - static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) 82 - { 83 - free_page((unsigned long)pte); 84 - } 85 - 86 - static inline void pte_free_slow(struct page *ptepage) 87 - { 88 - __free_page(ptepage); 89 - } 90 - 91 - static inline void pte_free(struct mm_struct *mm, struct page *ptepage) 92 - { 93 - pgtable_page_dtor(ptepage); 94 - __free_page(ptepage); 95 - }
-7
arch/microblaze/include/asm/pgtable.h
··· 46 46 47 47 #define swapper_pg_dir ((pgd_t *) NULL) 48 48 49 - #define pgtable_cache_init() do {} while (0) 50 - 51 49 #define arch_enter_lazy_cpu_mode() do {} while (0) 52 50 53 51 #define pgprot_noncached_wc(prot) prot ··· 523 525 524 526 /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ 525 527 #define kern_addr_valid(addr) (1) 526 - 527 - /* 528 - * No page table caches to initialise 529 - */ 530 - #define pgtable_cache_init() do { } while (0) 531 528 532 529 void do_page_fault(struct pt_regs *regs, unsigned long address, 533 530 unsigned long error_code);
-4
arch/microblaze/mm/pgtable.c
··· 44 44 unsigned long ioremap_bot; 45 45 EXPORT_SYMBOL(ioremap_bot); 46 46 47 - #ifndef CONFIG_SMP 48 - struct pgtable_cache_struct quicklists; 49 - #endif 50 - 51 47 static void __iomem *__ioremap(phys_addr_t addr, unsigned long size, 52 48 unsigned long flags) 53 49 {
+1 -1
arch/mips/Kconfig
··· 5 5 select ARCH_32BIT_OFF_T if !64BIT 6 6 select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT 7 7 select ARCH_CLOCKSOURCE_DATA 8 - select ARCH_HAS_ELF_RANDOMIZE 9 8 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST 10 9 select ARCH_HAS_UBSAN_SANITIZE_ALL 11 10 select ARCH_SUPPORTS_UPROBES ··· 12 13 select ARCH_USE_CMPXCHG_LOCKREF if 64BIT 13 14 select ARCH_USE_QUEUED_RWLOCKS 14 15 select ARCH_USE_QUEUED_SPINLOCKS 16 + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU 15 17 select ARCH_WANT_IPC_PARSE_VERSION 16 18 select BUILDTIME_EXTABLE_SORT 17 19 select CLONE_BACKWARDS
-2
arch/mips/include/asm/pgalloc.h
··· 105 105 106 106 #endif /* __PAGETABLE_PUD_FOLDED */ 107 107 108 - #define check_pgt_cache() do { } while (0) 109 - 110 108 extern void pagetable_init(void); 111 109 112 110 #endif /* _ASM_PGALLOC_H */
-5
arch/mips/include/asm/pgtable.h
··· 661 661 #define HAVE_ARCH_UNMAPPED_AREA 662 662 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 663 663 664 - /* 665 - * No page table caches to initialise 666 - */ 667 - #define pgtable_cache_init() do { } while (0) 668 - 669 664 #endif /* _ASM_PGTABLE_H */
-5
arch/mips/include/asm/processor.h
··· 29 29 30 30 extern unsigned int vced_count, vcei_count; 31 31 32 - /* 33 - * MIPS does have an arch_pick_mmap_layout() 34 - */ 35 - #define HAVE_ARCH_PICK_MMAP_LAYOUT 1 36 - 37 32 #ifdef CONFIG_32BIT 38 33 #ifdef CONFIG_KVM_GUEST 39 34 /* User space process size is limited to 1GB in KVM Guest Mode */
-84
arch/mips/mm/mmap.c
··· 20 20 unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ 21 21 EXPORT_SYMBOL(shm_align_mask); 22 22 23 - /* gap between mmap and stack */ 24 - #define MIN_GAP (128*1024*1024UL) 25 - #define MAX_GAP ((TASK_SIZE)/6*5) 26 - 27 - static int mmap_is_legacy(struct rlimit *rlim_stack) 28 - { 29 - if (current->personality & ADDR_COMPAT_LAYOUT) 30 - return 1; 31 - 32 - if (rlim_stack->rlim_cur == RLIM_INFINITY) 33 - return 1; 34 - 35 - return sysctl_legacy_va_layout; 36 - } 37 - 38 - static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) 39 - { 40 - unsigned long gap = rlim_stack->rlim_cur; 41 - 42 - if (gap < MIN_GAP) 43 - gap = MIN_GAP; 44 - else if (gap > MAX_GAP) 45 - gap = MAX_GAP; 46 - 47 - return PAGE_ALIGN(TASK_SIZE - gap - rnd); 48 - } 49 - 50 23 #define COLOUR_ALIGN(addr, pgoff) \ 51 24 ((((addr) + shm_align_mask) & ~shm_align_mask) + \ 52 25 (((pgoff) << PAGE_SHIFT) & shm_align_mask)) ··· 115 142 { 116 143 return arch_get_unmapped_area_common(filp, 117 144 addr0, len, pgoff, flags, DOWN); 118 - } 119 - 120 - unsigned long arch_mmap_rnd(void) 121 - { 122 - unsigned long rnd; 123 - 124 - #ifdef CONFIG_COMPAT 125 - if (TASK_IS_32BIT_ADDR) 126 - rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); 127 - else 128 - #endif /* CONFIG_COMPAT */ 129 - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); 130 - 131 - return rnd << PAGE_SHIFT; 132 - } 133 - 134 - void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) 135 - { 136 - unsigned long random_factor = 0UL; 137 - 138 - if (current->flags & PF_RANDOMIZE) 139 - random_factor = arch_mmap_rnd(); 140 - 141 - if (mmap_is_legacy(rlim_stack)) { 142 - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; 143 - mm->get_unmapped_area = arch_get_unmapped_area; 144 - } else { 145 - mm->mmap_base = mmap_base(random_factor, rlim_stack); 146 - mm->get_unmapped_area = arch_get_unmapped_area_topdown; 147 - } 148 - } 149 - 150 - static inline unsigned long brk_rnd(void) 151 - { 152 - unsigned long rnd = get_random_long(); 153 - 154 - rnd = rnd << PAGE_SHIFT; 155 - /* 8MB for 32bit, 256MB for 64bit */ 156 - if (TASK_IS_32BIT_ADDR) 157 - rnd = rnd & 0x7ffffful; 158 - else 159 - rnd = rnd & 0xffffffful; 160 - 161 - return rnd; 162 - } 163 - 164 - unsigned long arch_randomize_brk(struct mm_struct *mm) 165 - { 166 - unsigned long base = mm->brk; 167 - unsigned long ret; 168 - 169 - ret = PAGE_ALIGN(base + brk_rnd()); 170 - 171 - if (ret < mm->brk) 172 - return mm->brk; 173 - 174 - return ret; 175 145 } 176 146 177 147 bool __virt_addr_valid(const volatile void *kaddr)
-2
arch/nds32/include/asm/pgalloc.h
··· 23 23 extern pgd_t *pgd_alloc(struct mm_struct *mm); 24 24 extern void pgd_free(struct mm_struct *mm, pgd_t * pgd); 25 25 26 - #define check_pgt_cache() do { } while (0) 27 - 28 26 static inline pgtable_t pte_alloc_one(struct mm_struct *mm) 29 27 { 30 28 pgtable_t pte;
-2
arch/nds32/include/asm/pgtable.h
··· 403 403 * into virtual address `from' 404 404 */ 405 405 406 - #define pgtable_cache_init() do { } while (0) 407 - 408 406 #endif /* !__ASSEMBLY__ */ 409 407 410 408 #endif /* _ASMNDS32_PGTABLE_H */
-2
arch/nios2/include/asm/pgalloc.h
··· 45 45 tlb_remove_page((tlb), (pte)); \ 46 46 } while (0) 47 47 48 - #define check_pgt_cache() do { } while (0) 49 - 50 48 #endif /* _ASM_NIOS2_PGALLOC_H */
-2
arch/nios2/include/asm/pgtable.h
··· 291 291 292 292 #include <asm-generic/pgtable.h> 293 293 294 - #define pgtable_cache_init() do { } while (0) 295 - 296 294 extern void __init paging_init(void); 297 295 extern void __init mmu_init(void); 298 296
-2
arch/openrisc/include/asm/pgalloc.h
··· 101 101 102 102 #define pmd_pgtable(pmd) pmd_page(pmd) 103 103 104 - #define check_pgt_cache() do { } while (0) 105 - 106 104 #endif
-5
arch/openrisc/include/asm/pgtable.h
··· 443 443 444 444 #include <asm-generic/pgtable.h> 445 445 446 - /* 447 - * No page table caches to initialise 448 - */ 449 - #define pgtable_cache_init() do { } while (0) 450 - 451 446 typedef pte_t *pte_addr_t; 452 447 453 448 #endif /* __ASSEMBLY__ */
-2
arch/parisc/include/asm/pgalloc.h
··· 124 124 pmd_populate_kernel(mm, pmd, page_address(pte_page)) 125 125 #define pmd_pgtable(pmd) pmd_page(pmd) 126 126 127 - #define check_pgt_cache() do { } while (0) 128 - 129 127 #endif
-2
arch/parisc/include/asm/pgtable.h
··· 132 132 #define PTRS_PER_PTE (1UL << BITS_PER_PTE) 133 133 134 134 /* Definitions for 2nd level */ 135 - #define pgtable_cache_init() do { } while (0) 136 - 137 135 #define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE) 138 136 #define PMD_SIZE (1UL << PMD_SHIFT) 139 137 #define PMD_MASK (~(PMD_SIZE-1))
-2
arch/powerpc/include/asm/pgalloc.h
··· 64 64 extern struct kmem_cache *pgtable_cache[]; 65 65 #define PGT_CACHE(shift) pgtable_cache[shift] 66 66 67 - static inline void check_pgt_cache(void) { } 68 - 69 67 #ifdef CONFIG_PPC_BOOK3S 70 68 #include <asm/book3s/pgalloc.h> 71 69 #else
-1
arch/powerpc/include/asm/pgtable.h
··· 87 87 unsigned long vmalloc_to_phys(void *vmalloc_addr); 88 88 89 89 void pgtable_cache_add(unsigned int shift); 90 - void pgtable_cache_init(void); 91 90 92 91 #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32) 93 92 void mark_initmem_nx(void);
+1 -1
arch/powerpc/mm/book3s64/hash_utils.c
··· 1748 1748 /* 1749 1749 * IF we try to do a HUGE PTE update after a withdraw is done. 1750 1750 * we will find the below NULL. This happens when we do 1751 - * split_huge_page_pmd 1751 + * split_huge_pmd 1752 1752 */ 1753 1753 if (!hpte_slot_array) 1754 1754 return;
+2 -5
arch/powerpc/mm/book3s64/iommu_api.c
··· 129 129 * Allow to use larger than 64k IOMMU pages. Only do that 130 130 * if we are backed by hugetlb. 131 131 */ 132 - if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { 133 - struct page *head = compound_head(page); 134 - 135 - pageshift = compound_order(head) + PAGE_SHIFT; 136 - } 132 + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) 133 + pageshift = page_shift(compound_head(page)); 137 134 mem->pageshift = min(mem->pageshift, pageshift); 138 135 /* 139 136 * We don't need struct page reference any more, switch
+1 -1
arch/powerpc/mm/hugetlbpage.c
··· 667 667 668 668 BUG_ON(!PageCompound(page)); 669 669 670 - for (i = 0; i < (1UL << compound_order(page)); i++) { 670 + for (i = 0; i < compound_nr(page); i++) { 671 671 if (!PageHighMem(page)) { 672 672 __flush_dcache_icache(page_address(page+i)); 673 673 } else {
+12
arch/riscv/Kconfig
··· 59 59 select ARCH_HAS_GIGANTIC_PAGE 60 60 select ARCH_WANT_HUGE_PMD_SHARE if 64BIT 61 61 select SPARSEMEM_STATIC if 32BIT 62 + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU 63 + select HAVE_ARCH_MMAP_RND_BITS 64 + 65 + config ARCH_MMAP_RND_BITS_MIN 66 + default 18 if 64BIT 67 + default 8 68 + 69 + # max bits determined by the following formula: 70 + # VA_BITS - PAGE_SHIFT - 3 71 + config ARCH_MMAP_RND_BITS_MAX 72 + default 24 if 64BIT # SV39 based 73 + default 17 62 74 63 75 config MMU 64 76 def_bool y
-4
arch/riscv/include/asm/pgalloc.h
··· 82 82 tlb_remove_page((tlb), pte); \ 83 83 } while (0) 84 84 85 - static inline void check_pgt_cache(void) 86 - { 87 - } 88 - 89 85 #endif /* _ASM_RISCV_PGALLOC_H */
-5
arch/riscv/include/asm/pgtable.h
··· 424 424 extern void setup_bootmem(void); 425 425 extern void paging_init(void); 426 426 427 - static inline void pgtable_cache_init(void) 428 - { 429 - /* No page table caches to initialize */ 430 - } 431 - 432 427 #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) 433 428 #define VMALLOC_END (PAGE_OFFSET - 1) 434 429 #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE)
-6
arch/s390/include/asm/pgtable.h
··· 1682 1682 #define HAVE_ARCH_UNMAPPED_AREA 1683 1683 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 1684 1684 1685 - /* 1686 - * No page table caches to initialise 1687 - */ 1688 - static inline void pgtable_cache_init(void) { } 1689 - static inline void check_pgt_cache(void) { } 1690 - 1691 1685 #include <asm-generic/pgtable.h> 1692 1686 1693 1687 #endif /* _S390_PAGE_H */
+1 -43
arch/sh/include/asm/pgalloc.h
··· 2 2 #ifndef __ASM_SH_PGALLOC_H 3 3 #define __ASM_SH_PGALLOC_H 4 4 5 - #include <linux/quicklist.h> 6 5 #include <asm/page.h> 7 - 8 - #define QUICK_PT 0 /* Other page table pages that are zero on free */ 6 + #include <asm-generic/pgalloc.h> 9 7 10 8 extern pgd_t *pgd_alloc(struct mm_struct *); 11 9 extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); ··· 27 29 } 28 30 #define pmd_pgtable(pmd) pmd_page(pmd) 29 31 30 - /* 31 - * Allocate and free page tables. 32 - */ 33 - static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) 34 - { 35 - return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); 36 - } 37 - 38 - static inline pgtable_t pte_alloc_one(struct mm_struct *mm) 39 - { 40 - struct page *page; 41 - void *pg; 42 - 43 - pg = quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); 44 - if (!pg) 45 - return NULL; 46 - page = virt_to_page(pg); 47 - if (!pgtable_page_ctor(page)) { 48 - quicklist_free(QUICK_PT, NULL, pg); 49 - return NULL; 50 - } 51 - return page; 52 - } 53 - 54 - static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) 55 - { 56 - quicklist_free(QUICK_PT, NULL, pte); 57 - } 58 - 59 - static inline void pte_free(struct mm_struct *mm, pgtable_t pte) 60 - { 61 - pgtable_page_dtor(pte); 62 - quicklist_free_page(QUICK_PT, NULL, pte); 63 - } 64 - 65 32 #define __pte_free_tlb(tlb,pte,addr) \ 66 33 do { \ 67 34 pgtable_page_dtor(pte); \ ··· 41 78 tlb_remove_page((tlb), page); \ 42 79 } while (0); 43 80 #endif 44 - 45 - static inline void check_pgt_cache(void) 46 - { 47 - quicklist_trim(QUICK_PT, NULL, 25, 16); 48 - } 49 81 50 82 #endif /* __ASM_SH_PGALLOC_H */
-5
arch/sh/include/asm/pgtable.h
··· 123 123 124 124 #define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) 125 125 126 - /* 127 - * Initialise the page table caches 128 - */ 129 - extern void pgtable_cache_init(void); 130 - 131 126 struct vm_area_struct; 132 127 struct mm_struct; 133 128
-3
arch/sh/mm/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 2 menu "Memory management options" 3 3 4 - config QUICKLIST 5 - def_bool y 6 - 7 4 config MMU 8 5 bool "Support for memory management hardware" 9 6 depends on !CPU_SH2
-4
arch/sh/mm/nommu.c
··· 97 97 void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) 98 98 { 99 99 } 100 - 101 - void pgtable_cache_init(void) 102 - { 103 - }
-2
arch/sparc/include/asm/pgalloc_32.h
··· 17 17 18 18 extern struct resource sparc_iomap; 19 19 20 - #define check_pgt_cache() do { } while (0) 21 - 22 20 pgd_t *get_pgd_fast(void); 23 21 static inline void free_pgd_fast(pgd_t *pgd) 24 22 {
-2
arch/sparc/include/asm/pgalloc_64.h
··· 69 69 #define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE) 70 70 #define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD)) 71 71 72 - #define check_pgt_cache() do { } while (0) 73 - 74 72 void pgtable_free(void *table, bool is_page); 75 73 76 74 #ifdef CONFIG_SMP
-5
arch/sparc/include/asm/pgtable_32.h
··· 445 445 /* We provide our own get_unmapped_area to cope with VA holes for userland */ 446 446 #define HAVE_ARCH_UNMAPPED_AREA 447 447 448 - /* 449 - * No page table caches to initialise 450 - */ 451 - #define pgtable_cache_init() do { } while (0) 452 - 453 448 #endif /* !(_SPARC_PGTABLE_H) */
-1
arch/sparc/include/asm/pgtable_64.h
··· 1135 1135 unsigned long); 1136 1136 #define HAVE_ARCH_FB_UNMAPPED_AREA 1137 1137 1138 - void pgtable_cache_init(void); 1139 1138 void sun4v_register_fault_status(void); 1140 1139 void sun4v_ktsb_register(void); 1141 1140 void __init cheetah_ecache_flush_init(void);
-1
arch/sparc/mm/init_32.c
··· 31 31 #include <asm/page.h> 32 32 #include <asm/pgtable.h> 33 33 #include <asm/vaddrs.h> 34 - #include <asm/pgalloc.h> /* bug in asm-generic/tlb.h: check_pgt_cache */ 35 34 #include <asm/setup.h> 36 35 #include <asm/tlb.h> 37 36 #include <asm/prom.h>
-2
arch/um/include/asm/pgalloc.h
··· 43 43 #define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x)) 44 44 #endif 45 45 46 - #define check_pgt_cache() do { } while (0) 47 - 48 46 #endif 49 47
-2
arch/um/include/asm/pgtable.h
··· 32 32 /* zero page used for uninitialized stuff */ 33 33 extern unsigned long *empty_zero_page; 34 34 35 - #define pgtable_cache_init() do ; while (0) 36 - 37 35 /* Just any arbitrary offset to the start of the vmalloc VM area: the 38 36 * current 8MB value just means that there will be a 8MB "hole" after the 39 37 * physical memory until the kernel virtual memory starts. That means that
-2
arch/unicore32/include/asm/pgalloc.h
··· 18 18 #define __HAVE_ARCH_PTE_ALLOC_ONE 19 19 #include <asm-generic/pgalloc.h> 20 20 21 - #define check_pgt_cache() do { } while (0) 22 - 23 21 #define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) 24 22 #define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) 25 23
-2
arch/unicore32/include/asm/pgtable.h
··· 285 285 286 286 #include <asm-generic/pgtable.h> 287 287 288 - #define pgtable_cache_init() do { } while (0) 289 - 290 288 #endif /* !__ASSEMBLY__ */ 291 289 292 290 #endif /* __UNICORE_PGTABLE_H__ */
-2
arch/x86/include/asm/pgtable_32.h
··· 29 29 extern pgd_t initial_page_table[1024]; 30 30 extern pmd_t initial_pg_pmd[]; 31 31 32 - static inline void pgtable_cache_init(void) { } 33 - static inline void check_pgt_cache(void) { } 34 32 void paging_init(void); 35 33 void sync_initial_page_table(void); 36 34
-3
arch/x86/include/asm/pgtable_64.h
··· 241 241 #define HAVE_ARCH_UNMAPPED_AREA 242 242 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 243 243 244 - #define pgtable_cache_init() do { } while (0) 245 - #define check_pgt_cache() do { } while (0) 246 - 247 244 #define PAGE_AGP PAGE_KERNEL_NOCACHE 248 245 #define HAVE_PAGE_AGP 1 249 246
+1 -5
arch/x86/mm/pgtable.c
··· 357 357 358 358 static struct kmem_cache *pgd_cache; 359 359 360 - void __init pgd_cache_init(void) 360 + void __init pgtable_cache_init(void) 361 361 { 362 362 /* 363 363 * When PAE kernel is running as a Xen domain, it does not use ··· 401 401 kmem_cache_free(pgd_cache, pgd); 402 402 } 403 403 #else 404 - 405 - void __init pgd_cache_init(void) 406 - { 407 - } 408 404 409 405 static inline pgd_t *_pgd_alloc(void) 410 406 {
-1
arch/xtensa/include/asm/pgtable.h
··· 238 238 # define swapper_pg_dir NULL 239 239 static inline void paging_init(void) { } 240 240 #endif 241 - static inline void pgtable_cache_init(void) { } 242 241 243 242 /* 244 243 * The pmd contains the kernel virtual address of the pte page.
-3
arch/xtensa/include/asm/tlbflush.h
··· 160 160 invalidate_dtlb_entry(tlb_entry); 161 161 } 162 162 163 - #define check_pgt_cache() do { } while (0) 164 - 165 - 166 163 /* 167 164 * DO NOT USE THESE FUNCTIONS. These instructions aren't part of the Xtensa 168 165 * ISA and exist only for test purposes..
+16 -28
drivers/base/memory.c
··· 100 100 } 101 101 EXPORT_SYMBOL_GPL(memory_block_size_bytes); 102 102 103 - static unsigned long get_memory_block_size(void) 104 - { 105 - unsigned long block_sz; 106 - 107 - block_sz = memory_block_size_bytes(); 108 - 109 - /* Validate blk_sz is a power of 2 and not less than section size */ 110 - if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) { 111 - WARN_ON(1); 112 - block_sz = MIN_MEMORY_BLOCK_SIZE; 113 - } 114 - 115 - return block_sz; 116 - } 117 - 118 103 /* 119 - * use this as the physical section index that this memsection 120 - * uses. 104 + * Show the first physical section index (number) of this memory block. 121 105 */ 122 - 123 106 static ssize_t phys_index_show(struct device *dev, 124 107 struct device_attribute *attr, char *buf) 125 108 { ··· 114 131 } 115 132 116 133 /* 117 - * Show whether the section of memory is likely to be hot-removable 134 + * Show whether the memory block is likely to be offlineable (or is already 135 + * offline). Once offline, the memory block could be removed. The return 136 + * value does, however, not indicate that there is a way to remove the 137 + * memory block. 118 138 */ 119 139 static ssize_t removable_show(struct device *dev, struct device_attribute *attr, 120 140 char *buf) ··· 441 455 static DEVICE_ATTR_RO(removable); 442 456 443 457 /* 444 - * Block size attribute stuff 458 + * Show the memory block size (shared by all memory blocks). 
445 459 */ 446 460 static ssize_t block_size_bytes_show(struct device *dev, 447 461 struct device_attribute *attr, char *buf) 448 462 { 449 - return sprintf(buf, "%lx\n", get_memory_block_size()); 463 + return sprintf(buf, "%lx\n", memory_block_size_bytes()); 450 464 } 451 465 452 466 static DEVICE_ATTR_RO(block_size_bytes); ··· 656 670 return -ENOMEM; 657 671 658 672 mem->start_section_nr = block_id * sections_per_block; 659 - mem->end_section_nr = mem->start_section_nr + sections_per_block - 1; 660 673 mem->state = state; 661 674 start_pfn = section_nr_to_pfn(mem->start_section_nr); 662 675 mem->phys_device = arch_get_memory_phys_device(start_pfn); 676 + mem->nid = NUMA_NO_NODE; 663 677 664 678 ret = register_memory(mem); 665 679 ··· 796 810 /* 797 811 * Initialize the sysfs support for memory devices... 798 812 */ 799 - int __init memory_dev_init(void) 813 + void __init memory_dev_init(void) 800 814 { 801 815 int ret; 802 816 int err; 803 817 unsigned long block_sz, nr; 804 818 819 + /* Validate the configured memory block size */ 820 + block_sz = memory_block_size_bytes(); 821 + if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE) 822 + panic("Memory block size not suitable: 0x%lx\n", block_sz); 823 + sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; 824 + 805 825 ret = subsys_system_register(&memory_subsys, memory_root_attr_groups); 806 826 if (ret) 807 827 goto out; 808 - 809 - block_sz = get_memory_block_size(); 810 - sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; 811 828 812 829 /* 813 830 * Create entries for memory sections that were found ··· 827 838 828 839 out: 829 840 if (ret) 830 - printk(KERN_ERR "%s() failed: %d\n", __func__, ret); 831 - return ret; 841 + panic("%s() failed: %d\n", __func__, ret); 832 842 } 833 843 834 844 /**
+26 -29
drivers/base/node.c
··· 427 427 "Node %d AnonHugePages: %8lu kB\n" 428 428 "Node %d ShmemHugePages: %8lu kB\n" 429 429 "Node %d ShmemPmdMapped: %8lu kB\n" 430 + "Node %d FileHugePages: %8lu kB\n" 431 + "Node %d FilePmdMapped: %8lu kB\n" 430 432 #endif 431 433 , 432 434 nid, K(node_page_state(pgdat, NR_FILE_DIRTY)), ··· 454 452 nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * 455 453 HPAGE_PMD_NR), 456 454 nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * 455 + HPAGE_PMD_NR), 456 + nid, K(node_page_state(pgdat, NR_FILE_THPS) * 457 + HPAGE_PMD_NR), 458 + nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) * 457 459 HPAGE_PMD_NR) 458 460 #endif 459 461 ); ··· 762 756 static int register_mem_sect_under_node(struct memory_block *mem_blk, 763 757 void *arg) 764 758 { 759 + unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE; 760 + unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); 761 + unsigned long end_pfn = start_pfn + memory_block_pfns - 1; 765 762 int ret, nid = *(int *)arg; 766 - unsigned long pfn, sect_start_pfn, sect_end_pfn; 763 + unsigned long pfn; 767 764 768 - mem_blk->nid = nid; 769 - 770 - sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); 771 - sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); 772 - sect_end_pfn += PAGES_PER_SECTION - 1; 773 - for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { 765 + for (pfn = start_pfn; pfn <= end_pfn; pfn++) { 774 766 int page_nid; 775 767 776 768 /* ··· 793 789 if (page_nid != nid) 794 790 continue; 795 791 } 792 + 793 + /* 794 + * If this memory block spans multiple nodes, we only indicate 795 + * the last processed node. 796 + */ 797 + mem_blk->nid = nid; 798 + 796 799 ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, 797 800 &mem_blk->dev.kobj, 798 801 kobject_name(&mem_blk->dev.kobj)); ··· 815 804 } 816 805 817 806 /* 818 - * Unregister memory block device under all nodes that it spans. 819 - * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes). 
807 + * Unregister a memory block device under the node it spans. Memory blocks 808 + * with multiple nodes cannot be offlined and therefore also never be removed. 820 809 */ 821 810 void unregister_memory_block_under_nodes(struct memory_block *mem_blk) 822 811 { 823 - unsigned long pfn, sect_start_pfn, sect_end_pfn; 824 - static nodemask_t unlinked_nodes; 812 + if (mem_blk->nid == NUMA_NO_NODE) 813 + return; 825 814 826 - nodes_clear(unlinked_nodes); 827 - sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); 828 - sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); 829 - for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { 830 - int nid; 831 - 832 - nid = get_nid_for_pfn(pfn); 833 - if (nid < 0) 834 - continue; 835 - if (!node_online(nid)) 836 - continue; 837 - if (node_test_and_set(nid, unlinked_nodes)) 838 - continue; 839 - sysfs_remove_link(&node_devices[nid]->dev.kobj, 840 - kobject_name(&mem_blk->dev.kobj)); 841 - sysfs_remove_link(&mem_blk->dev.kobj, 842 - kobject_name(&node_devices[nid]->dev.kobj)); 843 - } 815 + sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj, 816 + kobject_name(&mem_blk->dev.kobj)); 817 + sysfs_remove_link(&mem_blk->dev.kobj, 818 + kobject_name(&node_devices[mem_blk->nid]->dev.kobj)); 844 819 } 845 820 846 821 int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
+2 -3
drivers/crypto/chelsio/chtls/chtls_io.c
··· 1078 1078 bool merge; 1079 1079 1080 1080 if (page) 1081 - pg_size <<= compound_order(page); 1081 + pg_size = page_size(page); 1082 1082 if (off < pg_size && 1083 1083 skb_can_coalesce(skb, i, page, off)) { 1084 1084 merge = 1; ··· 1105 1105 __GFP_NORETRY, 1106 1106 order); 1107 1107 if (page) 1108 - pg_size <<= 1109 - compound_order(page); 1108 + pg_size <<= order; 1110 1109 } 1111 1110 if (!page) { 1112 1111 page = alloc_page(gfp);
+2 -8
drivers/gpu/drm/via/via_dmablit.c
··· 174 174 static void 175 175 via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg) 176 176 { 177 - struct page *page; 178 177 int i; 179 178 180 179 switch (vsg->state) { ··· 188 189 kfree(vsg->desc_pages); 189 190 /* fall through */ 190 191 case dr_via_pages_locked: 191 - for (i = 0; i < vsg->num_pages; ++i) { 192 - if (NULL != (page = vsg->pages[i])) { 193 - if (!PageReserved(page) && (DMA_FROM_DEVICE == vsg->direction)) 194 - SetPageDirty(page); 195 - put_page(page); 196 - } 197 - } 192 + put_user_pages_dirty_lock(vsg->pages, vsg->num_pages, 193 + (vsg->direction == DMA_FROM_DEVICE)); 198 194 /* fall through */ 199 195 case dr_via_pages_alloc: 200 196 vfree(vsg->pages);
+1 -4
drivers/infiniband/core/umem.c
··· 54 54 55 55 for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { 56 56 page = sg_page_iter_page(&sg_iter); 57 - if (umem->writable && dirty) 58 - put_user_pages_dirty_lock(&page, 1); 59 - else 60 - put_user_page(page); 57 + put_user_pages_dirty_lock(&page, 1, umem->writable && dirty); 61 58 } 62 59 63 60 sg_free_table(&umem->sg_head);
+1 -4
drivers/infiniband/hw/hfi1/user_pages.c
··· 118 118 void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, 119 119 size_t npages, bool dirty) 120 120 { 121 - if (dirty) 122 - put_user_pages_dirty_lock(p, npages); 123 - else 124 - put_user_pages(p, npages); 121 + put_user_pages_dirty_lock(p, npages, dirty); 125 122 126 123 if (mm) { /* during close after signal, mm can be NULL */ 127 124 atomic64_sub(npages, &mm->pinned_vm);
+1 -4
drivers/infiniband/hw/qib/qib_user_pages.c
··· 40 40 static void __qib_release_user_pages(struct page **p, size_t num_pages, 41 41 int dirty) 42 42 { 43 - if (dirty) 44 - put_user_pages_dirty_lock(p, num_pages); 45 - else 46 - put_user_pages(p, num_pages); 43 + put_user_pages_dirty_lock(p, num_pages, dirty); 47 44 } 48 45 49 46 /**
+1 -4
drivers/infiniband/hw/usnic/usnic_uiom.c
··· 75 75 for_each_sg(chunk->page_list, sg, chunk->nents, i) { 76 76 page = sg_page(sg); 77 77 pa = sg_phys(sg); 78 - if (dirty) 79 - put_user_pages_dirty_lock(&page, 1); 80 - else 81 - put_user_page(page); 78 + put_user_pages_dirty_lock(&page, 1, dirty); 82 79 usnic_dbg("pa: %pa\n", &pa); 83 80 } 84 81 kfree(chunk);
+1 -9
drivers/infiniband/sw/siw/siw_mem.c
··· 63 63 static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages, 64 64 bool dirty) 65 65 { 66 - struct page **p = chunk->plist; 67 - 68 - while (num_pages--) { 69 - if (!PageDirty(*p) && dirty) 70 - put_user_pages_dirty_lock(p, 1); 71 - else 72 - put_user_page(*p); 73 - p++; 74 - } 66 + put_user_pages_dirty_lock(chunk->plist, num_pages, dirty); 75 67 } 76 68 77 69 void siw_umem_release(struct siw_umem *umem, bool dirty)
+2 -2
drivers/staging/android/ion/ion_system_heap.c
··· 120 120 if (!page) 121 121 goto free_pages; 122 122 list_add_tail(&page->lru, &pages); 123 - size_remaining -= PAGE_SIZE << compound_order(page); 123 + size_remaining -= page_size(page); 124 124 max_order = compound_order(page); 125 125 i++; 126 126 } ··· 133 133 134 134 sg = table->sgl; 135 135 list_for_each_entry_safe(page, tmp_page, &pages, lru) { 136 - sg_set_page(sg, page, PAGE_SIZE << compound_order(page), 0); 136 + sg_set_page(sg, page, page_size(page), 0); 137 137 sg = sg_next(sg); 138 138 list_del(&page->lru); 139 139 }
+1 -2
drivers/target/tcm_fc/tfc_io.c
··· 136 136 page, off_in_page, tlen); 137 137 fr_len(fp) += tlen; 138 138 fp_skb(fp)->data_len += tlen; 139 - fp_skb(fp)->truesize += 140 - PAGE_SIZE << compound_order(page); 139 + fp_skb(fp)->truesize += page_size(page); 141 140 } else { 142 141 BUG_ON(!page); 143 142 from = kmap_atomic(page + (mem_off >> PAGE_SHIFT));
+4 -4
drivers/vfio/vfio_iommu_spapr_tce.c
··· 176 176 } 177 177 178 178 static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, 179 - unsigned int page_shift) 179 + unsigned int it_page_shift) 180 180 { 181 181 struct page *page; 182 182 unsigned long size = 0; 183 183 184 - if (mm_iommu_is_devmem(mm, hpa, page_shift, &size)) 185 - return size == (1UL << page_shift); 184 + if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size)) 185 + return size == (1UL << it_page_shift); 186 186 187 187 page = pfn_to_page(hpa >> PAGE_SHIFT); 188 188 /* ··· 190 190 * a page we just found. Otherwise the hardware can get access to 191 191 * a bigger memory chunk that it should. 192 192 */ 193 - return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift; 193 + return page_shift(compound_head(page)) >= it_page_shift; 194 194 } 195 195 196 196 static inline bool tce_groups_attached(struct tce_container *container)
-20
fs/binfmt_elf.c
··· 670 670 * libraries. There is no binary dependent code anywhere else. 671 671 */ 672 672 673 - #ifndef STACK_RND_MASK 674 - #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ 675 - #endif 676 - 677 - static unsigned long randomize_stack_top(unsigned long stack_top) 678 - { 679 - unsigned long random_variable = 0; 680 - 681 - if (current->flags & PF_RANDOMIZE) { 682 - random_variable = get_random_long(); 683 - random_variable &= STACK_RND_MASK; 684 - random_variable <<= PAGE_SHIFT; 685 - } 686 - #ifdef CONFIG_STACK_GROWSUP 687 - return PAGE_ALIGN(stack_top) + random_variable; 688 - #else 689 - return PAGE_ALIGN(stack_top) - random_variable; 690 - #endif 691 - } 692 - 693 673 static int load_elf_binary(struct linux_binprm *bprm) 694 674 { 695 675 struct file *interpreter = NULL; /* to shut gcc up */
+11 -2
fs/fat/dir.c
··· 1100 1100 err = -ENOMEM; 1101 1101 goto error; 1102 1102 } 1103 + /* Avoid race with userspace read via bdev */ 1104 + lock_buffer(bhs[n]); 1103 1105 memset(bhs[n]->b_data, 0, sb->s_blocksize); 1104 1106 set_buffer_uptodate(bhs[n]); 1107 + unlock_buffer(bhs[n]); 1105 1108 mark_buffer_dirty_inode(bhs[n], dir); 1106 1109 1107 1110 n++; ··· 1161 1158 fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); 1162 1159 1163 1160 de = (struct msdos_dir_entry *)bhs[0]->b_data; 1161 + /* Avoid race with userspace read via bdev */ 1162 + lock_buffer(bhs[0]); 1164 1163 /* filling the new directory slots ("." and ".." entries) */ 1165 1164 memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME); 1166 1165 memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME); ··· 1185 1180 de[0].size = de[1].size = 0; 1186 1181 memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); 1187 1182 set_buffer_uptodate(bhs[0]); 1183 + unlock_buffer(bhs[0]); 1188 1184 mark_buffer_dirty_inode(bhs[0], dir); 1189 1185 1190 1186 err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); ··· 1243 1237 1244 1238 /* fill the directory entry */ 1245 1239 copy = min(size, sb->s_blocksize); 1240 + /* Avoid race with userspace read via bdev */ 1241 + lock_buffer(bhs[n]); 1246 1242 memcpy(bhs[n]->b_data, slots, copy); 1243 + set_buffer_uptodate(bhs[n]); 1244 + unlock_buffer(bhs[n]); 1245 + mark_buffer_dirty_inode(bhs[n], dir); 1247 1246 slots += copy; 1248 1247 size -= copy; 1249 - set_buffer_uptodate(bhs[n]); 1250 - mark_buffer_dirty_inode(bhs[n], dir); 1251 1248 if (!size) 1252 1249 break; 1253 1250 n++;
+3
fs/fat/fatent.c
··· 388 388 err = -ENOMEM; 389 389 goto error; 390 390 } 391 + /* Avoid race with userspace read via bdev */ 392 + lock_buffer(c_bh); 391 393 memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); 392 394 set_buffer_uptodate(c_bh); 395 + unlock_buffer(c_bh); 393 396 mark_buffer_dirty_inode(c_bh, sbi->fat_inode); 394 397 if (sb->s_flags & SB_SYNCHRONOUS) 395 398 err = sync_dirty_buffer(c_bh);
+3
fs/inode.c
··· 181 181 mapping->flags = 0; 182 182 mapping->wb_err = 0; 183 183 atomic_set(&mapping->i_mmap_writable, 0); 184 + #ifdef CONFIG_READ_ONLY_THP_FOR_FS 185 + atomic_set(&mapping->nr_thps, 0); 186 + #endif 184 187 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 185 188 mapping->private_data = NULL; 186 189 mapping->writeback_index = 0;
+1 -1
fs/io_uring.c
··· 3319 3319 } 3320 3320 3321 3321 page = virt_to_head_page(ptr); 3322 - if (sz > (PAGE_SIZE << compound_order(page))) 3322 + if (sz > page_size(page)) 3323 3323 return -EINVAL; 3324 3324 3325 3325 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
-2
fs/jbd2/journal.c
··· 89 89 EXPORT_SYMBOL(jbd2_journal_invalidatepage); 90 90 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); 91 91 EXPORT_SYMBOL(jbd2_journal_force_commit); 92 - EXPORT_SYMBOL(jbd2_journal_inode_add_write); 93 - EXPORT_SYMBOL(jbd2_journal_inode_add_wait); 94 92 EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); 95 93 EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); 96 94 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
-12
fs/jbd2/transaction.c
··· 2622 2622 return 0; 2623 2623 } 2624 2624 2625 - int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode) 2626 - { 2627 - return jbd2_journal_file_inode(handle, jinode, 2628 - JI_WRITE_DATA | JI_WAIT_DATA, 0, LLONG_MAX); 2629 - } 2630 - 2631 - int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode) 2632 - { 2633 - return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0, 2634 - LLONG_MAX); 2635 - } 2636 - 2637 2625 int jbd2_journal_inode_ranged_write(handle_t *handle, 2638 2626 struct jbd2_inode *jinode, loff_t start_byte, loff_t length) 2639 2627 {
+19 -1
fs/ocfs2/alloc.c
··· 5993 5993 struct buffer_head *data_alloc_bh = NULL; 5994 5994 struct ocfs2_dinode *di; 5995 5995 struct ocfs2_truncate_log *tl; 5996 + struct ocfs2_journal *journal = osb->journal; 5996 5997 5997 5998 BUG_ON(inode_trylock(tl_inode)); 5998 5999 ··· 6011 6010 num_to_flush); 6012 6011 if (!num_to_flush) { 6013 6012 status = 0; 6013 + goto out; 6014 + } 6015 + 6016 + /* Appending truncate log(TA) and and flushing truncate log(TF) are 6017 + * two separated transactions. They can be both committed but not 6018 + * checkpointed. If crash occurs then, both two transaction will be 6019 + * replayed with several already released to global bitmap clusters. 6020 + * Then truncate log will be replayed resulting in cluster double free. 6021 + */ 6022 + jbd2_journal_lock_updates(journal->j_journal); 6023 + status = jbd2_journal_flush(journal->j_journal); 6024 + jbd2_journal_unlock_updates(journal->j_journal); 6025 + if (status < 0) { 6026 + mlog_errno(status); 6014 6027 goto out; 6015 6028 } 6016 6029 ··· 6807 6792 struct page *page, int zero, u64 *phys) 6808 6793 { 6809 6794 int ret, partial = 0; 6795 + loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from; 6796 + loff_t length = to - from; 6810 6797 6811 6798 ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); 6812 6799 if (ret) ··· 6828 6811 if (ret < 0) 6829 6812 mlog_errno(ret); 6830 6813 else if (ocfs2_should_order_data(inode)) { 6831 - ret = ocfs2_jbd2_file_inode(handle, inode); 6814 + ret = ocfs2_jbd2_inode_add_write(handle, inode, 6815 + start_byte, length); 6832 6816 if (ret < 0) 6833 6817 mlog_errno(ret); 6834 6818 }
+10 -3
fs/ocfs2/aops.c
··· 942 942 943 943 if (tmppage && page_has_buffers(tmppage)) { 944 944 if (ocfs2_should_order_data(inode)) 945 - ocfs2_jbd2_file_inode(wc->w_handle, inode); 945 + ocfs2_jbd2_inode_add_write(wc->w_handle, inode, 946 + user_pos, user_len); 946 947 947 948 block_commit_write(tmppage, from, to); 948 949 } ··· 2024 2023 } 2025 2024 2026 2025 if (page_has_buffers(tmppage)) { 2027 - if (handle && ocfs2_should_order_data(inode)) 2028 - ocfs2_jbd2_file_inode(handle, inode); 2026 + if (handle && ocfs2_should_order_data(inode)) { 2027 + loff_t start_byte = 2028 + ((loff_t)tmppage->index << PAGE_SHIFT) + 2029 + from; 2030 + loff_t length = to - from; 2031 + ocfs2_jbd2_inode_add_write(handle, inode, 2032 + start_byte, length); 2033 + } 2029 2034 block_commit_write(tmppage, from, to); 2030 2035 } 2031 2036 }
+11 -15
fs/ocfs2/blockcheck.c
··· 231 231 } 232 232 DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); 233 233 234 - static struct dentry *blockcheck_debugfs_create(const char *name, 235 - struct dentry *parent, 236 - u64 *value) 237 - { 238 - return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value, 239 - &blockcheck_fops); 240 - } 241 - 242 234 static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) 243 235 { 244 236 if (stats) { ··· 242 250 static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, 243 251 struct dentry *parent) 244 252 { 245 - stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); 253 + struct dentry *dir; 246 254 247 - blockcheck_debugfs_create("blocks_checked", stats->b_debug_dir, 248 - &stats->b_check_count); 255 + dir = debugfs_create_dir("blockcheck", parent); 256 + stats->b_debug_dir = dir; 249 257 250 - blockcheck_debugfs_create("checksums_failed", stats->b_debug_dir, 251 - &stats->b_failure_count); 258 + debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir, 259 + &stats->b_check_count, &blockcheck_fops); 252 260 253 - blockcheck_debugfs_create("ecc_recoveries", stats->b_debug_dir, 254 - &stats->b_recover_count); 261 + debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir, 262 + &stats->b_failure_count, &blockcheck_fops); 263 + 264 + debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir, 265 + &stats->b_recover_count, &blockcheck_fops); 266 + 255 267 } 256 268 #else 257 269 static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+27 -76
fs/ocfs2/cluster/heartbeat.c
··· 225 225 unsigned int hr_region_num; 226 226 227 227 struct dentry *hr_debug_dir; 228 - struct dentry *hr_debug_livenodes; 229 - struct dentry *hr_debug_regnum; 230 - struct dentry *hr_debug_elapsed_time; 231 - struct dentry *hr_debug_pinned; 232 228 struct o2hb_debug_buf *hr_db_livenodes; 233 229 struct o2hb_debug_buf *hr_db_regnum; 234 230 struct o2hb_debug_buf *hr_db_elapsed_time; ··· 1390 1394 kfree(o2hb_db_failedregions); 1391 1395 } 1392 1396 1393 - static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, 1394 - struct o2hb_debug_buf **db, int db_len, 1395 - int type, int size, int len, void *data) 1397 + static void o2hb_debug_create(const char *name, struct dentry *dir, 1398 + struct o2hb_debug_buf **db, int db_len, int type, 1399 + int size, int len, void *data) 1396 1400 { 1397 1401 *db = kmalloc(db_len, GFP_KERNEL); 1398 1402 if (!*db) 1399 - return NULL; 1403 + return; 1400 1404 1401 1405 (*db)->db_type = type; 1402 1406 (*db)->db_size = size; 1403 1407 (*db)->db_len = len; 1404 1408 (*db)->db_data = data; 1405 1409 1406 - return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, 1407 - &o2hb_debug_fops); 1410 + debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops); 1408 1411 } 1409 1412 1410 1413 static void o2hb_debug_init(void) ··· 1520 1525 1521 1526 kfree(reg->hr_slots); 1522 1527 1523 - debugfs_remove(reg->hr_debug_livenodes); 1524 - debugfs_remove(reg->hr_debug_regnum); 1525 - debugfs_remove(reg->hr_debug_elapsed_time); 1526 - debugfs_remove(reg->hr_debug_pinned); 1527 - debugfs_remove(reg->hr_debug_dir); 1528 + debugfs_remove_recursive(reg->hr_debug_dir); 1528 1529 kfree(reg->hr_db_livenodes); 1529 1530 kfree(reg->hr_db_regnum); 1530 1531 kfree(reg->hr_db_elapsed_time); ··· 1979 1988 : NULL; 1980 1989 } 1981 1990 1982 - static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) 1991 + static void o2hb_debug_region_init(struct o2hb_region *reg, 1992 + struct dentry *parent) 1983 1993 
{ 1984 - int ret = -ENOMEM; 1994 + struct dentry *dir; 1985 1995 1986 - reg->hr_debug_dir = 1987 - debugfs_create_dir(config_item_name(&reg->hr_item), dir); 1988 - if (!reg->hr_debug_dir) { 1989 - mlog_errno(ret); 1990 - goto bail; 1991 - } 1996 + dir = debugfs_create_dir(config_item_name(&reg->hr_item), parent); 1997 + reg->hr_debug_dir = dir; 1992 1998 1993 - reg->hr_debug_livenodes = 1994 - o2hb_debug_create(O2HB_DEBUG_LIVENODES, 1995 - reg->hr_debug_dir, 1996 - &(reg->hr_db_livenodes), 1997 - sizeof(*(reg->hr_db_livenodes)), 1998 - O2HB_DB_TYPE_REGION_LIVENODES, 1999 - sizeof(reg->hr_live_node_bitmap), 2000 - O2NM_MAX_NODES, reg); 2001 - if (!reg->hr_debug_livenodes) { 2002 - mlog_errno(ret); 2003 - goto bail; 2004 - } 1999 + o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes), 2000 + sizeof(*(reg->hr_db_livenodes)), 2001 + O2HB_DB_TYPE_REGION_LIVENODES, 2002 + sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES, 2003 + reg); 2005 2004 2006 - reg->hr_debug_regnum = 2007 - o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, 2008 - reg->hr_debug_dir, 2009 - &(reg->hr_db_regnum), 2010 - sizeof(*(reg->hr_db_regnum)), 2011 - O2HB_DB_TYPE_REGION_NUMBER, 2012 - 0, O2NM_MAX_NODES, reg); 2013 - if (!reg->hr_debug_regnum) { 2014 - mlog_errno(ret); 2015 - goto bail; 2016 - } 2005 + o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum), 2006 + sizeof(*(reg->hr_db_regnum)), 2007 + O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg); 2017 2008 2018 - reg->hr_debug_elapsed_time = 2019 - o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, 2020 - reg->hr_debug_dir, 2021 - &(reg->hr_db_elapsed_time), 2022 - sizeof(*(reg->hr_db_elapsed_time)), 2023 - O2HB_DB_TYPE_REGION_ELAPSED_TIME, 2024 - 0, 0, reg); 2025 - if (!reg->hr_debug_elapsed_time) { 2026 - mlog_errno(ret); 2027 - goto bail; 2028 - } 2009 + o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir, 2010 + &(reg->hr_db_elapsed_time), 2011 + sizeof(*(reg->hr_db_elapsed_time)), 2012 + 
O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg); 2029 2013 2030 - reg->hr_debug_pinned = 2031 - o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, 2032 - reg->hr_debug_dir, 2033 - &(reg->hr_db_pinned), 2034 - sizeof(*(reg->hr_db_pinned)), 2035 - O2HB_DB_TYPE_REGION_PINNED, 2036 - 0, 0, reg); 2037 - if (!reg->hr_debug_pinned) { 2038 - mlog_errno(ret); 2039 - goto bail; 2040 - } 2014 + o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned), 2015 + sizeof(*(reg->hr_db_pinned)), 2016 + O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg); 2041 2017 2042 - ret = 0; 2043 - bail: 2044 - return ret; 2045 2018 } 2046 2019 2047 2020 static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, ··· 2061 2106 if (ret) 2062 2107 goto unregister_handler; 2063 2108 2064 - ret = o2hb_debug_region_init(reg, o2hb_debug_dir); 2065 - if (ret) { 2066 - config_item_put(&reg->hr_item); 2067 - goto unregister_handler; 2068 - } 2109 + o2hb_debug_region_init(reg, o2hb_debug_dir); 2069 2110 2070 2111 return &reg->hr_item; 2071 2112
+1 -2
fs/ocfs2/dir.c
··· 3636 3636 int i, j, num_used; 3637 3637 u32 major_hash; 3638 3638 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; 3639 - struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list; 3639 + struct ocfs2_dx_entry_list *orig_list, *tmp_list; 3640 3640 struct ocfs2_dx_entry *dx_entry; 3641 3641 3642 3642 tmp_list = &tmp_dx_leaf->dl_list; ··· 3645 3645 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; 3646 3646 orig_list = &orig_dx_leaf->dl_list; 3647 3647 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; 3648 - new_list = &new_dx_leaf->dl_list; 3649 3648 3650 3649 num_used = le16_to_cpu(orig_list->de_num_used); 3651 3650
-1
fs/ocfs2/dlm/dlmcommon.h
··· 142 142 atomic_t res_tot_count; 143 143 atomic_t res_cur_count; 144 144 145 - struct dlm_debug_ctxt *dlm_debug_ctxt; 146 145 struct dentry *dlm_debugfs_subroot; 147 146 148 147 /* NOTE: Next three are protected by dlm_domain_lock */
+11 -44
fs/ocfs2/dlm/dlmdebug.c
··· 853 853 /* files in subroot */ 854 854 void dlm_debug_init(struct dlm_ctxt *dlm) 855 855 { 856 - struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; 857 - 858 856 /* for dumping dlm_ctxt */ 859 - dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, 860 - S_IFREG|S_IRUSR, 861 - dlm->dlm_debugfs_subroot, 862 - dlm, &debug_state_fops); 857 + debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR, 858 + dlm->dlm_debugfs_subroot, dlm, &debug_state_fops); 863 859 864 860 /* for dumping lockres */ 865 - dc->debug_lockres_dentry = 866 - debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, 867 - S_IFREG|S_IRUSR, 868 - dlm->dlm_debugfs_subroot, 869 - dlm, &debug_lockres_fops); 861 + debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, S_IFREG|S_IRUSR, 862 + dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops); 870 863 871 864 /* for dumping mles */ 872 - dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, 873 - S_IFREG|S_IRUSR, 874 - dlm->dlm_debugfs_subroot, 875 - dlm, &debug_mle_fops); 865 + debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR, 866 + dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops); 876 867 877 868 /* for dumping lockres on the purge list */ 878 - dc->debug_purgelist_dentry = 879 - debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, 880 - S_IFREG|S_IRUSR, 881 - dlm->dlm_debugfs_subroot, 882 - dlm, &debug_purgelist_fops); 883 - } 884 - 885 - void dlm_debug_shutdown(struct dlm_ctxt *dlm) 886 - { 887 - struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; 888 - 889 - if (dc) { 890 - debugfs_remove(dc->debug_purgelist_dentry); 891 - debugfs_remove(dc->debug_mle_dentry); 892 - debugfs_remove(dc->debug_lockres_dentry); 893 - debugfs_remove(dc->debug_state_dentry); 894 - kfree(dc); 895 - dc = NULL; 896 - } 869 + debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR, 870 + dlm->dlm_debugfs_subroot, dlm, 871 + &debug_purgelist_fops); 897 872 } 898 873 899 874 /* subroot - domain dir */ 900 - int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) 
875 + void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) 901 876 { 902 - dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), 903 - GFP_KERNEL); 904 - if (!dlm->dlm_debug_ctxt) { 905 - mlog_errno(-ENOMEM); 906 - return -ENOMEM; 907 - } 908 - 909 877 dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, 910 878 dlm_debugfs_root); 911 - return 0; 912 879 } 913 880 914 881 void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) 915 882 { 916 - debugfs_remove(dlm->dlm_debugfs_subroot); 883 + debugfs_remove_recursive(dlm->dlm_debugfs_subroot); 917 884 } 918 885 919 886 /* debugfs root */
+2 -14
fs/ocfs2/dlm/dlmdebug.h
··· 14 14 15 15 #ifdef CONFIG_DEBUG_FS 16 16 17 - struct dlm_debug_ctxt { 18 - struct dentry *debug_state_dentry; 19 - struct dentry *debug_lockres_dentry; 20 - struct dentry *debug_mle_dentry; 21 - struct dentry *debug_purgelist_dentry; 22 - }; 23 - 24 17 struct debug_lockres { 25 18 int dl_len; 26 19 char *dl_buf; ··· 22 29 }; 23 30 24 31 void dlm_debug_init(struct dlm_ctxt *dlm); 25 - void dlm_debug_shutdown(struct dlm_ctxt *dlm); 26 32 27 - int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); 33 + void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); 28 34 void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); 29 35 30 36 void dlm_create_debugfs_root(void); ··· 34 42 static inline void dlm_debug_init(struct dlm_ctxt *dlm) 35 43 { 36 44 } 37 - static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) 45 + static inline void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) 38 46 { 39 - } 40 - static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) 41 - { 42 - return 0; 43 47 } 44 48 static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) 45 49 {
+2 -5
fs/ocfs2/dlm/dlmdomain.c
··· 387 387 static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) 388 388 { 389 389 dlm_unregister_domain_handlers(dlm); 390 - dlm_debug_shutdown(dlm); 391 390 dlm_complete_thread(dlm); 392 391 dlm_complete_recovery_thread(dlm); 393 392 dlm_destroy_dlm_worker(dlm); ··· 1937 1938 1938 1939 if (status) { 1939 1940 dlm_unregister_domain_handlers(dlm); 1940 - dlm_debug_shutdown(dlm); 1941 1941 dlm_complete_thread(dlm); 1942 1942 dlm_complete_recovery_thread(dlm); 1943 1943 dlm_destroy_dlm_worker(dlm); ··· 1990 1992 dlm->key = key; 1991 1993 dlm->node_num = o2nm_this_node(); 1992 1994 1993 - ret = dlm_create_debugfs_subroot(dlm); 1994 - if (ret < 0) 1995 - goto leave; 1995 + dlm_create_debugfs_subroot(dlm); 1996 1996 1997 1997 spin_lock_init(&dlm->spinlock); 1998 1998 spin_lock_init(&dlm->master_lock); ··· 2052 2056 mlog(0, "context init: refcount %u\n", 2053 2057 kref_read(&dlm->dlm_refs)); 2054 2058 2059 + ret = 0; 2055 2060 leave: 2056 2061 if (ret < 0 && dlm) { 2057 2062 if (dlm->master_hash)
+19 -4
fs/ocfs2/dlm/dlmunlock.c
··· 90 90 enum dlm_status status; 91 91 int actions = 0; 92 92 int in_use; 93 - u8 owner; 93 + u8 owner; 94 + int recovery_wait = 0; 94 95 95 96 mlog(0, "master_node = %d, valblk = %d\n", master_node, 96 97 flags & LKM_VALBLK); ··· 194 193 } 195 194 if (flags & LKM_CANCEL) 196 195 lock->cancel_pending = 0; 197 - else 198 - lock->unlock_pending = 0; 199 - 196 + else { 197 + if (!lock->unlock_pending) 198 + recovery_wait = 1; 199 + else 200 + lock->unlock_pending = 0; 201 + } 200 202 } 201 203 202 204 /* get an extra ref on lock. if we are just switching ··· 232 228 spin_unlock(&lock->spinlock); 233 229 spin_unlock(&res->spinlock); 234 230 wake_up(&res->wq); 231 + 232 + if (recovery_wait) { 233 + spin_lock(&res->spinlock); 234 + /* Unlock request will directly succeed after owner dies, 235 + * and the lock is already removed from grant list. We have to 236 + * wait for RECOVERING done or we miss the chance to purge it 237 + * since the removement is much faster than RECOVERING proc. 238 + */ 239 + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING); 240 + spin_unlock(&res->spinlock); 241 + } 235 242 236 243 /* let the caller's final dlm_lock_put handle the actual kfree */ 237 244 if (actions & DLM_UNLOCK_FREE_LOCK) {
+7 -20
fs/ocfs2/dlmglue.c
··· 2508 2508 ocfs2_inode_unlock(inode, ex); 2509 2509 } 2510 2510 2511 - if (local_bh) 2512 - brelse(local_bh); 2513 - 2511 + brelse(local_bh); 2514 2512 return status; 2515 2513 } 2516 2514 ··· 2591 2593 *level = 1; 2592 2594 if (ocfs2_should_update_atime(inode, vfsmnt)) 2593 2595 ocfs2_update_inode_atime(inode, bh); 2594 - if (bh) 2595 - brelse(bh); 2596 + brelse(bh); 2596 2597 } else 2597 2598 *level = 0; 2598 2599 ··· 3009 3012 3010 3013 kref_init(&dlm_debug->d_refcnt); 3011 3014 INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); 3012 - dlm_debug->d_locking_state = NULL; 3013 - dlm_debug->d_locking_filter = NULL; 3014 3015 dlm_debug->d_filter_secs = 0; 3015 3016 out: 3016 3017 return dlm_debug; ··· 3277 3282 { 3278 3283 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; 3279 3284 3280 - dlm_debug->d_locking_state = debugfs_create_file("locking_state", 3281 - S_IFREG|S_IRUSR, 3282 - osb->osb_debug_root, 3283 - osb, 3284 - &ocfs2_dlm_debug_fops); 3285 + debugfs_create_file("locking_state", S_IFREG|S_IRUSR, 3286 + osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops); 3285 3287 3286 - dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter", 3287 - 0600, 3288 - osb->osb_debug_root, 3289 - &dlm_debug->d_filter_secs); 3288 + debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root, 3289 + &dlm_debug->d_filter_secs); 3290 3290 } 3291 3291 3292 3292 static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) 3293 3293 { 3294 3294 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; 3295 3295 3296 - if (dlm_debug) { 3297 - debugfs_remove(dlm_debug->d_locking_state); 3298 - debugfs_remove(dlm_debug->d_locking_filter); 3296 + if (dlm_debug) 3299 3297 ocfs2_put_dlm_debug(dlm_debug); 3300 - } 3301 3298 } 3302 3299 3303 3300 int ocfs2_dlm_init(struct ocfs2_super *osb)
+1 -2
fs/ocfs2/extent_map.c
··· 590 590 *extent_flags = rec->e_flags; 591 591 } 592 592 out: 593 - if (eb_bh) 594 - brelse(eb_bh); 593 + brelse(eb_bh); 595 594 return ret; 596 595 } 597 596
+7 -6
fs/ocfs2/file.c
··· 706 706 * Thus, we need to explicitly order the zeroed pages. 707 707 */ 708 708 static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, 709 - struct buffer_head *di_bh) 709 + struct buffer_head *di_bh, 710 + loff_t start_byte, 711 + loff_t length) 710 712 { 711 713 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 712 714 handle_t *handle = NULL; ··· 724 722 goto out; 725 723 } 726 724 727 - ret = ocfs2_jbd2_file_inode(handle, inode); 725 + ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length); 728 726 if (ret < 0) { 729 727 mlog_errno(ret); 730 728 goto out; ··· 763 761 BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT)); 764 762 BUG_ON(abs_from & (inode->i_blkbits - 1)); 765 763 766 - handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); 764 + handle = ocfs2_zero_start_ordered_transaction(inode, di_bh, 765 + abs_from, 766 + abs_to - abs_from); 767 767 if (IS_ERR(handle)) { 768 768 ret = PTR_ERR(handle); 769 769 goto out; ··· 2130 2126 struct dentry *dentry = file->f_path.dentry; 2131 2127 struct inode *inode = d_inode(dentry); 2132 2128 struct buffer_head *di_bh = NULL; 2133 - loff_t end; 2134 2129 2135 2130 /* 2136 2131 * We start with a read level meta lock and only jump to an ex ··· 2192 2189 goto out_unlock; 2193 2190 } 2194 2191 } 2195 - 2196 - end = pos + count; 2197 2192 2198 2193 ret = ocfs2_check_range_for_refcount(inode, pos, count); 2199 2194 if (ret == 1) {
+1 -1
fs/ocfs2/inode.c
··· 534 534 */ 535 535 mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != 536 536 !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), 537 - "Inode %llu: system file state is ambigous\n", 537 + "Inode %llu: system file state is ambiguous\n", 538 538 (unsigned long long)args->fi_blkno); 539 539 540 540 if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
+8 -34
fs/ocfs2/journal.h
··· 144 144 void ocfs2_orphan_scan_init(struct ocfs2_super *osb); 145 145 void ocfs2_orphan_scan_start(struct ocfs2_super *osb); 146 146 void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); 147 - void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); 148 147 149 148 void ocfs2_complete_recovery(struct work_struct *work); 150 149 void ocfs2_wait_for_recovery(struct ocfs2_super *osb); ··· 231 232 * ocfs2_journal_access_*() unless you intend to 232 233 * manage the checksum by hand. 233 234 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. 234 - * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before 235 - * the current handle commits. 235 + * ocfs2_jbd2_inode_add_write - Mark an inode with range so that its data goes 236 + * out before the current handle commits. 236 237 */ 237 238 238 239 /* You must always start_trans with a number of buffs > 0, but it's ··· 440 441 * previous dirblock update in the free list */ 441 442 static inline int ocfs2_link_credits(struct super_block *sb) 442 443 { 443 - return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + 444 + return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 + 444 445 ocfs2_quota_trans_credits(sb); 445 446 } 446 447 ··· 574 575 return ocfs2_extent_recs_per_gd(sb); 575 576 } 576 577 577 - static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, 578 - unsigned int clusters_to_del, 579 - struct ocfs2_dinode *fe, 580 - struct ocfs2_extent_list *last_el) 578 + static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode, 579 + loff_t start_byte, loff_t length) 581 580 { 582 - /* for dinode + all headers in this pass + update to next leaf */ 583 - u16 next_free = le16_to_cpu(last_el->l_next_free_rec); 584 - u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); 585 - int credits = 1 + tree_depth + 1; 586 - int i; 587 - 588 - i = next_free - 1; 589 - BUG_ON(i < 0); 590 - 591 - /* We may be deleting metadata blocks, so metadata alloc dinode + 592 - one desc. 
block for each possible delete. */ 593 - if (tree_depth && next_free == 1 && 594 - ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) 595 - credits += 1 + tree_depth; 596 - 597 - /* update to the truncate log. */ 598 - credits += OCFS2_TRUNCATE_LOG_UPDATE; 599 - 600 - credits += ocfs2_quota_trans_credits(sb); 601 - 602 - return credits; 603 - } 604 - 605 - static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) 606 - { 607 - return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode); 581 + return jbd2_journal_inode_ranged_write(handle, 582 + &OCFS2_I(inode)->ip_jinode, 583 + start_byte, length); 608 584 } 609 585 610 586 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
-2
fs/ocfs2/namei.c
··· 2486 2486 struct inode *inode = NULL; 2487 2487 struct inode *orphan_dir = NULL; 2488 2488 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 2489 - struct ocfs2_dinode *di = NULL; 2490 2489 handle_t *handle = NULL; 2491 2490 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; 2492 2491 struct buffer_head *parent_di_bh = NULL; ··· 2551 2552 goto leave; 2552 2553 } 2553 2554 2554 - di = (struct ocfs2_dinode *)new_di_bh->b_data; 2555 2555 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, 2556 2556 &orphan_insert, orphan_dir, false); 2557 2557 if (status < 0) {
-3
fs/ocfs2/ocfs2.h
··· 223 223 224 224 struct ocfs2_dlm_debug { 225 225 struct kref d_refcnt; 226 - struct dentry *d_locking_state; 227 - struct dentry *d_locking_filter; 228 226 u32 d_filter_secs; 229 227 struct list_head d_lockres_tracking; 230 228 }; ··· 399 401 struct ocfs2_dlm_debug *osb_dlm_debug; 400 402 401 403 struct dentry *osb_debug_root; 402 - struct dentry *osb_ctxt; 403 404 404 405 wait_queue_head_t recovery_event; 405 406
+3 -7
fs/ocfs2/super.c
··· 1080 1080 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, 1081 1081 ocfs2_debugfs_root); 1082 1082 1083 - osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, 1084 - osb->osb_debug_root, 1085 - osb, 1086 - &ocfs2_osb_debug_fops); 1083 + debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root, 1084 + osb, &ocfs2_osb_debug_fops); 1087 1085 1088 1086 if (ocfs2_meta_ecc(osb)) 1089 1087 ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, ··· 1859 1861 1860 1862 kset_unregister(osb->osb_dev_kset); 1861 1863 1862 - debugfs_remove(osb->osb_ctxt); 1863 - 1864 1864 /* Orphan scan should be stopped as early as possible */ 1865 1865 ocfs2_orphan_scan_stop(osb); 1866 1866 ··· 1914 1918 ocfs2_dlm_shutdown(osb, hangup_needed); 1915 1919 1916 1920 ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); 1917 - debugfs_remove(osb->osb_debug_root); 1921 + debugfs_remove_recursive(osb->osb_debug_root); 1918 1922 1919 1923 if (hangup_needed) 1920 1924 ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
+8
fs/open.c
··· 818 818 if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) 819 819 return -EINVAL; 820 820 } 821 + 822 + /* 823 + * XXX: Huge page cache doesn't support writing yet. Drop all page 824 + * cache for this file before processing writes. 825 + */ 826 + if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping)) 827 + truncate_pagecache(inode, 0); 828 + 821 829 return 0; 822 830 823 831 cleanup_all:
+4 -4
fs/proc/meminfo.c
··· 8 8 #include <linux/mmzone.h> 9 9 #include <linux/proc_fs.h> 10 10 #include <linux/percpu.h> 11 - #include <linux/quicklist.h> 12 11 #include <linux/seq_file.h> 13 12 #include <linux/swap.h> 14 13 #include <linux/vmstat.h> ··· 105 106 global_zone_page_state(NR_KERNEL_STACK_KB)); 106 107 show_val_kb(m, "PageTables: ", 107 108 global_zone_page_state(NR_PAGETABLE)); 108 - #ifdef CONFIG_QUICKLIST 109 - show_val_kb(m, "Quicklists: ", quicklist_total_size()); 110 - #endif 111 109 112 110 show_val_kb(m, "NFS_Unstable: ", 113 111 global_node_page_state(NR_UNSTABLE_NFS)); ··· 132 136 global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); 133 137 show_val_kb(m, "ShmemPmdMapped: ", 134 138 global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); 139 + show_val_kb(m, "FileHugePages: ", 140 + global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR); 141 + show_val_kb(m, "FilePmdMapped: ", 142 + global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); 135 143 #endif 136 144 137 145 #ifdef CONFIG_CMA
+4 -2
fs/proc/task_mmu.c
··· 417 417 unsigned long lazyfree; 418 418 unsigned long anonymous_thp; 419 419 unsigned long shmem_thp; 420 + unsigned long file_thp; 420 421 unsigned long swap; 421 422 unsigned long shared_hugetlb; 422 423 unsigned long private_hugetlb; ··· 462 461 static void smaps_account(struct mem_size_stats *mss, struct page *page, 463 462 bool compound, bool young, bool dirty, bool locked) 464 463 { 465 - int i, nr = compound ? 1 << compound_order(page) : 1; 464 + int i, nr = compound ? compound_nr(page) : 1; 466 465 unsigned long size = nr * PAGE_SIZE; 467 466 468 467 /* ··· 589 588 else if (is_zone_device_page(page)) 590 589 /* pass */; 591 590 else 592 - VM_BUG_ON_PAGE(1, page); 591 + mss->file_thp += HPAGE_PMD_SIZE; 593 592 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); 594 593 } 595 594 #else ··· 810 809 SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); 811 810 SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); 812 811 SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); 812 + SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); 813 813 SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); 814 814 seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", 815 815 mss->private_hugetlb >> 10, 7);
-5
include/asm-generic/pgalloc.h
··· 102 102 __free_page(pte_page); 103 103 } 104 104 105 - #else /* CONFIG_MMU */ 106 - 107 - /* This is enough for a nommu architecture */ 108 - #define check_pgt_cache() do { } while (0) 109 - 110 105 #endif /* CONFIG_MMU */ 111 106 112 107 #endif /* __ASM_GENERIC_PGALLOC_H */
+3 -4
include/asm-generic/pgtable.h
··· 1002 1002 * need this). If THP is not enabled, the pmd can't go away under the 1003 1003 * code even if MADV_DONTNEED runs, but if THP is enabled we need to 1004 1004 * run a pmd_trans_unstable before walking the ptes after 1005 - * split_huge_page_pmd returns (because it may have run when the pmd 1006 - * become null, but then a page fault can map in a THP and not a 1007 - * regular page). 1005 + * split_huge_pmd returns (because it may have run when the pmd become 1006 + * null, but then a page fault can map in a THP and not a regular page). 1008 1007 */ 1009 1008 static inline int pmd_trans_unstable(pmd_t *pmd) 1010 1009 { ··· 1125 1126 static inline void init_espfix_bsp(void) { } 1126 1127 #endif 1127 1128 1128 - extern void __init pgd_cache_init(void); 1129 + extern void __init pgtable_cache_init(void); 1129 1130 1130 1131 #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED 1131 1132 static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
+17 -5
include/linux/compaction.h
··· 129 129 return false; 130 130 } 131 131 132 - /* 133 - * Compaction has backed off for some reason. It might be throttling or 134 - * lock contention. Retrying is still worthwhile. 135 - */ 136 - static inline bool compaction_withdrawn(enum compact_result result) 132 + /* Compaction needs reclaim to be performed first, so it can continue. */ 133 + static inline bool compaction_needs_reclaim(enum compact_result result) 137 134 { 138 135 /* 139 136 * Compaction backed off due to watermark checks for order-0 ··· 139 142 if (result == COMPACT_SKIPPED) 140 143 return true; 141 144 145 + return false; 146 + } 147 + 148 + /* 149 + * Compaction has backed off for some reason after doing some work or none 150 + * at all. It might be throttling or lock contention. Retrying might be still 151 + * worthwhile, but with a higher priority if allowed. 152 + */ 153 + static inline bool compaction_withdrawn(enum compact_result result) 154 + { 142 155 /* 143 156 * If compaction is deferred for high-order allocations, it is 144 157 * because sync compaction recently failed. If this is the case ··· 210 203 } 211 204 212 205 static inline bool compaction_failed(enum compact_result result) 206 + { 207 + return false; 208 + } 209 + 210 + static inline bool compaction_needs_reclaim(enum compact_result result) 213 211 { 214 212 return false; 215 213 }
+32
include/linux/fs.h
··· 429 429 * @i_pages: Cached pages. 430 430 * @gfp_mask: Memory allocation flags to use for allocating pages. 431 431 * @i_mmap_writable: Number of VM_SHARED mappings. 432 + * @nr_thps: Number of THPs in the pagecache (non-shmem only). 432 433 * @i_mmap: Tree of private and shared mappings. 433 434 * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. 434 435 * @nrpages: Number of page entries, protected by the i_pages lock. ··· 447 446 struct xarray i_pages; 448 447 gfp_t gfp_mask; 449 448 atomic_t i_mmap_writable; 449 + #ifdef CONFIG_READ_ONLY_THP_FOR_FS 450 + /* number of thp, only for non-shmem files */ 451 + atomic_t nr_thps; 452 + #endif 450 453 struct rb_root_cached i_mmap; 451 454 struct rw_semaphore i_mmap_rwsem; 452 455 unsigned long nrpages; ··· 2801 2796 static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) 2802 2797 { 2803 2798 return errseq_sample(&mapping->wb_err); 2799 + } 2800 + 2801 + static inline int filemap_nr_thps(struct address_space *mapping) 2802 + { 2803 + #ifdef CONFIG_READ_ONLY_THP_FOR_FS 2804 + return atomic_read(&mapping->nr_thps); 2805 + #else 2806 + return 0; 2807 + #endif 2808 + } 2809 + 2810 + static inline void filemap_nr_thps_inc(struct address_space *mapping) 2811 + { 2812 + #ifdef CONFIG_READ_ONLY_THP_FOR_FS 2813 + atomic_inc(&mapping->nr_thps); 2814 + #else 2815 + WARN_ON_ONCE(1); 2816 + #endif 2817 + } 2818 + 2819 + static inline void filemap_nr_thps_dec(struct address_space *mapping) 2820 + { 2821 + #ifdef CONFIG_READ_ONLY_THP_FOR_FS 2822 + atomic_dec(&mapping->nr_thps); 2823 + #else 2824 + WARN_ON_ONCE(1); 2825 + #endif 2804 2826 } 2805 2827 2806 2828 extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
+9
include/linux/huge_mm.h
··· 267 267 return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); 268 268 } 269 269 270 + static inline struct list_head *page_deferred_list(struct page *page) 271 + { 272 + /* 273 + * Global or memcg deferred list in the second tail pages is 274 + * occupied by compound_head. 275 + */ 276 + return &page[2].deferred_list; 277 + } 278 + 270 279 #else /* CONFIG_TRANSPARENT_HUGEPAGE */ 271 280 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) 272 281 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
+1 -1
include/linux/hugetlb.h
··· 454 454 static inline struct hstate *page_hstate(struct page *page) 455 455 { 456 456 VM_BUG_ON_PAGE(!PageHuge(page), page); 457 - return size_to_hstate(PAGE_SIZE << compound_order(page)); 457 + return size_to_hstate(page_size(page)); 458 458 } 459 459 460 460 static inline unsigned hstate_index_to_shift(unsigned index)
-2
include/linux/jbd2.h
··· 1410 1410 extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); 1411 1411 extern int jbd2_journal_force_commit(journal_t *); 1412 1412 extern int jbd2_journal_force_commit_nested(journal_t *); 1413 - extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode); 1414 - extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode); 1415 1413 extern int jbd2_journal_inode_ranged_write(handle_t *handle, 1416 1414 struct jbd2_inode *inode, loff_t start_byte, 1417 1415 loff_t length);
+12
include/linux/khugepaged.h
··· 15 15 extern void __khugepaged_exit(struct mm_struct *mm); 16 16 extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, 17 17 unsigned long vm_flags); 18 + #ifdef CONFIG_SHMEM 19 + extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); 20 + #else 21 + static inline void collapse_pte_mapped_thp(struct mm_struct *mm, 22 + unsigned long addr) 23 + { 24 + } 25 + #endif 18 26 19 27 #define khugepaged_enabled() \ 20 28 (transparent_hugepage_flags & \ ··· 80 72 unsigned long vm_flags) 81 73 { 82 74 return 0; 75 + } 76 + static inline void collapse_pte_mapped_thp(struct mm_struct *mm, 77 + unsigned long addr) 78 + { 83 79 } 84 80 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 85 81
+15 -8
include/linux/memcontrol.h
··· 128 128 129 129 struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; 130 130 131 - #ifdef CONFIG_MEMCG_KMEM 132 131 struct memcg_shrinker_map __rcu *shrinker_map; 133 - #endif 132 + 134 133 struct rb_node tree_node; /* RB tree node */ 135 134 unsigned long usage_in_excess;/* Set to the value by which */ 136 135 /* the soft limit is exceeded*/ ··· 329 330 /* List of events which userspace want to receive */ 330 331 struct list_head event_list; 331 332 spinlock_t event_list_lock; 333 + 334 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 335 + struct deferred_split deferred_split_queue; 336 + #endif 332 337 333 338 struct mem_cgroup_per_node *nodeinfo[0]; 334 339 /* WARNING: nodeinfo must be the last member here */ ··· 1314 1311 } while ((memcg = parent_mem_cgroup(memcg))); 1315 1312 return false; 1316 1313 } 1314 + 1315 + extern int memcg_expand_shrinker_maps(int new_id); 1316 + 1317 + extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, 1318 + int nid, int shrinker_id); 1317 1319 #else 1318 1320 #define mem_cgroup_sockets_enabled 0 1319 1321 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; ··· 1326 1318 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) 1327 1319 { 1328 1320 return false; 1321 + } 1322 + 1323 + static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, 1324 + int nid, int shrinker_id) 1325 + { 1329 1326 } 1330 1327 #endif 1331 1328 ··· 1403 1390 return memcg ? 
memcg->kmemcg_id : -1; 1404 1391 } 1405 1392 1406 - extern int memcg_expand_shrinker_maps(int new_id); 1407 - 1408 - extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, 1409 - int nid, int shrinker_id); 1410 1393 #else 1411 1394 1412 1395 static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) ··· 1444 1435 { 1445 1436 } 1446 1437 1447 - static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, 1448 - int nid, int shrinker_id) { } 1449 1438 #endif /* CONFIG_MEMCG_KMEM */ 1450 1439 1451 1440 #endif /* _LINUX_MEMCONTROL_H */
+3 -4
include/linux/memory.h
··· 25 25 26 26 struct memory_block { 27 27 unsigned long start_section_nr; 28 - unsigned long end_section_nr; 29 28 unsigned long state; /* serialized by the dev->lock */ 30 29 int section_count; /* serialized by mem_sysfs_mutex */ 31 30 int online_type; /* for passing data to online routine */ ··· 79 80 #define IPC_CALLBACK_PRI 10 80 81 81 82 #ifndef CONFIG_MEMORY_HOTPLUG_SPARSE 82 - static inline int memory_dev_init(void) 83 + static inline void memory_dev_init(void) 83 84 { 84 - return 0; 85 + return; 85 86 } 86 87 static inline int register_memory_notifier(struct notifier_block *nb) 87 88 { ··· 112 113 extern void unregister_memory_isolate_notifier(struct notifier_block *nb); 113 114 int create_memory_block_devices(unsigned long start, unsigned long size); 114 115 void remove_memory_block_devices(unsigned long start, unsigned long size); 115 - extern int memory_dev_init(void); 116 + extern void memory_dev_init(void); 116 117 extern int memory_notify(unsigned long val, void *v); 117 118 extern int memory_isolate_notify(unsigned long val, void *v); 118 119 extern struct memory_block *find_memory_block(struct mem_section *);
+35 -2
include/linux/mm.h
··· 805 805 page[1].compound_order = order; 806 806 } 807 807 808 + /* Returns the number of pages in this potentially compound page. */ 809 + static inline unsigned long compound_nr(struct page *page) 810 + { 811 + return 1UL << compound_order(page); 812 + } 813 + 814 + /* Returns the number of bytes in this potentially compound page. */ 815 + static inline unsigned long page_size(struct page *page) 816 + { 817 + return PAGE_SIZE << compound_order(page); 818 + } 819 + 820 + /* Returns the number of bits needed for the number of bytes in a page */ 821 + static inline unsigned int page_shift(struct page *page) 822 + { 823 + return PAGE_SHIFT + compound_order(page); 824 + } 825 + 808 826 void free_compound_page(struct page *page); 809 827 810 828 #ifdef CONFIG_MMU ··· 1075 1057 put_page(page); 1076 1058 } 1077 1059 1078 - void put_user_pages_dirty(struct page **pages, unsigned long npages); 1079 - void put_user_pages_dirty_lock(struct page **pages, unsigned long npages); 1060 + void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, 1061 + bool make_dirty); 1062 + 1080 1063 void put_user_pages(struct page **pages, unsigned long npages); 1081 1064 1082 1065 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) ··· 1424 1405 1425 1406 extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); 1426 1407 1408 + #ifdef CONFIG_MMU 1427 1409 extern bool can_do_mlock(void); 1410 + #else 1411 + static inline bool can_do_mlock(void) { return false; } 1412 + #endif 1428 1413 extern int user_shm_lock(size_t, struct user_struct *); 1429 1414 extern void user_shm_unlock(size_t, struct user_struct *); 1430 1415 ··· 2328 2305 unsigned long addr, unsigned long len, 2329 2306 unsigned long flags, struct page **pages); 2330 2307 2308 + unsigned long randomize_stack_top(unsigned long stack_top); 2309 + 2331 2310 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 2332 2311 2333 
2312 extern unsigned long mmap_region(struct file *file, unsigned long addr, ··· 2593 2568 #define FOLL_COW 0x4000 /* internal GUP flag */ 2594 2569 #define FOLL_ANON 0x8000 /* don't do file mappings */ 2595 2570 #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ 2571 + #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ 2596 2572 2597 2573 /* 2598 2574 * NOTE on FOLL_LONGTERM: ··· 2870 2844 #else 2871 2845 static inline void setup_nr_node_ids(void) {} 2872 2846 #endif 2847 + 2848 + extern int memcmp_pages(struct page *page1, struct page *page2); 2849 + 2850 + static inline int pages_identical(struct page *page1, struct page *page2) 2851 + { 2852 + return !memcmp_pages(page1, page2); 2853 + } 2873 2854 2874 2855 #endif /* __KERNEL__ */ 2875 2856 #endif /* _LINUX_MM_H */
+1
include/linux/mm_types.h
··· 138 138 struct { /* Second tail page of compound page */ 139 139 unsigned long _compound_pad_1; /* compound_head */ 140 140 unsigned long _compound_pad_2; 141 + /* For both global and memcg */ 141 142 struct list_head deferred_list; 142 143 }; 143 144 struct { /* Page table pages */
+11 -3
include/linux/mmzone.h
··· 235 235 NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ 236 236 NR_SHMEM_THPS, 237 237 NR_SHMEM_PMDMAPPED, 238 + NR_FILE_THPS, 239 + NR_FILE_PMDMAPPED, 238 240 NR_ANON_THPS, 239 241 NR_UNSTABLE_NFS, /* NFS unstable pages */ 240 242 NR_VMSCAN_WRITE, ··· 679 677 extern struct page *mem_map; 680 678 #endif 681 679 680 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 681 + struct deferred_split { 682 + spinlock_t split_queue_lock; 683 + struct list_head split_queue; 684 + unsigned long split_queue_len; 685 + }; 686 + #endif 687 + 682 688 /* 683 689 * On NUMA machines, each NUMA node would have a pg_data_t to describe 684 690 * it's memory layout. On UMA machines there is a single pglist_data which ··· 766 756 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 767 757 768 758 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 769 - spinlock_t split_queue_lock; 770 - struct list_head split_queue; 771 - unsigned long split_queue_len; 759 + struct deferred_split deferred_split_queue; 772 760 #endif 773 761 774 762 /* Fields commonly accessed by the page reclaim scanner */
+1
include/linux/page_ext.h
··· 18 18 19 19 enum page_ext_flags { 20 20 PAGE_EXT_OWNER, 21 + PAGE_EXT_OWNER_ACTIVE, 21 22 #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) 22 23 PAGE_EXT_YOUNG, 23 24 PAGE_EXT_IDLE,
+10
include/linux/pagemap.h
··· 333 333 mapping_gfp_mask(mapping)); 334 334 } 335 335 336 + static inline struct page *find_subpage(struct page *page, pgoff_t offset) 337 + { 338 + if (PageHuge(page)) 339 + return page; 340 + 341 + VM_BUG_ON_PAGE(PageTail(page), page); 342 + 343 + return page + (offset & (compound_nr(page) - 1)); 344 + } 345 + 336 346 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); 337 347 struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); 338 348 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
-94
include/linux/quicklist.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0 */ 2 - #ifndef LINUX_QUICKLIST_H 3 - #define LINUX_QUICKLIST_H 4 - /* 5 - * Fast allocations and disposal of pages. Pages must be in the condition 6 - * as needed after allocation when they are freed. Per cpu lists of pages 7 - * are kept that only contain node local pages. 8 - * 9 - * (C) 2007, SGI. Christoph Lameter <cl@linux.com> 10 - */ 11 - #include <linux/kernel.h> 12 - #include <linux/gfp.h> 13 - #include <linux/percpu.h> 14 - 15 - #ifdef CONFIG_QUICKLIST 16 - 17 - struct quicklist { 18 - void *page; 19 - int nr_pages; 20 - }; 21 - 22 - DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; 23 - 24 - /* 25 - * The two key functions quicklist_alloc and quicklist_free are inline so 26 - * that they may be custom compiled for the platform. 27 - * Specifying a NULL ctor can remove constructor support. Specifying 28 - * a constant quicklist allows the determination of the exact address 29 - * in the per cpu area. 30 - * 31 - * The fast patch in quicklist_alloc touched only a per cpu cacheline and 32 - * the first cacheline of the page itself. There is minmal overhead involved. 
33 - */ 34 - static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *)) 35 - { 36 - struct quicklist *q; 37 - void **p = NULL; 38 - 39 - q =&get_cpu_var(quicklist)[nr]; 40 - p = q->page; 41 - if (likely(p)) { 42 - q->page = p[0]; 43 - p[0] = NULL; 44 - q->nr_pages--; 45 - } 46 - put_cpu_var(quicklist); 47 - if (likely(p)) 48 - return p; 49 - 50 - p = (void *)__get_free_page(flags | __GFP_ZERO); 51 - if (ctor && p) 52 - ctor(p); 53 - return p; 54 - } 55 - 56 - static inline void __quicklist_free(int nr, void (*dtor)(void *), void *p, 57 - struct page *page) 58 - { 59 - struct quicklist *q; 60 - 61 - q = &get_cpu_var(quicklist)[nr]; 62 - *(void **)p = q->page; 63 - q->page = p; 64 - q->nr_pages++; 65 - put_cpu_var(quicklist); 66 - } 67 - 68 - static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp) 69 - { 70 - __quicklist_free(nr, dtor, pp, virt_to_page(pp)); 71 - } 72 - 73 - static inline void quicklist_free_page(int nr, void (*dtor)(void *), 74 - struct page *page) 75 - { 76 - __quicklist_free(nr, dtor, page_address(page), page); 77 - } 78 - 79 - void quicklist_trim(int nr, void (*dtor)(void *), 80 - unsigned long min_pages, unsigned long max_free); 81 - 82 - unsigned long quicklist_total_size(void); 83 - 84 - #else 85 - 86 - static inline unsigned long quicklist_total_size(void) 87 - { 88 - return 0; 89 - } 90 - 91 - #endif 92 - 93 - #endif /* LINUX_QUICKLIST_H */ 94 -
+6 -1
include/linux/shrinker.h
··· 69 69 70 70 /* These are for internal use */ 71 71 struct list_head list; 72 - #ifdef CONFIG_MEMCG_KMEM 72 + #ifdef CONFIG_MEMCG 73 73 /* ID in shrinker_idr */ 74 74 int id; 75 75 #endif ··· 81 81 /* Flags */ 82 82 #define SHRINKER_NUMA_AWARE (1 << 0) 83 83 #define SHRINKER_MEMCG_AWARE (1 << 1) 84 + /* 85 + * It just makes sense when the shrinker is also MEMCG_AWARE for now, 86 + * non-MEMCG_AWARE shrinker should not have this flag set. 87 + */ 88 + #define SHRINKER_NONSLAB (1 << 2) 84 89 85 90 extern int prealloc_shrinker(struct shrinker *shrinker); 86 91 extern void register_shrinker_prepared(struct shrinker *shrinker);
-62
include/linux/slab.h
··· 595 595 return __kmalloc_node(size, flags, node); 596 596 } 597 597 598 - struct memcg_cache_array { 599 - struct rcu_head rcu; 600 - struct kmem_cache *entries[0]; 601 - }; 602 - 603 - /* 604 - * This is the main placeholder for memcg-related information in kmem caches. 605 - * Both the root cache and the child caches will have it. For the root cache, 606 - * this will hold a dynamically allocated array large enough to hold 607 - * information about the currently limited memcgs in the system. To allow the 608 - * array to be accessed without taking any locks, on relocation we free the old 609 - * version only after a grace period. 610 - * 611 - * Root and child caches hold different metadata. 612 - * 613 - * @root_cache: Common to root and child caches. NULL for root, pointer to 614 - * the root cache for children. 615 - * 616 - * The following fields are specific to root caches. 617 - * 618 - * @memcg_caches: kmemcg ID indexed table of child caches. This table is 619 - * used to index child cachces during allocation and cleared 620 - * early during shutdown. 621 - * 622 - * @root_caches_node: List node for slab_root_caches list. 623 - * 624 - * @children: List of all child caches. While the child caches are also 625 - * reachable through @memcg_caches, a child cache remains on 626 - * this list until it is actually destroyed. 627 - * 628 - * The following fields are specific to child caches. 629 - * 630 - * @memcg: Pointer to the memcg this cache belongs to. 631 - * 632 - * @children_node: List node for @root_cache->children list. 633 - * 634 - * @kmem_caches_node: List node for @memcg->kmem_caches list. 
635 - */ 636 - struct memcg_cache_params { 637 - struct kmem_cache *root_cache; 638 - union { 639 - struct { 640 - struct memcg_cache_array __rcu *memcg_caches; 641 - struct list_head __root_caches_node; 642 - struct list_head children; 643 - bool dying; 644 - }; 645 - struct { 646 - struct mem_cgroup *memcg; 647 - struct list_head children_node; 648 - struct list_head kmem_caches_node; 649 - struct percpu_ref refcnt; 650 - 651 - void (*work_fn)(struct kmem_cache *); 652 - union { 653 - struct rcu_head rcu_head; 654 - struct work_struct work; 655 - }; 656 - }; 657 - }; 658 - }; 659 - 660 598 int memcg_update_all_caches(int num_memcgs); 661 599 662 600 /**
+13 -7
include/linux/vmalloc.h
··· 53 53 unsigned long va_start; 54 54 unsigned long va_end; 55 55 56 - /* 57 - * Largest available free size in subtree. 58 - */ 59 - unsigned long subtree_max_size; 60 - unsigned long flags; 61 56 struct rb_node rb_node; /* address sorted rbtree */ 62 57 struct list_head list; /* address sorted list */ 63 - struct llist_node purge_list; /* "lazy purge" list */ 64 - struct vm_struct *vm; 58 + 59 + /* 60 + * The following three variables can be packed, because 61 + * a vmap_area object is always one of the three states: 62 + * 1) in "free" tree (root is vmap_area_root) 63 + * 2) in "busy" tree (root is free_vmap_area_root) 64 + * 3) in purge list (head is vmap_purge_list) 65 + */ 66 + union { 67 + unsigned long subtree_max_size; /* in "free" tree */ 68 + struct vm_struct *vm; /* in "busy" tree */ 69 + struct llist_node purge_list; /* in purge list */ 70 + }; 65 71 }; 66 72 67 73 /*
+3
include/linux/zpool.h
··· 46 46 47 47 void zpool_destroy_pool(struct zpool *pool); 48 48 49 + bool zpool_malloc_support_movable(struct zpool *pool); 50 + 49 51 int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, 50 52 unsigned long *handle); 51 53 ··· 92 90 struct zpool *zpool); 93 91 void (*destroy)(void *pool); 94 92 93 + bool malloc_support_movable; 95 94 int (*malloc)(void *pool, size_t size, gfp_t gfp, 96 95 unsigned long *handle); 97 96 void (*free)(void *pool, unsigned long handle);
+2 -4
init/main.c
··· 507 507 508 508 void __init __weak poking_init(void) { } 509 509 510 - void __init __weak pgd_cache_init(void) { } 510 + void __init __weak pgtable_cache_init(void) { } 511 511 512 512 bool initcall_debug; 513 513 core_param(initcall_debug, initcall_debug, bool, 0644); ··· 556 556 report_meminit(); 557 557 mem_init(); 558 558 kmem_cache_init(); 559 + kmemleak_init(); 559 560 pgtable_init(); 560 561 debug_objects_mem_init(); 561 562 vmalloc_init(); ··· 565 564 init_espfix_bsp(); 566 565 /* Should be run after espfix64 is set up. */ 567 566 pti_init(); 568 - pgd_cache_init(); 569 567 } 570 568 571 569 void __init __weak arch_call_rest_init(void) ··· 594 594 page_address_init(); 595 595 pr_notice("%s", linux_banner); 596 596 setup_arch(&command_line); 597 - mm_init_cpumask(&init_mm); 598 597 setup_command_line(command_line); 599 598 setup_nr_cpu_ids(); 600 599 setup_per_cpu_areas(); ··· 739 740 initrd_start = 0; 740 741 } 741 742 #endif 742 - kmemleak_init(); 743 743 setup_per_cpu_pageset(); 744 744 numa_policy_init(); 745 745 acpi_early_init();
+62 -19
kernel/events/uprobes.c
··· 26 26 #include <linux/percpu-rwsem.h> 27 27 #include <linux/task_work.h> 28 28 #include <linux/shmem_fs.h> 29 + #include <linux/khugepaged.h> 29 30 30 31 #include <linux/uprobes.h> 31 32 ··· 144 143 * 145 144 * @vma: vma that holds the pte pointing to page 146 145 * @addr: address the old @page is mapped at 147 - * @page: the cowed page we are replacing by kpage 148 - * @kpage: the modified page we replace page by 146 + * @old_page: the page we are replacing by new_page 147 + * @new_page: the modified page we replace page by 149 148 * 150 - * Returns 0 on success, -EFAULT on failure. 149 + * If @new_page is NULL, only unmap @old_page. 150 + * 151 + * Returns 0 on success, negative error code otherwise. 151 152 */ 152 153 static int __replace_page(struct vm_area_struct *vma, unsigned long addr, 153 154 struct page *old_page, struct page *new_page) 154 155 { 155 156 struct mm_struct *mm = vma->vm_mm; 156 157 struct page_vma_mapped_walk pvmw = { 157 - .page = old_page, 158 + .page = compound_head(old_page), 158 159 .vma = vma, 159 160 .address = addr, 160 161 }; ··· 167 164 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, 168 165 addr + PAGE_SIZE); 169 166 170 - VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); 171 - 172 - err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, 173 - false); 174 - if (err) 175 - return err; 167 + if (new_page) { 168 + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, 169 + &memcg, false); 170 + if (err) 171 + return err; 172 + } 176 173 177 174 /* For try_to_free_swap() and munlock_vma_page() below */ 178 175 lock_page(old_page); ··· 180 177 mmu_notifier_invalidate_range_start(&range); 181 178 err = -EAGAIN; 182 179 if (!page_vma_mapped_walk(&pvmw)) { 183 - mem_cgroup_cancel_charge(new_page, memcg, false); 180 + if (new_page) 181 + mem_cgroup_cancel_charge(new_page, memcg, false); 184 182 goto unlock; 185 183 } 186 184 VM_BUG_ON_PAGE(addr != pvmw.address, old_page); 187 185 188 - 
get_page(new_page); 189 - page_add_new_anon_rmap(new_page, vma, addr, false); 190 - mem_cgroup_commit_charge(new_page, memcg, false, false); 191 - lru_cache_add_active_or_unevictable(new_page, vma); 186 + if (new_page) { 187 + get_page(new_page); 188 + page_add_new_anon_rmap(new_page, vma, addr, false); 189 + mem_cgroup_commit_charge(new_page, memcg, false, false); 190 + lru_cache_add_active_or_unevictable(new_page, vma); 191 + } else 192 + /* no new page, just dec_mm_counter for old_page */ 193 + dec_mm_counter(mm, MM_ANONPAGES); 192 194 193 195 if (!PageAnon(old_page)) { 194 196 dec_mm_counter(mm, mm_counter_file(old_page)); ··· 202 194 203 195 flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); 204 196 ptep_clear_flush_notify(vma, addr, pvmw.pte); 205 - set_pte_at_notify(mm, addr, pvmw.pte, 206 - mk_pte(new_page, vma->vm_page_prot)); 197 + if (new_page) 198 + set_pte_at_notify(mm, addr, pvmw.pte, 199 + mk_pte(new_page, vma->vm_page_prot)); 207 200 208 201 page_remove_rmap(old_page, false); 209 202 if (!page_mapped(old_page)) ··· 473 464 struct page *old_page, *new_page; 474 465 struct vm_area_struct *vma; 475 466 int ret, is_register, ref_ctr_updated = 0; 467 + bool orig_page_huge = false; 476 468 477 469 is_register = is_swbp_insn(&opcode); 478 470 uprobe = container_of(auprobe, struct uprobe, arch); ··· 481 471 retry: 482 472 /* Read the page with vaddr into memory */ 483 473 ret = get_user_pages_remote(NULL, mm, vaddr, 1, 484 - FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); 474 + FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL); 485 475 if (ret <= 0) 486 476 return ret; 487 477 ··· 498 488 ref_ctr_updated = 1; 499 489 } 500 490 491 + ret = 0; 492 + if (!is_register && !PageAnon(old_page)) 493 + goto put_old; 494 + 501 495 ret = anon_vma_prepare(vma); 502 496 if (ret) 503 497 goto put_old; ··· 515 501 copy_highpage(new_page, old_page); 516 502 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); 517 503 504 + if (!is_register) { 505 + struct page 
*orig_page; 506 + pgoff_t index; 507 + 508 + VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); 509 + 510 + index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; 511 + orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, 512 + index); 513 + 514 + if (orig_page) { 515 + if (PageUptodate(orig_page) && 516 + pages_identical(new_page, orig_page)) { 517 + /* let go new_page */ 518 + put_page(new_page); 519 + new_page = NULL; 520 + 521 + if (PageCompound(orig_page)) 522 + orig_page_huge = true; 523 + } 524 + put_page(orig_page); 525 + } 526 + } 527 + 518 528 ret = __replace_page(vma, vaddr, old_page, new_page); 519 - put_page(new_page); 529 + if (new_page) 530 + put_page(new_page); 520 531 put_old: 521 532 put_page(old_page); 522 533 ··· 551 512 /* Revert back reference counter if instruction update failed. */ 552 513 if (ret && is_register && ref_ctr_updated) 553 514 update_ref_ctr(uprobe, mm, -1); 515 + 516 + /* try collapse pmd for compound page */ 517 + if (!ret && orig_page_huge) 518 + collapse_pte_mapped_thp(mm, vaddr); 554 519 555 520 return ret; 556 521 }
+2 -2
kernel/resource.c
··· 487 487 while (start < end && 488 488 !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, 489 489 false, &res)) { 490 - pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; 491 - end_pfn = (res.end + 1) >> PAGE_SHIFT; 490 + pfn = PFN_UP(res.start); 491 + end_pfn = PFN_DOWN(res.end + 1); 492 492 if (end_pfn > pfn) 493 493 ret = (*func)(pfn, end_pfn - pfn, arg); 494 494 if (ret)
-1
kernel/sched/idle.c
··· 238 238 tick_nohz_idle_enter(); 239 239 240 240 while (!need_resched()) { 241 - check_pgt_cache(); 242 241 rmb(); 243 242 244 243 local_irq_disable();
+4 -2
kernel/sysctl.c
··· 264 264 extern struct ctl_table firmware_config_table[]; 265 265 #endif 266 266 267 - #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 267 + #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ 268 + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) 268 269 int sysctl_legacy_va_layout; 269 270 #endif 270 271 ··· 1574 1573 .proc_handler = proc_dointvec, 1575 1574 .extra1 = SYSCTL_ZERO, 1576 1575 }, 1577 - #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 1576 + #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ 1577 + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) 1578 1578 { 1579 1579 .procname = "legacy_va_layout", 1580 1580 .data = &sysctl_legacy_va_layout,
+8 -7
lib/Kconfig.debug
··· 576 576 In order to access the kmemleak file, debugfs needs to be 577 577 mounted (usually at /sys/kernel/debug). 578 578 579 - config DEBUG_KMEMLEAK_EARLY_LOG_SIZE 580 - int "Maximum kmemleak early log entries" 579 + config DEBUG_KMEMLEAK_MEM_POOL_SIZE 580 + int "Kmemleak memory pool size" 581 581 depends on DEBUG_KMEMLEAK 582 - range 200 40000 583 - default 400 582 + range 200 1000000 583 + default 16000 584 584 help 585 585 Kmemleak must track all the memory allocations to avoid 586 586 reporting false positives. Since memory may be allocated or 587 - freed before kmemleak is initialised, an early log buffer is 588 - used to store these actions. If kmemleak reports "early log 589 - buffer exceeded", please increase this value. 587 + freed before kmemleak is fully initialised, use a static pool 588 + of metadata objects to track such callbacks. After kmemleak is 589 + fully initialised, this memory pool acts as an emergency one 590 + if slab allocations fail. 590 591 591 592 config DEBUG_KMEMLEAK_TEST 592 593 tristate "Simple test for the kernel memory leak detector"
+8
lib/Kconfig.kasan
··· 134 134 to 3TB of RAM with KASan enabled). This options allows to force 135 135 4-level paging instead. 136 136 137 + config KASAN_SW_TAGS_IDENTIFY 138 + bool "Enable memory corruption identification" 139 + depends on KASAN_SW_TAGS 140 + help 141 + This option enables best-effort identification of bug type 142 + (use-after-free or out-of-bounds) at the cost of increased 143 + memory consumption. 144 + 137 145 config TEST_KASAN 138 146 tristate "Module for testing KASAN for bug detection" 139 147 depends on m && KASAN
+1 -1
lib/iov_iter.c
··· 878 878 head = compound_head(page); 879 879 v += (page - head) << PAGE_SHIFT; 880 880 881 - if (likely(n <= v && v <= (PAGE_SIZE << compound_order(head)))) 881 + if (likely(n <= v && v <= (page_size(head)))) 882 882 return true; 883 883 WARN_ON(1); 884 884 return false;
-5
lib/show_mem.c
··· 6 6 */ 7 7 8 8 #include <linux/mm.h> 9 - #include <linux/quicklist.h> 10 9 #include <linux/cma.h> 11 10 12 11 void show_mem(unsigned int filter, nodemask_t *nodemask) ··· 37 38 printk("%lu pages reserved\n", reserved); 38 39 #ifdef CONFIG_CMA 39 40 printk("%lu pages cma reserved\n", totalcma_pages); 40 - #endif 41 - #ifdef CONFIG_QUICKLIST 42 - printk("%lu pages in pagetable cache\n", 43 - quicklist_total_size()); 44 41 #endif 45 42 #ifdef CONFIG_MEMORY_FAILURE 46 43 printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
+41
lib/test_kasan.c
··· 18 18 #include <linux/slab.h> 19 19 #include <linux/string.h> 20 20 #include <linux/uaccess.h> 21 + #include <linux/io.h> 22 + 23 + #include <asm/page.h> 21 24 22 25 /* 23 26 * Note: test functions are marked noinline so that their names appear in ··· 338 335 if (ptr1 == ptr2) 339 336 pr_err("Could not detect use-after-free: ptr1 == ptr2\n"); 340 337 kfree(ptr2); 338 + } 339 + 340 + static noinline void __init kfree_via_page(void) 341 + { 342 + char *ptr; 343 + size_t size = 8; 344 + struct page *page; 345 + unsigned long offset; 346 + 347 + pr_info("invalid-free false positive (via page)\n"); 348 + ptr = kmalloc(size, GFP_KERNEL); 349 + if (!ptr) { 350 + pr_err("Allocation failed\n"); 351 + return; 352 + } 353 + 354 + page = virt_to_page(ptr); 355 + offset = offset_in_page(ptr); 356 + kfree(page_address(page) + offset); 357 + } 358 + 359 + static noinline void __init kfree_via_phys(void) 360 + { 361 + char *ptr; 362 + size_t size = 8; 363 + phys_addr_t phys; 364 + 365 + pr_info("invalid-free false positive (via phys)\n"); 366 + ptr = kmalloc(size, GFP_KERNEL); 367 + if (!ptr) { 368 + pr_err("Allocation failed\n"); 369 + return; 370 + } 371 + 372 + phys = virt_to_phys(ptr); 373 + kfree(phys_to_virt(phys)); 341 374 } 342 375 343 376 static noinline void __init kmem_cache_oob(void) ··· 776 737 kmalloc_uaf(); 777 738 kmalloc_uaf_memset(); 778 739 kmalloc_uaf2(); 740 + kfree_via_page(); 741 + kfree_via_phys(); 779 742 kmem_cache_oob(); 780 743 memcg_accounted_kmem_cache(); 781 744 kasan_stack_oob();
+11 -5
mm/Kconfig
··· 273 273 by default when ZONE_DMA or HIGHMEM is selected, but you 274 274 may say n to override this. 275 275 276 - config NR_QUICK 277 - int 278 - depends on QUICKLIST 279 - default "1" 280 - 281 276 config VIRT_TO_BUS 282 277 bool 283 278 help ··· 711 716 712 717 config GUP_GET_PTE_LOW_HIGH 713 718 bool 719 + 720 + config READ_ONLY_THP_FOR_FS 721 + bool "Read-only THP for filesystems (EXPERIMENTAL)" 722 + depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM 723 + 724 + help 725 + Allow khugepaged to put read-only file-backed pages in THP. 726 + 727 + This is marked experimental because it is a new feature. Write 728 + support of file THPs will be developed in the next few release 729 + cycles. 714 730 715 731 config ARCH_HAS_PTE_SPECIAL 716 732 bool
+3 -1
mm/Kconfig.debug
··· 21 21 Also, the state of page tracking structures is checked more often as 22 22 pages are being allocated and freed, as unexpected state changes 23 23 often happen for same reasons as memory corruption (e.g. double free, 24 - use-after-free). 24 + use-after-free). The error reports for these checks can be augmented 25 + with stack traces of last allocation and freeing of the page, when 26 + PAGE_OWNER is also selected and enabled on boot. 25 27 26 28 For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, 27 29 fill the pages with poison patterns after free_pages() and verify
+3 -1
mm/Makefile
··· 21 21 KCOV_INSTRUMENT_mmzone.o := n 22 22 KCOV_INSTRUMENT_vmstat.o := n 23 23 24 + CFLAGS_init-mm.o += $(call cc-disable-warning, override-init) 25 + CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides) 26 + 24 27 mmu-y := nommu.o 25 28 mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ 26 29 mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ ··· 75 72 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 76 73 obj-$(CONFIG_MEMTEST) += memtest.o 77 74 obj-$(CONFIG_MIGRATION) += migrate.o 78 - obj-$(CONFIG_QUICKLIST) += quicklist.o 79 75 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o 80 76 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o 81 77 obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+20 -30
mm/compaction.c
··· 969 969 * is safe to read and it's 0 for tail pages. 970 970 */ 971 971 if (unlikely(PageCompound(page))) { 972 - low_pfn += (1UL << compound_order(page)) - 1; 972 + low_pfn += compound_nr(page) - 1; 973 973 goto isolate_fail; 974 974 } 975 975 } ··· 1737 1737 * starting at the block pointed to by the migrate scanner pfn within 1738 1738 * compact_control. 1739 1739 */ 1740 - static isolate_migrate_t isolate_migratepages(struct zone *zone, 1741 - struct compact_control *cc) 1740 + static isolate_migrate_t isolate_migratepages(struct compact_control *cc) 1742 1741 { 1743 1742 unsigned long block_start_pfn; 1744 1743 unsigned long block_end_pfn; ··· 1755 1756 */ 1756 1757 low_pfn = fast_find_migrateblock(cc); 1757 1758 block_start_pfn = pageblock_start_pfn(low_pfn); 1758 - if (block_start_pfn < zone->zone_start_pfn) 1759 - block_start_pfn = zone->zone_start_pfn; 1759 + if (block_start_pfn < cc->zone->zone_start_pfn) 1760 + block_start_pfn = cc->zone->zone_start_pfn; 1760 1761 1761 1762 /* 1762 1763 * fast_find_migrateblock marks a pageblock skipped so to avoid ··· 1786 1787 if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) 1787 1788 cond_resched(); 1788 1789 1789 - page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, 1790 - zone); 1790 + page = pageblock_pfn_to_page(block_start_pfn, 1791 + block_end_pfn, cc->zone); 1791 1792 if (!page) 1792 1793 continue; 1793 1794 ··· 2077 2078 const bool sync = cc->mode != MIGRATE_ASYNC; 2078 2079 bool update_cached; 2079 2080 2081 + /* 2082 + * These counters track activities during zone compaction. Initialize 2083 + * them before compacting a new zone. 
2084 + */ 2085 + cc->total_migrate_scanned = 0; 2086 + cc->total_free_scanned = 0; 2087 + cc->nr_migratepages = 0; 2088 + cc->nr_freepages = 0; 2089 + INIT_LIST_HEAD(&cc->freepages); 2090 + INIT_LIST_HEAD(&cc->migratepages); 2091 + 2080 2092 cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); 2081 2093 ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, 2082 2094 cc->classzone_idx); ··· 2168 2158 cc->rescan = true; 2169 2159 } 2170 2160 2171 - switch (isolate_migratepages(cc->zone, cc)) { 2161 + switch (isolate_migratepages(cc)) { 2172 2162 case ISOLATE_ABORT: 2173 2163 ret = COMPACT_CONTENDED; 2174 2164 putback_movable_pages(&cc->migratepages); ··· 2291 2281 { 2292 2282 enum compact_result ret; 2293 2283 struct compact_control cc = { 2294 - .nr_freepages = 0, 2295 - .nr_migratepages = 0, 2296 - .total_migrate_scanned = 0, 2297 - .total_free_scanned = 0, 2298 2284 .order = order, 2299 2285 .search_order = order, 2300 2286 .gfp_mask = gfp_mask, ··· 2311 2305 2312 2306 if (capture) 2313 2307 current->capture_control = &capc; 2314 - INIT_LIST_HEAD(&cc.freepages); 2315 - INIT_LIST_HEAD(&cc.migratepages); 2316 2308 2317 2309 ret = compact_zone(&cc, &capc); 2318 2310 ··· 2412 2408 struct zone *zone; 2413 2409 struct compact_control cc = { 2414 2410 .order = -1, 2415 - .total_migrate_scanned = 0, 2416 - .total_free_scanned = 0, 2417 2411 .mode = MIGRATE_SYNC, 2418 2412 .ignore_skip_hint = true, 2419 2413 .whole_zone = true, ··· 2425 2423 if (!populated_zone(zone)) 2426 2424 continue; 2427 2425 2428 - cc.nr_freepages = 0; 2429 - cc.nr_migratepages = 0; 2430 2426 cc.zone = zone; 2431 - INIT_LIST_HEAD(&cc.freepages); 2432 - INIT_LIST_HEAD(&cc.migratepages); 2433 2427 2434 2428 compact_zone(&cc, NULL); 2435 2429 ··· 2527 2529 struct compact_control cc = { 2528 2530 .order = pgdat->kcompactd_max_order, 2529 2531 .search_order = pgdat->kcompactd_max_order, 2530 - .total_migrate_scanned = 0, 2531 - .total_free_scanned = 0, 2532 2532 .classzone_idx = 
pgdat->kcompactd_classzone_idx, 2533 2533 .mode = MIGRATE_SYNC_LIGHT, 2534 2534 .ignore_skip_hint = false, ··· 2550 2554 COMPACT_CONTINUE) 2551 2555 continue; 2552 2556 2553 - cc.nr_freepages = 0; 2554 - cc.nr_migratepages = 0; 2555 - cc.total_migrate_scanned = 0; 2556 - cc.total_free_scanned = 0; 2557 - cc.zone = zone; 2558 - INIT_LIST_HEAD(&cc.freepages); 2559 - INIT_LIST_HEAD(&cc.migratepages); 2560 - 2561 2557 if (kthread_should_stop()) 2562 2558 return; 2559 + 2560 + cc.zone = zone; 2563 2561 status = compact_zone(&cc, NULL); 2564 2562 2565 2563 if (status == COMPACT_SUCCESS) {
+72 -98
mm/filemap.c
··· 126 126 /* hugetlb pages are represented by a single entry in the xarray */ 127 127 if (!PageHuge(page)) { 128 128 xas_set_order(&xas, page->index, compound_order(page)); 129 - nr = 1U << compound_order(page); 129 + nr = compound_nr(page); 130 130 } 131 131 132 132 VM_BUG_ON_PAGE(!PageLocked(page), page); ··· 203 203 __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); 204 204 if (PageTransHuge(page)) 205 205 __dec_node_page_state(page, NR_SHMEM_THPS); 206 - } else { 207 - VM_BUG_ON_PAGE(PageTransHuge(page), page); 206 + } else if (PageTransHuge(page)) { 207 + __dec_node_page_state(page, NR_FILE_THPS); 208 + filemap_nr_thps_dec(mapping); 208 209 } 209 210 210 211 /* ··· 282 281 * @pvec: pagevec with pages to delete 283 282 * 284 283 * The function walks over mapping->i_pages and removes pages passed in @pvec 285 - * from the mapping. The function expects @pvec to be sorted by page index. 284 + * from the mapping. The function expects @pvec to be sorted by page index 285 + * and is optimised for it to be dense. 286 286 * It tolerates holes in @pvec (mapping entries at those indices are not 287 287 * modified). The function expects only THP head pages to be present in the 288 - * @pvec and takes care to delete all corresponding tail pages from the 289 - * mapping as well. 288 + * @pvec. 290 289 * 291 290 * The function expects the i_pages lock to be held. 292 291 */ ··· 295 294 { 296 295 XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); 297 296 int total_pages = 0; 298 - int i = 0, tail_pages = 0; 297 + int i = 0; 299 298 struct page *page; 300 299 301 300 mapping_set_update(&xas, mapping); 302 301 xas_for_each(&xas, page, ULONG_MAX) { 303 - if (i >= pagevec_count(pvec) && !tail_pages) 302 + if (i >= pagevec_count(pvec)) 304 303 break; 304 + 305 + /* A swap/dax/shadow entry got inserted? Skip it. */ 305 306 if (xa_is_value(page)) 306 307 continue; 307 - if (!tail_pages) { 308 - /* 309 - * Some page got inserted in our range? Skip it. 
We 310 - * have our pages locked so they are protected from 311 - * being removed. 312 - */ 313 - if (page != pvec->pages[i]) { 314 - VM_BUG_ON_PAGE(page->index > 315 - pvec->pages[i]->index, page); 316 - continue; 317 - } 318 - WARN_ON_ONCE(!PageLocked(page)); 319 - if (PageTransHuge(page) && !PageHuge(page)) 320 - tail_pages = HPAGE_PMD_NR - 1; 321 - page->mapping = NULL; 322 - /* 323 - * Leave page->index set: truncation lookup relies 324 - * upon it 325 - */ 326 - i++; 327 - } else { 328 - VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages 329 - != pvec->pages[i]->index, page); 330 - tail_pages--; 308 + /* 309 + * A page got inserted in our range? Skip it. We have our 310 + * pages locked so they are protected from being removed. 311 + * If we see a page whose index is higher than ours, it 312 + * means our page has been removed, which shouldn't be 313 + * possible because we're holding the PageLock. 314 + */ 315 + if (page != pvec->pages[i]) { 316 + VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, 317 + page); 318 + continue; 331 319 } 320 + 321 + WARN_ON_ONCE(!PageLocked(page)); 322 + 323 + if (page->index == xas.xa_index) 324 + page->mapping = NULL; 325 + /* Leave page->index set: truncation lookup relies on it */ 326 + 327 + /* 328 + * Move to the next page in the vector if this is a regular 329 + * page or the index is of the last sub-page of this compound 330 + * page. 331 + */ 332 + if (page->index + compound_nr(page) - 1 == xas.xa_index) 333 + i++; 332 334 xas_store(&xas, NULL); 333 335 total_pages++; 334 336 } ··· 412 408 .range_end = end, 413 409 }; 414 410 415 - if (!mapping_cap_writeback_dirty(mapping)) 411 + if (!mapping_cap_writeback_dirty(mapping) || 412 + !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 416 413 return 0; 417 414 418 415 wbc_attach_fdatawrite_inode(&wbc, mapping->host); ··· 622 617 } 623 618 EXPORT_SYMBOL(filemap_fdatawait_keep_errors); 624 619 620 + /* Returns true if writeback might be needed or already in progress. 
*/ 625 621 static bool mapping_needs_writeback(struct address_space *mapping) 626 622 { 627 - return (!dax_mapping(mapping) && mapping->nrpages) || 628 - (dax_mapping(mapping) && mapping->nrexceptional); 623 + if (dax_mapping(mapping)) 624 + return mapping->nrexceptional; 625 + 626 + return mapping->nrpages; 629 627 } 630 628 631 629 int filemap_write_and_wait(struct address_space *mapping) ··· 1524 1516 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) 1525 1517 { 1526 1518 XA_STATE(xas, &mapping->i_pages, offset); 1527 - struct page *head, *page; 1519 + struct page *page; 1528 1520 1529 1521 rcu_read_lock(); 1530 1522 repeat: ··· 1539 1531 if (!page || xa_is_value(page)) 1540 1532 goto out; 1541 1533 1542 - head = compound_head(page); 1543 - if (!page_cache_get_speculative(head)) 1534 + if (!page_cache_get_speculative(page)) 1544 1535 goto repeat; 1545 - 1546 - /* The page was split under us? */ 1547 - if (compound_head(page) != head) { 1548 - put_page(head); 1549 - goto repeat; 1550 - } 1551 1536 1552 1537 /* 1553 - * Has the page moved? 1538 + * Has the page moved or been split? 1554 1539 * This is part of the lockless pagecache protocol. See 1555 1540 * include/linux/pagemap.h for details. 1556 1541 */ 1557 1542 if (unlikely(page != xas_reload(&xas))) { 1558 - put_page(head); 1543 + put_page(page); 1559 1544 goto repeat; 1560 1545 } 1546 + page = find_subpage(page, offset); 1561 1547 out: 1562 1548 rcu_read_unlock(); 1563 1549 ··· 1648 1646 } 1649 1647 1650 1648 /* Has the page been truncated? 
*/ 1651 - if (unlikely(page->mapping != mapping)) { 1649 + if (unlikely(compound_head(page)->mapping != mapping)) { 1652 1650 unlock_page(page); 1653 1651 put_page(page); 1654 1652 goto repeat; ··· 1733 1731 1734 1732 rcu_read_lock(); 1735 1733 xas_for_each(&xas, page, ULONG_MAX) { 1736 - struct page *head; 1737 1734 if (xas_retry(&xas, page)) 1738 1735 continue; 1739 1736 /* ··· 1743 1742 if (xa_is_value(page)) 1744 1743 goto export; 1745 1744 1746 - head = compound_head(page); 1747 - if (!page_cache_get_speculative(head)) 1745 + if (!page_cache_get_speculative(page)) 1748 1746 goto retry; 1749 1747 1750 - /* The page was split under us? */ 1751 - if (compound_head(page) != head) 1752 - goto put_page; 1753 - 1754 - /* Has the page moved? */ 1748 + /* Has the page moved or been split? */ 1755 1749 if (unlikely(page != xas_reload(&xas))) 1756 1750 goto put_page; 1751 + page = find_subpage(page, xas.xa_index); 1757 1752 1758 1753 export: 1759 1754 indices[ret] = xas.xa_index; ··· 1758 1761 break; 1759 1762 continue; 1760 1763 put_page: 1761 - put_page(head); 1764 + put_page(page); 1762 1765 retry: 1763 1766 xas_reset(&xas); 1764 1767 } ··· 1800 1803 1801 1804 rcu_read_lock(); 1802 1805 xas_for_each(&xas, page, end) { 1803 - struct page *head; 1804 1806 if (xas_retry(&xas, page)) 1805 1807 continue; 1806 1808 /* Skip over shadow, swap and DAX entries */ 1807 1809 if (xa_is_value(page)) 1808 1810 continue; 1809 1811 1810 - head = compound_head(page); 1811 - if (!page_cache_get_speculative(head)) 1812 + if (!page_cache_get_speculative(page)) 1812 1813 goto retry; 1813 1814 1814 - /* The page was split under us? */ 1815 - if (compound_head(page) != head) 1816 - goto put_page; 1817 - 1818 - /* Has the page moved? */ 1815 + /* Has the page moved or been split? 
*/ 1819 1816 if (unlikely(page != xas_reload(&xas))) 1820 1817 goto put_page; 1821 1818 1822 - pages[ret] = page; 1819 + pages[ret] = find_subpage(page, xas.xa_index); 1823 1820 if (++ret == nr_pages) { 1824 1821 *start = xas.xa_index + 1; 1825 1822 goto out; 1826 1823 } 1827 1824 continue; 1828 1825 put_page: 1829 - put_page(head); 1826 + put_page(page); 1830 1827 retry: 1831 1828 xas_reset(&xas); 1832 1829 } ··· 1865 1874 1866 1875 rcu_read_lock(); 1867 1876 for (page = xas_load(&xas); page; page = xas_next(&xas)) { 1868 - struct page *head; 1869 1877 if (xas_retry(&xas, page)) 1870 1878 continue; 1871 1879 /* ··· 1874 1884 if (xa_is_value(page)) 1875 1885 break; 1876 1886 1877 - head = compound_head(page); 1878 - if (!page_cache_get_speculative(head)) 1887 + if (!page_cache_get_speculative(page)) 1879 1888 goto retry; 1880 1889 1881 - /* The page was split under us? */ 1882 - if (compound_head(page) != head) 1883 - goto put_page; 1884 - 1885 - /* Has the page moved? */ 1890 + /* Has the page moved or been split? */ 1886 1891 if (unlikely(page != xas_reload(&xas))) 1887 1892 goto put_page; 1888 1893 1889 - pages[ret] = page; 1894 + pages[ret] = find_subpage(page, xas.xa_index); 1890 1895 if (++ret == nr_pages) 1891 1896 break; 1892 1897 continue; 1893 1898 put_page: 1894 - put_page(head); 1899 + put_page(page); 1895 1900 retry: 1896 1901 xas_reset(&xas); 1897 1902 } ··· 1922 1937 1923 1938 rcu_read_lock(); 1924 1939 xas_for_each_marked(&xas, page, end, tag) { 1925 - struct page *head; 1926 1940 if (xas_retry(&xas, page)) 1927 1941 continue; 1928 1942 /* ··· 1932 1948 if (xa_is_value(page)) 1933 1949 continue; 1934 1950 1935 - head = compound_head(page); 1936 - if (!page_cache_get_speculative(head)) 1951 + if (!page_cache_get_speculative(page)) 1937 1952 goto retry; 1938 1953 1939 - /* The page was split under us? */ 1940 - if (compound_head(page) != head) 1941 - goto put_page; 1942 - 1943 - /* Has the page moved? */ 1954 + /* Has the page moved or been split? 
*/ 1944 1955 if (unlikely(page != xas_reload(&xas))) 1945 1956 goto put_page; 1946 1957 1947 - pages[ret] = page; 1958 + pages[ret] = find_subpage(page, xas.xa_index); 1948 1959 if (++ret == nr_pages) { 1949 1960 *index = xas.xa_index + 1; 1950 1961 goto out; 1951 1962 } 1952 1963 continue; 1953 1964 put_page: 1954 - put_page(head); 1965 + put_page(page); 1955 1966 retry: 1956 1967 xas_reset(&xas); 1957 1968 } ··· 2541 2562 goto out_retry; 2542 2563 2543 2564 /* Did it get truncated? */ 2544 - if (unlikely(page->mapping != mapping)) { 2565 + if (unlikely(compound_head(page)->mapping != mapping)) { 2545 2566 unlock_page(page); 2546 2567 put_page(page); 2547 2568 goto retry_find; 2548 2569 } 2549 - VM_BUG_ON_PAGE(page->index != offset, page); 2570 + VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); 2550 2571 2551 2572 /* 2552 2573 * We have a locked page in the page cache, now we need to check ··· 2627 2648 pgoff_t last_pgoff = start_pgoff; 2628 2649 unsigned long max_idx; 2629 2650 XA_STATE(xas, &mapping->i_pages, start_pgoff); 2630 - struct page *head, *page; 2651 + struct page *page; 2631 2652 2632 2653 rcu_read_lock(); 2633 2654 xas_for_each(&xas, page, end_pgoff) { ··· 2636 2657 if (xa_is_value(page)) 2637 2658 goto next; 2638 2659 2639 - head = compound_head(page); 2640 - 2641 2660 /* 2642 2661 * Check for a locked page first, as a speculative 2643 2662 * reference may adversely influence page migration. 2644 2663 */ 2645 - if (PageLocked(head)) 2664 + if (PageLocked(page)) 2646 2665 goto next; 2647 - if (!page_cache_get_speculative(head)) 2666 + if (!page_cache_get_speculative(page)) 2648 2667 goto next; 2649 2668 2650 - /* The page was split under us? */ 2651 - if (compound_head(page) != head) 2652 - goto skip; 2653 - 2654 - /* Has the page moved? */ 2669 + /* Has the page moved or been split? 
*/ 2655 2670 if (unlikely(page != xas_reload(&xas))) 2656 2671 goto skip; 2672 + page = find_subpage(page, xas.xa_index); 2657 2673 2658 2674 if (!PageUptodate(page) || 2659 2675 PageReadahead(page) ||
+57 -68
mm/gup.c
··· 29 29 unsigned int page_mask; 30 30 }; 31 31 32 - typedef int (*set_dirty_func_t)(struct page *page); 33 - 34 - static void __put_user_pages_dirty(struct page **pages, 35 - unsigned long npages, 36 - set_dirty_func_t sdf) 37 - { 38 - unsigned long index; 39 - 40 - for (index = 0; index < npages; index++) { 41 - struct page *page = compound_head(pages[index]); 42 - 43 - /* 44 - * Checking PageDirty at this point may race with 45 - * clear_page_dirty_for_io(), but that's OK. Two key cases: 46 - * 47 - * 1) This code sees the page as already dirty, so it skips 48 - * the call to sdf(). That could happen because 49 - * clear_page_dirty_for_io() called page_mkclean(), 50 - * followed by set_page_dirty(). However, now the page is 51 - * going to get written back, which meets the original 52 - * intention of setting it dirty, so all is well: 53 - * clear_page_dirty_for_io() goes on to call 54 - * TestClearPageDirty(), and write the page back. 55 - * 56 - * 2) This code sees the page as clean, so it calls sdf(). 57 - * The page stays dirty, despite being written back, so it 58 - * gets written back again in the next writeback cycle. 59 - * This is harmless. 60 - */ 61 - if (!PageDirty(page)) 62 - sdf(page); 63 - 64 - put_user_page(page); 65 - } 66 - } 67 - 68 32 /** 69 - * put_user_pages_dirty() - release and dirty an array of gup-pinned pages 70 - * @pages: array of pages to be marked dirty and released. 33 + * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages 34 + * @pages: array of pages to be maybe marked dirty, and definitely released. 71 35 * @npages: number of pages in the @pages array. 36 + * @make_dirty: whether to mark the pages dirty 72 37 * 73 38 * "gup-pinned page" refers to a page that has had one of the get_user_pages() 74 39 * variants called on that page. 75 40 * 76 41 * For each page in the @pages array, make that page (or its head page, if a 77 - * compound page) dirty, if it was previously listed as clean. 
Then, release 78 - * the page using put_user_page(). 42 + * compound page) dirty, if @make_dirty is true, and if the page was previously 43 + * listed as clean. In any case, releases all pages using put_user_page(), 44 + * possibly via put_user_pages(), for the non-dirty case. 79 45 * 80 46 * Please see the put_user_page() documentation for details. 81 47 * 82 - * set_page_dirty(), which does not lock the page, is used here. 83 - * Therefore, it is the caller's responsibility to ensure that this is 84 - * safe. If not, then put_user_pages_dirty_lock() should be called instead. 48 + * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is 49 + * required, then the caller should a) verify that this is really correct, 50 + * because _lock() is usually required, and b) hand code it: 51 + * set_page_dirty_lock(), put_user_page(). 85 52 * 86 53 */ 87 - void put_user_pages_dirty(struct page **pages, unsigned long npages) 54 + void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, 55 + bool make_dirty) 88 56 { 89 - __put_user_pages_dirty(pages, npages, set_page_dirty); 90 - } 91 - EXPORT_SYMBOL(put_user_pages_dirty); 57 + unsigned long index; 92 58 93 - /** 94 - * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages 95 - * @pages: array of pages to be marked dirty and released. 96 - * @npages: number of pages in the @pages array. 97 - * 98 - * For each page in the @pages array, make that page (or its head page, if a 99 - * compound page) dirty, if it was previously listed as clean. Then, release 100 - * the page using put_user_page(). 101 - * 102 - * Please see the put_user_page() documentation for details. 103 - * 104 - * This is just like put_user_pages_dirty(), except that it invokes 105 - * set_page_dirty_lock(), instead of set_page_dirty(). 
106 - * 107 - */ 108 - void put_user_pages_dirty_lock(struct page **pages, unsigned long npages) 109 - { 110 - __put_user_pages_dirty(pages, npages, set_page_dirty_lock); 59 + /* 60 + * TODO: this can be optimized for huge pages: if a series of pages is 61 + * physically contiguous and part of the same compound page, then a 62 + * single operation to the head page should suffice. 63 + */ 64 + 65 + if (!make_dirty) { 66 + put_user_pages(pages, npages); 67 + return; 68 + } 69 + 70 + for (index = 0; index < npages; index++) { 71 + struct page *page = compound_head(pages[index]); 72 + /* 73 + * Checking PageDirty at this point may race with 74 + * clear_page_dirty_for_io(), but that's OK. Two key 75 + * cases: 76 + * 77 + * 1) This code sees the page as already dirty, so it 78 + * skips the call to set_page_dirty(). That could happen 79 + * because clear_page_dirty_for_io() called 80 + * page_mkclean(), followed by set_page_dirty(). 81 + * However, now the page is going to get written back, 82 + * which meets the original intention of setting it 83 + * dirty, so all is well: clear_page_dirty_for_io() goes 84 + * on to call TestClearPageDirty(), and write the page 85 + * back. 86 + * 87 + * 2) This code sees the page as clean, so it calls 88 + * set_page_dirty(). The page stays dirty, despite being 89 + * written back, so it gets written back again in the 90 + * next writeback cycle. This is harmless. 
91 + */ 92 + if (!PageDirty(page)) 93 + set_page_dirty_lock(page); 94 + put_user_page(page); 95 + } 111 96 } 112 97 EXPORT_SYMBOL(put_user_pages_dirty_lock); 113 98 ··· 384 399 spin_unlock(ptl); 385 400 return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); 386 401 } 387 - if (flags & FOLL_SPLIT) { 402 + if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) { 388 403 int ret; 389 404 page = pmd_page(*pmd); 390 405 if (is_huge_zero_page(page)) { ··· 393 408 split_huge_pmd(vma, pmd, address); 394 409 if (pmd_trans_unstable(pmd)) 395 410 ret = -EBUSY; 396 - } else { 411 + } else if (flags & FOLL_SPLIT) { 397 412 if (unlikely(!try_get_page(page))) { 398 413 spin_unlock(ptl); 399 414 return ERR_PTR(-ENOMEM); ··· 405 420 put_page(page); 406 421 if (pmd_none(*pmd)) 407 422 return no_page_table(vma, flags); 423 + } else { /* flags & FOLL_SPLIT_PMD */ 424 + spin_unlock(ptl); 425 + split_huge_pmd(vma, pmd, address); 426 + ret = pte_alloc(mm, pmd) ? -ENOMEM : 0; 408 427 } 409 428 410 429 return ret ? ERR_PTR(ret) : ··· 1449 1460 * gup may start from a tail page. Advance step by the left 1450 1461 * part. 1451 1462 */ 1452 - step = (1 << compound_order(head)) - (pages[i] - head); 1463 + step = compound_nr(head) - (pages[i] - head); 1453 1464 /* 1454 1465 * If we get a page from the CMA zone, since we are going to 1455 1466 * be pinning these entries, we might as well move them out
+95 -28
mm/huge_memory.c
··· 496 496 return pmd; 497 497 } 498 498 499 - static inline struct list_head *page_deferred_list(struct page *page) 499 + #ifdef CONFIG_MEMCG 500 + static inline struct deferred_split *get_deferred_split_queue(struct page *page) 500 501 { 501 - /* ->lru in the tail pages is occupied by compound_head. */ 502 - return &page[2].deferred_list; 502 + struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; 503 + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); 504 + 505 + if (memcg) 506 + return &memcg->deferred_split_queue; 507 + else 508 + return &pgdat->deferred_split_queue; 503 509 } 510 + #else 511 + static inline struct deferred_split *get_deferred_split_queue(struct page *page) 512 + { 513 + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); 514 + 515 + return &pgdat->deferred_split_queue; 516 + } 517 + #endif 504 518 505 519 void prep_transhuge_page(struct page *page) 506 520 { ··· 2511 2497 struct page *head = compound_head(page); 2512 2498 pg_data_t *pgdat = page_pgdat(head); 2513 2499 struct lruvec *lruvec; 2500 + struct address_space *swap_cache = NULL; 2501 + unsigned long offset = 0; 2514 2502 int i; 2515 2503 2516 2504 lruvec = mem_cgroup_page_lruvec(head, pgdat); 2517 2505 2518 2506 /* complete memcg works before add pages to LRU */ 2519 2507 mem_cgroup_split_huge_fixup(head); 2508 + 2509 + if (PageAnon(head) && PageSwapCache(head)) { 2510 + swp_entry_t entry = { .val = page_private(head) }; 2511 + 2512 + offset = swp_offset(entry); 2513 + swap_cache = swap_address_space(entry); 2514 + xa_lock(&swap_cache->i_pages); 2515 + } 2520 2516 2521 2517 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { 2522 2518 __split_huge_page_tail(head, i, lruvec, list); ··· 2537 2513 if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) 2538 2514 shmem_uncharge(head->mapping->host, 1); 2539 2515 put_page(head + i); 2516 + } else if (!PageAnon(page)) { 2517 + __xa_store(&head->mapping->i_pages, head[i].index, 2518 + head + i, 0); 2519 + } else if (swap_cache) { 
2520 + __xa_store(&swap_cache->i_pages, offset + i, 2521 + head + i, 0); 2540 2522 } 2541 2523 } 2542 2524 ··· 2553 2523 /* See comment in __split_huge_page_tail() */ 2554 2524 if (PageAnon(head)) { 2555 2525 /* Additional pin to swap cache */ 2556 - if (PageSwapCache(head)) 2526 + if (PageSwapCache(head)) { 2557 2527 page_ref_add(head, 2); 2558 - else 2528 + xa_unlock(&swap_cache->i_pages); 2529 + } else { 2559 2530 page_ref_inc(head); 2531 + } 2560 2532 } else { 2561 2533 /* Additional pin to page cache */ 2562 2534 page_ref_add(head, 2); ··· 2705 2673 { 2706 2674 struct page *head = compound_head(page); 2707 2675 struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); 2676 + struct deferred_split *ds_queue = get_deferred_split_queue(page); 2708 2677 struct anon_vma *anon_vma = NULL; 2709 2678 struct address_space *mapping = NULL; 2710 2679 int count, mapcount, extra_pins, ret; ··· 2792 2759 } 2793 2760 2794 2761 /* Prevent deferred_split_scan() touching ->_refcount */ 2795 - spin_lock(&pgdata->split_queue_lock); 2762 + spin_lock(&ds_queue->split_queue_lock); 2796 2763 count = page_count(head); 2797 2764 mapcount = total_mapcount(head); 2798 2765 if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { 2799 2766 if (!list_empty(page_deferred_list(head))) { 2800 - pgdata->split_queue_len--; 2767 + ds_queue->split_queue_len--; 2801 2768 list_del(page_deferred_list(head)); 2802 2769 } 2803 2770 if (mapping) 2804 2771 __dec_node_page_state(page, NR_SHMEM_THPS); 2805 - spin_unlock(&pgdata->split_queue_lock); 2772 + spin_unlock(&ds_queue->split_queue_lock); 2806 2773 __split_huge_page(page, list, end, flags); 2807 2774 if (PageSwapCache(head)) { 2808 2775 swp_entry_t entry = { .val = page_private(head) }; ··· 2819 2786 dump_page(page, "total_mapcount(head) > 0"); 2820 2787 BUG(); 2821 2788 } 2822 - spin_unlock(&pgdata->split_queue_lock); 2789 + spin_unlock(&ds_queue->split_queue_lock); 2823 2790 fail: if (mapping) 2824 2791 xa_unlock(&mapping->i_pages); 2825 2792 
spin_unlock_irqrestore(&pgdata->lru_lock, flags); ··· 2841 2808 2842 2809 void free_transhuge_page(struct page *page) 2843 2810 { 2844 - struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 2811 + struct deferred_split *ds_queue = get_deferred_split_queue(page); 2845 2812 unsigned long flags; 2846 2813 2847 - spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2814 + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 2848 2815 if (!list_empty(page_deferred_list(page))) { 2849 - pgdata->split_queue_len--; 2816 + ds_queue->split_queue_len--; 2850 2817 list_del(page_deferred_list(page)); 2851 2818 } 2852 - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2819 + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 2853 2820 free_compound_page(page); 2854 2821 } 2855 2822 2856 2823 void deferred_split_huge_page(struct page *page) 2857 2824 { 2858 - struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 2825 + struct deferred_split *ds_queue = get_deferred_split_queue(page); 2826 + #ifdef CONFIG_MEMCG 2827 + struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; 2828 + #endif 2859 2829 unsigned long flags; 2860 2830 2861 2831 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 2862 2832 2863 - spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2833 + /* 2834 + * The try_to_unmap() in page reclaim path might reach here too, 2835 + * this may cause a race condition to corrupt deferred split queue. 2836 + * And, if page reclaim is already handling the same page, it is 2837 + * unnecessary to handle it again in shrinker. 2838 + * 2839 + * Check PageSwapCache to determine if the page is being 2840 + * handled by page reclaim since THP swap would add the page into 2841 + * swap cache before calling try_to_unmap(). 
2842 + */ 2843 + if (PageSwapCache(page)) 2844 + return; 2845 + 2846 + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 2864 2847 if (list_empty(page_deferred_list(page))) { 2865 2848 count_vm_event(THP_DEFERRED_SPLIT_PAGE); 2866 - list_add_tail(page_deferred_list(page), &pgdata->split_queue); 2867 - pgdata->split_queue_len++; 2849 + list_add_tail(page_deferred_list(page), &ds_queue->split_queue); 2850 + ds_queue->split_queue_len++; 2851 + #ifdef CONFIG_MEMCG 2852 + if (memcg) 2853 + memcg_set_shrinker_bit(memcg, page_to_nid(page), 2854 + deferred_split_shrinker.id); 2855 + #endif 2868 2856 } 2869 - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2857 + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 2870 2858 } 2871 2859 2872 2860 static unsigned long deferred_split_count(struct shrinker *shrink, 2873 2861 struct shrink_control *sc) 2874 2862 { 2875 2863 struct pglist_data *pgdata = NODE_DATA(sc->nid); 2876 - return READ_ONCE(pgdata->split_queue_len); 2864 + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; 2865 + 2866 + #ifdef CONFIG_MEMCG 2867 + if (sc->memcg) 2868 + ds_queue = &sc->memcg->deferred_split_queue; 2869 + #endif 2870 + return READ_ONCE(ds_queue->split_queue_len); 2877 2871 } 2878 2872 2879 2873 static unsigned long deferred_split_scan(struct shrinker *shrink, 2880 2874 struct shrink_control *sc) 2881 2875 { 2882 2876 struct pglist_data *pgdata = NODE_DATA(sc->nid); 2877 + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; 2883 2878 unsigned long flags; 2884 2879 LIST_HEAD(list), *pos, *next; 2885 2880 struct page *page; 2886 2881 int split = 0; 2887 2882 2888 - spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2883 + #ifdef CONFIG_MEMCG 2884 + if (sc->memcg) 2885 + ds_queue = &sc->memcg->deferred_split_queue; 2886 + #endif 2887 + 2888 + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 2889 2889 /* Take pin on all head pages to avoid freeing them under us */ 2890 - 
list_for_each_safe(pos, next, &pgdata->split_queue) { 2890 + list_for_each_safe(pos, next, &ds_queue->split_queue) { 2891 2891 page = list_entry((void *)pos, struct page, mapping); 2892 2892 page = compound_head(page); 2893 2893 if (get_page_unless_zero(page)) { ··· 2928 2862 } else { 2929 2863 /* We lost race with put_compound_page() */ 2930 2864 list_del_init(page_deferred_list(page)); 2931 - pgdata->split_queue_len--; 2865 + ds_queue->split_queue_len--; 2932 2866 } 2933 2867 if (!--sc->nr_to_scan) 2934 2868 break; 2935 2869 } 2936 - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2870 + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 2937 2871 2938 2872 list_for_each_safe(pos, next, &list) { 2939 2873 page = list_entry((void *)pos, struct page, mapping); ··· 2947 2881 put_page(page); 2948 2882 } 2949 2883 2950 - spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2951 - list_splice_tail(&list, &pgdata->split_queue); 2952 - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2884 + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); 2885 + list_splice_tail(&list, &ds_queue->split_queue); 2886 + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 2953 2887 2954 2888 /* 2955 2889 * Stop shrinker if we didn't split any page, but the queue is empty. 2956 2890 * This can happen if pages were freed under us. 2957 2891 */ 2958 - if (!split && list_empty(&pgdata->split_queue)) 2892 + if (!split && list_empty(&ds_queue->split_queue)) 2959 2893 return SHRINK_STOP; 2960 2894 return split; 2961 2895 } ··· 2964 2898 .count_objects = deferred_split_count, 2965 2899 .scan_objects = deferred_split_scan, 2966 2900 .seeks = DEFAULT_SEEKS, 2967 - .flags = SHRINKER_NUMA_AWARE, 2901 + .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | 2902 + SHRINKER_NONSLAB, 2968 2903 }; 2969 2904 2970 2905 #ifdef CONFIG_DEBUG_FS
+79 -10
mm/hugetlb.c
··· 1405 1405 } 1406 1406 1407 1407 static struct page *alloc_buddy_huge_page(struct hstate *h, 1408 - gfp_t gfp_mask, int nid, nodemask_t *nmask) 1408 + gfp_t gfp_mask, int nid, nodemask_t *nmask, 1409 + nodemask_t *node_alloc_noretry) 1409 1410 { 1410 1411 int order = huge_page_order(h); 1411 1412 struct page *page; 1413 + bool alloc_try_hard = true; 1412 1414 1413 - gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; 1415 + /* 1416 + * By default we always try hard to allocate the page with 1417 + * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in 1418 + * a loop (to adjust global huge page counts) and previous allocation 1419 + * failed, do not continue to try hard on the same node. Use the 1420 + * node_alloc_noretry bitmap to manage this state information. 1421 + */ 1422 + if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) 1423 + alloc_try_hard = false; 1424 + gfp_mask |= __GFP_COMP|__GFP_NOWARN; 1425 + if (alloc_try_hard) 1426 + gfp_mask |= __GFP_RETRY_MAYFAIL; 1414 1427 if (nid == NUMA_NO_NODE) 1415 1428 nid = numa_mem_id(); 1416 1429 page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); ··· 1431 1418 __count_vm_event(HTLB_BUDDY_PGALLOC); 1432 1419 else 1433 1420 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 1421 + 1422 + /* 1423 + * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this 1424 + * indicates an overall state change. Clear bit so that we resume 1425 + * normal 'try hard' allocations. 1426 + */ 1427 + if (node_alloc_noretry && page && !alloc_try_hard) 1428 + node_clear(nid, *node_alloc_noretry); 1429 + 1430 + /* 1431 + * If we tried hard to get a page but failed, set bit so that 1432 + * subsequent attempts will not try as hard until there is an 1433 + * overall state change. 
1434 + */ 1435 + if (node_alloc_noretry && !page && alloc_try_hard) 1436 + node_set(nid, *node_alloc_noretry); 1434 1437 1435 1438 return page; 1436 1439 } ··· 1456 1427 * should use this function to get new hugetlb pages 1457 1428 */ 1458 1429 static struct page *alloc_fresh_huge_page(struct hstate *h, 1459 - gfp_t gfp_mask, int nid, nodemask_t *nmask) 1430 + gfp_t gfp_mask, int nid, nodemask_t *nmask, 1431 + nodemask_t *node_alloc_noretry) 1460 1432 { 1461 1433 struct page *page; 1462 1434 ··· 1465 1435 page = alloc_gigantic_page(h, gfp_mask, nid, nmask); 1466 1436 else 1467 1437 page = alloc_buddy_huge_page(h, gfp_mask, 1468 - nid, nmask); 1438 + nid, nmask, node_alloc_noretry); 1469 1439 if (!page) 1470 1440 return NULL; 1471 1441 ··· 1480 1450 * Allocates a fresh page to the hugetlb allocator pool in the node interleaved 1481 1451 * manner. 1482 1452 */ 1483 - static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) 1453 + static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 1454 + nodemask_t *node_alloc_noretry) 1484 1455 { 1485 1456 struct page *page; 1486 1457 int nr_nodes, node; 1487 1458 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 1488 1459 1489 1460 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 1490 - page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed); 1461 + page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, 1462 + node_alloc_noretry); 1491 1463 if (page) 1492 1464 break; 1493 1465 } ··· 1633 1601 goto out_unlock; 1634 1602 spin_unlock(&hugetlb_lock); 1635 1603 1636 - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); 1604 + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); 1637 1605 if (!page) 1638 1606 return NULL; 1639 1607 ··· 1669 1637 if (hstate_is_gigantic(h)) 1670 1638 return NULL; 1671 1639 1672 - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); 1640 + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); 1673 1641 if 
(!page) 1674 1642 return NULL; 1675 1643 ··· 2239 2207 static void __init hugetlb_hstate_alloc_pages(struct hstate *h) 2240 2208 { 2241 2209 unsigned long i; 2210 + nodemask_t *node_alloc_noretry; 2211 + 2212 + if (!hstate_is_gigantic(h)) { 2213 + /* 2214 + * Bit mask controlling how hard we retry per-node allocations. 2215 + * Ignore errors as lower level routines can deal with 2216 + * node_alloc_noretry == NULL. If this kmalloc fails at boot 2217 + * time, we are likely in bigger trouble. 2218 + */ 2219 + node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), 2220 + GFP_KERNEL); 2221 + } else { 2222 + /* allocations done at boot time */ 2223 + node_alloc_noretry = NULL; 2224 + } 2225 + 2226 + /* bit mask controlling how hard we retry per-node allocations */ 2227 + if (node_alloc_noretry) 2228 + nodes_clear(*node_alloc_noretry); 2242 2229 2243 2230 for (i = 0; i < h->max_huge_pages; ++i) { 2244 2231 if (hstate_is_gigantic(h)) { 2245 2232 if (!alloc_bootmem_huge_page(h)) 2246 2233 break; 2247 2234 } else if (!alloc_pool_huge_page(h, 2248 - &node_states[N_MEMORY])) 2235 + &node_states[N_MEMORY], 2236 + node_alloc_noretry)) 2249 2237 break; 2250 2238 cond_resched(); 2251 2239 } ··· 2277 2225 h->max_huge_pages, buf, i); 2278 2226 h->max_huge_pages = i; 2279 2227 } 2228 + 2229 + kfree(node_alloc_noretry); 2280 2230 } 2281 2231 2282 2232 static void __init hugetlb_init_hstates(void) ··· 2377 2323 nodemask_t *nodes_allowed) 2378 2324 { 2379 2325 unsigned long min_count, ret; 2326 + NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); 2327 + 2328 + /* 2329 + * Bit mask controlling how hard we retry per-node allocations. 2330 + * If we can not allocate the bit mask, do not attempt to allocate 2331 + * the requested huge pages. 
2332 + */ 2333 + if (node_alloc_noretry) 2334 + nodes_clear(*node_alloc_noretry); 2335 + else 2336 + return -ENOMEM; 2380 2337 2381 2338 spin_lock(&hugetlb_lock); 2382 2339 ··· 2421 2356 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { 2422 2357 if (count > persistent_huge_pages(h)) { 2423 2358 spin_unlock(&hugetlb_lock); 2359 + NODEMASK_FREE(node_alloc_noretry); 2424 2360 return -EINVAL; 2425 2361 } 2426 2362 /* Fall through to decrease pool */ ··· 2454 2388 /* yield cpu to avoid soft lockup */ 2455 2389 cond_resched(); 2456 2390 2457 - ret = alloc_pool_huge_page(h, nodes_allowed); 2391 + ret = alloc_pool_huge_page(h, nodes_allowed, 2392 + node_alloc_noretry); 2458 2393 spin_lock(&hugetlb_lock); 2459 2394 if (!ret) 2460 2395 goto out; ··· 2495 2428 out: 2496 2429 h->max_huge_pages = persistent_huge_pages(h); 2497 2430 spin_unlock(&hugetlb_lock); 2431 + 2432 + NODEMASK_FREE(node_alloc_noretry); 2498 2433 2499 2434 return 0; 2500 2435 }
+1 -1
mm/hugetlb_cgroup.c
··· 139 139 if (!page_hcg || page_hcg != h_cg) 140 140 goto out; 141 141 142 - nr_pages = 1 << compound_order(page); 142 + nr_pages = compound_nr(page); 143 143 if (!parent) { 144 144 parent = root_h_cgroup; 145 145 /* root has no limit */
+1 -1
mm/init-mm.c
··· 35 35 .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), 36 36 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 37 37 .user_ns = &init_user_ns, 38 - .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, 38 + .cpu_bitmap = CPU_BITS_NONE, 39 39 INIT_MM_CONTEXT(init_mm) 40 40 };
+24 -8
mm/kasan/common.c
··· 304 304 struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, 305 305 const void *object) 306 306 { 307 - BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); 308 307 return (void *)object + cache->kasan_info.alloc_meta_offset; 309 308 } 310 309 ··· 314 315 return (void *)object + cache->kasan_info.free_meta_offset; 315 316 } 316 317 318 + 319 + static void kasan_set_free_info(struct kmem_cache *cache, 320 + void *object, u8 tag) 321 + { 322 + struct kasan_alloc_meta *alloc_meta; 323 + u8 idx = 0; 324 + 325 + alloc_meta = get_alloc_info(cache, object); 326 + 327 + #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY 328 + idx = alloc_meta->free_track_idx; 329 + alloc_meta->free_pointer_tag[idx] = tag; 330 + alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; 331 + #endif 332 + 333 + set_track(&alloc_meta->free_track[idx], GFP_NOWAIT); 334 + } 335 + 317 336 void kasan_poison_slab(struct page *page) 318 337 { 319 338 unsigned long i; 320 339 321 - for (i = 0; i < (1 << compound_order(page)); i++) 340 + for (i = 0; i < compound_nr(page); i++) 322 341 page_kasan_tag_reset(page + i); 323 - kasan_poison_shadow(page_address(page), 324 - PAGE_SIZE << compound_order(page), 342 + kasan_poison_shadow(page_address(page), page_size(page), 325 343 KASAN_KMALLOC_REDZONE); 326 344 } 327 345 ··· 468 452 unlikely(!(cache->flags & SLAB_KASAN))) 469 453 return false; 470 454 471 - set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); 455 + kasan_set_free_info(cache, object, tag); 456 + 472 457 quarantine_put(get_free_info(cache, object), cache); 473 458 474 459 return IS_ENABLED(CONFIG_KASAN_GENERIC); ··· 541 524 page = virt_to_page(ptr); 542 525 redzone_start = round_up((unsigned long)(ptr + size), 543 526 KASAN_SHADOW_SCALE_SIZE); 544 - redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page)); 527 + redzone_end = (unsigned long)ptr + page_size(page); 545 528 546 529 kasan_unpoison_shadow(ptr, size); 547 530 kasan_poison_shadow((void 
*)redzone_start, redzone_end - redzone_start, ··· 577 560 kasan_report_invalid_free(ptr, ip); 578 561 return; 579 562 } 580 - kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), 581 - KASAN_FREE_PAGE); 563 + kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE); 582 564 } else { 583 565 __kasan_slab_free(page->slab_cache, ptr, ip, false); 584 566 }
+13 -1
mm/kasan/kasan.h
··· 95 95 depot_stack_handle_t stack; 96 96 }; 97 97 98 + #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY 99 + #define KASAN_NR_FREE_STACKS 5 100 + #else 101 + #define KASAN_NR_FREE_STACKS 1 102 + #endif 103 + 98 104 struct kasan_alloc_meta { 99 105 struct kasan_track alloc_track; 100 - struct kasan_track free_track; 106 + struct kasan_track free_track[KASAN_NR_FREE_STACKS]; 107 + #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY 108 + u8 free_pointer_tag[KASAN_NR_FREE_STACKS]; 109 + u8 free_track_idx; 110 + #endif 101 111 }; 102 112 103 113 struct qlist_node { ··· 155 145 void kasan_report(unsigned long addr, size_t size, 156 146 bool is_write, unsigned long ip); 157 147 void kasan_report_invalid_free(void *object, unsigned long ip); 148 + 149 + struct page *kasan_addr_to_page(const void *addr); 158 150 159 151 #if defined(CONFIG_KASAN_GENERIC) && \ 160 152 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
+34 -10
mm/kasan/report.c
··· 111 111 } 112 112 } 113 113 114 - static struct page *addr_to_page(const void *addr) 114 + struct page *kasan_addr_to_page(const void *addr) 115 115 { 116 116 if ((addr >= (void *)PAGE_OFFSET) && 117 117 (addr < high_memory)) ··· 151 151 (void *)(object_addr + cache->object_size)); 152 152 } 153 153 154 + static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, 155 + void *object, u8 tag) 156 + { 157 + struct kasan_alloc_meta *alloc_meta; 158 + int i = 0; 159 + 160 + alloc_meta = get_alloc_info(cache, object); 161 + 162 + #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY 163 + for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { 164 + if (alloc_meta->free_pointer_tag[i] == tag) 165 + break; 166 + } 167 + if (i == KASAN_NR_FREE_STACKS) 168 + i = alloc_meta->free_track_idx; 169 + #endif 170 + 171 + return &alloc_meta->free_track[i]; 172 + } 173 + 154 174 static void describe_object(struct kmem_cache *cache, void *object, 155 - const void *addr) 175 + const void *addr, u8 tag) 156 176 { 157 177 struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); 158 178 159 179 if (cache->flags & SLAB_KASAN) { 180 + struct kasan_track *free_track; 181 + 160 182 print_track(&alloc_info->alloc_track, "Allocated"); 161 183 pr_err("\n"); 162 - print_track(&alloc_info->free_track, "Freed"); 184 + free_track = kasan_get_free_track(cache, object, tag); 185 + print_track(free_track, "Freed"); 163 186 pr_err("\n"); 164 187 } 165 188 ··· 367 344 print_decoded_frame_descr(frame_descr); 368 345 } 369 346 370 - static void print_address_description(void *addr) 347 + static void print_address_description(void *addr, u8 tag) 371 348 { 372 - struct page *page = addr_to_page(addr); 349 + struct page *page = kasan_addr_to_page(addr); 373 350 374 351 dump_stack(); 375 352 pr_err("\n"); ··· 378 355 struct kmem_cache *cache = page->slab_cache; 379 356 void *object = nearest_obj(cache, page, addr); 380 357 381 - describe_object(cache, object, addr); 358 + describe_object(cache, object, 
addr, tag); 382 359 } 383 360 384 361 if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { ··· 458 435 void kasan_report_invalid_free(void *object, unsigned long ip) 459 436 { 460 437 unsigned long flags; 438 + u8 tag = get_tag(object); 461 439 440 + object = reset_tag(object); 462 441 start_report(&flags); 463 442 pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); 464 - print_tags(get_tag(object), reset_tag(object)); 465 - object = reset_tag(object); 443 + print_tags(tag, object); 466 444 pr_err("\n"); 467 - print_address_description(object); 445 + print_address_description(object, tag); 468 446 pr_err("\n"); 469 447 print_shadow_for_address(object); 470 448 end_report(&flags); ··· 503 479 pr_err("\n"); 504 480 505 481 if (addr_has_shadow(untagged_addr)) { 506 - print_address_description(untagged_addr); 482 + print_address_description(untagged_addr, get_tag(tagged_addr)); 507 483 pr_err("\n"); 508 484 print_shadow_for_address(info.first_bad_addr); 509 485 } else {
+24
mm/kasan/tags_report.c
··· 36 36 37 37 const char *get_bug_type(struct kasan_access_info *info) 38 38 { 39 + #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY 40 + struct kasan_alloc_meta *alloc_meta; 41 + struct kmem_cache *cache; 42 + struct page *page; 43 + const void *addr; 44 + void *object; 45 + u8 tag; 46 + int i; 47 + 48 + tag = get_tag(info->access_addr); 49 + addr = reset_tag(info->access_addr); 50 + page = kasan_addr_to_page(addr); 51 + if (page && PageSlab(page)) { 52 + cache = page->slab_cache; 53 + object = nearest_obj(cache, page, (void *)addr); 54 + alloc_meta = get_alloc_info(cache, object); 55 + 56 + for (i = 0; i < KASAN_NR_FREE_STACKS; i++) 57 + if (alloc_meta->free_pointer_tag[i] == tag) 58 + return "use-after-free"; 59 + return "out-of-bounds"; 60 + } 61 + 62 + #endif 39 63 return "invalid-access"; 40 64 } 41 65
+308 -58
mm/khugepaged.c
··· 48 48 SCAN_CGROUP_CHARGE_FAIL, 49 49 SCAN_EXCEED_SWAP_PTE, 50 50 SCAN_TRUNCATED, 51 + SCAN_PAGE_HAS_PRIVATE, 51 52 }; 52 53 53 54 #define CREATE_TRACE_POINTS ··· 77 76 78 77 static struct kmem_cache *mm_slot_cache __read_mostly; 79 78 79 + #define MAX_PTE_MAPPED_THP 8 80 + 80 81 /** 81 82 * struct mm_slot - hash lookup from mm to mm_slot 82 83 * @hash: hash collision list ··· 89 86 struct hlist_node hash; 90 87 struct list_head mm_node; 91 88 struct mm_struct *mm; 89 + 90 + /* pte-mapped THP in this mm */ 91 + int nr_pte_mapped_thp; 92 + unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP]; 92 93 }; 93 94 94 95 /** ··· 411 404 (vm_flags & VM_NOHUGEPAGE) || 412 405 test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) 413 406 return false; 414 - if (shmem_file(vma->vm_file)) { 407 + 408 + if (shmem_file(vma->vm_file) || 409 + (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && 410 + vma->vm_file && 411 + (vm_flags & VM_DENYWRITE))) { 415 412 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) 416 413 return false; 417 414 return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, ··· 467 456 unsigned long hstart, hend; 468 457 469 458 /* 470 - * khugepaged does not yet work on non-shmem files or special 471 - * mappings. And file-private shmem THP is not supported. 459 + * khugepaged only supports read-only files for non-shmem files. 460 + * khugepaged does not yet work on special mappings. And 461 + * file-private shmem THP is not supported. 472 462 */ 473 463 if (!hugepage_vma_check(vma, vm_flags)) 474 464 return 0; ··· 1260 1248 } 1261 1249 1262 1250 #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) 1251 + /* 1252 + * Notify khugepaged that given addr of the mm is pte-mapped THP. Then 1253 + * khugepaged should try to collapse the page table. 
1254 + */ 1255 + static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, 1256 + unsigned long addr) 1257 + { 1258 + struct mm_slot *mm_slot; 1259 + 1260 + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); 1261 + 1262 + spin_lock(&khugepaged_mm_lock); 1263 + mm_slot = get_mm_slot(mm); 1264 + if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) 1265 + mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; 1266 + spin_unlock(&khugepaged_mm_lock); 1267 + return 0; 1268 + } 1269 + 1270 + /** 1271 + * Try to collapse a pte-mapped THP for mm at address haddr. 1272 + * 1273 + * This function checks whether all the PTEs in the PMD are pointing to the 1274 + * right THP. If so, retract the page table so the THP can refault in with 1275 + * as pmd-mapped. 1276 + */ 1277 + void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) 1278 + { 1279 + unsigned long haddr = addr & HPAGE_PMD_MASK; 1280 + struct vm_area_struct *vma = find_vma(mm, haddr); 1281 + struct page *hpage = NULL; 1282 + pte_t *start_pte, *pte; 1283 + pmd_t *pmd, _pmd; 1284 + spinlock_t *ptl; 1285 + int count = 0; 1286 + int i; 1287 + 1288 + if (!vma || !vma->vm_file || 1289 + vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE) 1290 + return; 1291 + 1292 + /* 1293 + * This vm_flags may not have VM_HUGEPAGE if the page was not 1294 + * collapsed by this mm. But we can still collapse if the page is 1295 + * the valid THP. 
Add extra VM_HUGEPAGE so hugepage_vma_check() 1296 + * will not fail the vma for missing VM_HUGEPAGE 1297 + */ 1298 + if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE)) 1299 + return; 1300 + 1301 + pmd = mm_find_pmd(mm, haddr); 1302 + if (!pmd) 1303 + return; 1304 + 1305 + start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); 1306 + 1307 + /* step 1: check all mapped PTEs are to the right huge page */ 1308 + for (i = 0, addr = haddr, pte = start_pte; 1309 + i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { 1310 + struct page *page; 1311 + 1312 + /* empty pte, skip */ 1313 + if (pte_none(*pte)) 1314 + continue; 1315 + 1316 + /* page swapped out, abort */ 1317 + if (!pte_present(*pte)) 1318 + goto abort; 1319 + 1320 + page = vm_normal_page(vma, addr, *pte); 1321 + 1322 + if (!page || !PageCompound(page)) 1323 + goto abort; 1324 + 1325 + if (!hpage) { 1326 + hpage = compound_head(page); 1327 + /* 1328 + * The mapping of the THP should not change. 1329 + * 1330 + * Note that uprobe, debugger, or MAP_PRIVATE may 1331 + * change the page table, but the new page will 1332 + * not pass PageCompound() check. 1333 + */ 1334 + if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping)) 1335 + goto abort; 1336 + } 1337 + 1338 + /* 1339 + * Confirm the page maps to the correct subpage. 1340 + * 1341 + * Note that uprobe, debugger, or MAP_PRIVATE may change 1342 + * the page table, but the new page will not pass 1343 + * PageCompound() check. 1344 + */ 1345 + if (WARN_ON(hpage + i != page)) 1346 + goto abort; 1347 + count++; 1348 + } 1349 + 1350 + /* step 2: adjust rmap */ 1351 + for (i = 0, addr = haddr, pte = start_pte; 1352 + i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { 1353 + struct page *page; 1354 + 1355 + if (pte_none(*pte)) 1356 + continue; 1357 + page = vm_normal_page(vma, addr, *pte); 1358 + page_remove_rmap(page, false); 1359 + } 1360 + 1361 + pte_unmap_unlock(start_pte, ptl); 1362 + 1363 + /* step 3: set proper refcount and mm_counters. 
*/ 1364 + if (hpage) { 1365 + page_ref_sub(hpage, count); 1366 + add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); 1367 + } 1368 + 1369 + /* step 4: collapse pmd */ 1370 + ptl = pmd_lock(vma->vm_mm, pmd); 1371 + _pmd = pmdp_collapse_flush(vma, addr, pmd); 1372 + spin_unlock(ptl); 1373 + mm_dec_nr_ptes(mm); 1374 + pte_free(mm, pmd_pgtable(_pmd)); 1375 + return; 1376 + 1377 + abort: 1378 + pte_unmap_unlock(start_pte, ptl); 1379 + } 1380 + 1381 + static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) 1382 + { 1383 + struct mm_struct *mm = mm_slot->mm; 1384 + int i; 1385 + 1386 + if (likely(mm_slot->nr_pte_mapped_thp == 0)) 1387 + return 0; 1388 + 1389 + if (!down_write_trylock(&mm->mmap_sem)) 1390 + return -EBUSY; 1391 + 1392 + if (unlikely(khugepaged_test_exit(mm))) 1393 + goto out; 1394 + 1395 + for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) 1396 + collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]); 1397 + 1398 + out: 1399 + mm_slot->nr_pte_mapped_thp = 0; 1400 + up_write(&mm->mmap_sem); 1401 + return 0; 1402 + } 1403 + 1263 1404 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) 1264 1405 { 1265 1406 struct vm_area_struct *vma; ··· 1421 1256 1422 1257 i_mmap_lock_write(mapping); 1423 1258 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1424 - /* probably overkill */ 1259 + /* 1260 + * Check vma->anon_vma to exclude MAP_PRIVATE mappings that 1261 + * got written to. These VMAs are likely not worth investing 1262 + * down_write(mmap_sem) as PMD-mapping is likely to be split 1263 + * later. 1264 + * 1265 + * Not that vma->anon_vma check is racy: it can be set up after 1266 + * the check but before we took mmap_sem by the fault path. 1267 + * But page lock would prevent establishing any new ptes of the 1268 + * page, so we are safe. 1269 + * 1270 + * An alternative would be drop the check, but check that page 1271 + * table is clear before calling pmdp_collapse_flush() under 1272 + * ptl. 
It has higher chance to recover THP for the VMA, but 1273 + * has higher cost too. 1274 + */ 1425 1275 if (vma->anon_vma) 1426 1276 continue; 1427 1277 addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); ··· 1449 1269 continue; 1450 1270 /* 1451 1271 * We need exclusive mmap_sem to retract page table. 1452 - * If trylock fails we would end up with pte-mapped THP after 1453 - * re-fault. Not ideal, but it's more important to not disturb 1454 - * the system too much. 1272 + * 1273 + * We use trylock due to lock inversion: we need to acquire 1274 + * mmap_sem while holding page lock. Fault path does it in 1275 + * reverse order. Trylock is a way to avoid deadlock. 1455 1276 */ 1456 1277 if (down_write_trylock(&vma->vm_mm->mmap_sem)) { 1457 1278 spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); ··· 1462 1281 up_write(&vma->vm_mm->mmap_sem); 1463 1282 mm_dec_nr_ptes(vma->vm_mm); 1464 1283 pte_free(vma->vm_mm, pmd_pgtable(_pmd)); 1284 + } else { 1285 + /* Try again later */ 1286 + khugepaged_add_pte_mapped_thp(vma->vm_mm, addr); 1465 1287 } 1466 1288 } 1467 1289 i_mmap_unlock_write(mapping); 1468 1290 } 1469 1291 1470 1292 /** 1471 - * collapse_shmem - collapse small tmpfs/shmem pages into huge one. 1293 + * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. 
1472 1294 * 1473 1295 * Basic scheme is simple, details are more complex: 1474 1296 * - allocate and lock a new huge page; 1475 1297 * - scan page cache replacing old pages with the new one 1476 - * + swap in pages if necessary; 1298 + * + swap/gup in pages if necessary; 1477 1299 * + fill in gaps; 1478 1300 * + keep old pages around in case rollback is required; 1479 1301 * - if replacing succeeds: ··· 1488 1304 * + restore gaps in the page cache; 1489 1305 * + unlock and free huge page; 1490 1306 */ 1491 - static void collapse_shmem(struct mm_struct *mm, 1492 - struct address_space *mapping, pgoff_t start, 1307 + static void collapse_file(struct mm_struct *mm, 1308 + struct file *file, pgoff_t start, 1493 1309 struct page **hpage, int node) 1494 1310 { 1311 + struct address_space *mapping = file->f_mapping; 1495 1312 gfp_t gfp; 1496 1313 struct page *new_page; 1497 1314 struct mem_cgroup *memcg; ··· 1500 1315 LIST_HEAD(pagelist); 1501 1316 XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); 1502 1317 int nr_none = 0, result = SCAN_SUCCEED; 1318 + bool is_shmem = shmem_file(file); 1503 1319 1320 + VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); 1504 1321 VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); 1505 1322 1506 1323 /* Only allocate from the target node */ ··· 1534 1347 } while (1); 1535 1348 1536 1349 __SetPageLocked(new_page); 1537 - __SetPageSwapBacked(new_page); 1350 + if (is_shmem) 1351 + __SetPageSwapBacked(new_page); 1538 1352 new_page->index = start; 1539 1353 new_page->mapping = mapping; 1540 1354 ··· 1550 1362 struct page *page = xas_next(&xas); 1551 1363 1552 1364 VM_BUG_ON(index != xas.xa_index); 1553 - if (!page) { 1554 - /* 1555 - * Stop if extent has been truncated or hole-punched, 1556 - * and is now completely empty. 
1557 - */ 1558 - if (index == start) { 1559 - if (!xas_next_entry(&xas, end - 1)) { 1560 - result = SCAN_TRUNCATED; 1365 + if (is_shmem) { 1366 + if (!page) { 1367 + /* 1368 + * Stop if extent has been truncated or 1369 + * hole-punched, and is now completely 1370 + * empty. 1371 + */ 1372 + if (index == start) { 1373 + if (!xas_next_entry(&xas, end - 1)) { 1374 + result = SCAN_TRUNCATED; 1375 + goto xa_locked; 1376 + } 1377 + xas_set(&xas, index); 1378 + } 1379 + if (!shmem_charge(mapping->host, 1)) { 1380 + result = SCAN_FAIL; 1561 1381 goto xa_locked; 1562 1382 } 1563 - xas_set(&xas, index); 1383 + xas_store(&xas, new_page); 1384 + nr_none++; 1385 + continue; 1564 1386 } 1565 - if (!shmem_charge(mapping->host, 1)) { 1566 - result = SCAN_FAIL; 1387 + 1388 + if (xa_is_value(page) || !PageUptodate(page)) { 1389 + xas_unlock_irq(&xas); 1390 + /* swap in or instantiate fallocated page */ 1391 + if (shmem_getpage(mapping->host, index, &page, 1392 + SGP_NOHUGE)) { 1393 + result = SCAN_FAIL; 1394 + goto xa_unlocked; 1395 + } 1396 + } else if (trylock_page(page)) { 1397 + get_page(page); 1398 + xas_unlock_irq(&xas); 1399 + } else { 1400 + result = SCAN_PAGE_LOCK; 1567 1401 goto xa_locked; 1568 1402 } 1569 - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); 1570 - nr_none++; 1571 - continue; 1572 - } 1573 - 1574 - if (xa_is_value(page) || !PageUptodate(page)) { 1575 - xas_unlock_irq(&xas); 1576 - /* swap in or instantiate fallocated page */ 1577 - if (shmem_getpage(mapping->host, index, &page, 1578 - SGP_NOHUGE)) { 1403 + } else { /* !is_shmem */ 1404 + if (!page || xa_is_value(page)) { 1405 + xas_unlock_irq(&xas); 1406 + page_cache_sync_readahead(mapping, &file->f_ra, 1407 + file, index, 1408 + PAGE_SIZE); 1409 + /* drain pagevecs to help isolate_lru_page() */ 1410 + lru_add_drain(); 1411 + page = find_lock_page(mapping, index); 1412 + if (unlikely(page == NULL)) { 1413 + result = SCAN_FAIL; 1414 + goto xa_unlocked; 1415 + } 1416 + } else if (!PageUptodate(page)) { 
1417 + xas_unlock_irq(&xas); 1418 + wait_on_page_locked(page); 1419 + if (!trylock_page(page)) { 1420 + result = SCAN_PAGE_LOCK; 1421 + goto xa_unlocked; 1422 + } 1423 + get_page(page); 1424 + } else if (PageDirty(page)) { 1579 1425 result = SCAN_FAIL; 1580 - goto xa_unlocked; 1426 + goto xa_locked; 1427 + } else if (trylock_page(page)) { 1428 + get_page(page); 1429 + xas_unlock_irq(&xas); 1430 + } else { 1431 + result = SCAN_PAGE_LOCK; 1432 + goto xa_locked; 1581 1433 } 1582 - } else if (trylock_page(page)) { 1583 - get_page(page); 1584 - xas_unlock_irq(&xas); 1585 - } else { 1586 - result = SCAN_PAGE_LOCK; 1587 - goto xa_locked; 1588 1434 } 1589 1435 1590 1436 /* ··· 1644 1422 1645 1423 if (isolate_lru_page(page)) { 1646 1424 result = SCAN_DEL_PAGE_LRU; 1425 + goto out_unlock; 1426 + } 1427 + 1428 + if (page_has_private(page) && 1429 + !try_to_release_page(page, GFP_KERNEL)) { 1430 + result = SCAN_PAGE_HAS_PRIVATE; 1647 1431 goto out_unlock; 1648 1432 } 1649 1433 ··· 1682 1454 list_add_tail(&page->lru, &pagelist); 1683 1455 1684 1456 /* Finally, replace with the new page. 
*/ 1685 - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); 1457 + xas_store(&xas, new_page); 1686 1458 continue; 1687 1459 out_unlock: 1688 1460 unlock_page(page); ··· 1690 1462 goto xa_unlocked; 1691 1463 } 1692 1464 1693 - __inc_node_page_state(new_page, NR_SHMEM_THPS); 1465 + if (is_shmem) 1466 + __inc_node_page_state(new_page, NR_SHMEM_THPS); 1467 + else { 1468 + __inc_node_page_state(new_page, NR_FILE_THPS); 1469 + filemap_nr_thps_inc(mapping); 1470 + } 1471 + 1694 1472 if (nr_none) { 1695 1473 struct zone *zone = page_zone(new_page); 1696 1474 1697 1475 __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); 1698 - __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); 1476 + if (is_shmem) 1477 + __mod_node_page_state(zone->zone_pgdat, 1478 + NR_SHMEM, nr_none); 1699 1479 } 1700 1480 1701 1481 xa_locked: ··· 1741 1505 1742 1506 SetPageUptodate(new_page); 1743 1507 page_ref_add(new_page, HPAGE_PMD_NR - 1); 1744 - set_page_dirty(new_page); 1745 1508 mem_cgroup_commit_charge(new_page, memcg, false, true); 1509 + 1510 + if (is_shmem) { 1511 + set_page_dirty(new_page); 1512 + lru_cache_add_anon(new_page); 1513 + } else { 1514 + lru_cache_add_file(new_page); 1515 + } 1746 1516 count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); 1747 - lru_cache_add_anon(new_page); 1748 1517 1749 1518 /* 1750 1519 * Remove pte page tables, so we can re-fault the page as huge. 
··· 1764 1523 /* Something went wrong: roll back page cache changes */ 1765 1524 xas_lock_irq(&xas); 1766 1525 mapping->nrpages -= nr_none; 1767 - shmem_uncharge(mapping->host, nr_none); 1526 + 1527 + if (is_shmem) 1528 + shmem_uncharge(mapping->host, nr_none); 1768 1529 1769 1530 xas_set(&xas, start); 1770 1531 xas_for_each(&xas, page, end - 1) { ··· 1806 1563 /* TODO: tracepoints */ 1807 1564 } 1808 1565 1809 - static void khugepaged_scan_shmem(struct mm_struct *mm, 1810 - struct address_space *mapping, 1811 - pgoff_t start, struct page **hpage) 1566 + static void khugepaged_scan_file(struct mm_struct *mm, 1567 + struct file *file, pgoff_t start, struct page **hpage) 1812 1568 { 1813 1569 struct page *page = NULL; 1570 + struct address_space *mapping = file->f_mapping; 1814 1571 XA_STATE(xas, &mapping->i_pages, start); 1815 1572 int present, swap; 1816 1573 int node = NUMA_NO_NODE; ··· 1849 1606 break; 1850 1607 } 1851 1608 1852 - if (page_count(page) != 1 + page_mapcount(page)) { 1609 + if (page_count(page) != 1610 + 1 + page_mapcount(page) + page_has_private(page)) { 1853 1611 result = SCAN_PAGE_COUNT; 1854 1612 break; 1855 1613 } ··· 1875 1631 result = SCAN_EXCEED_NONE_PTE; 1876 1632 } else { 1877 1633 node = khugepaged_find_target_node(); 1878 - collapse_shmem(mm, mapping, start, hpage, node); 1634 + collapse_file(mm, file, start, hpage, node); 1879 1635 } 1880 1636 } 1881 1637 1882 1638 /* TODO: tracepoints */ 1883 1639 } 1884 1640 #else 1885 - static void khugepaged_scan_shmem(struct mm_struct *mm, 1886 - struct address_space *mapping, 1887 - pgoff_t start, struct page **hpage) 1641 + static void khugepaged_scan_file(struct mm_struct *mm, 1642 + struct file *file, pgoff_t start, struct page **hpage) 1888 1643 { 1889 1644 BUILD_BUG(); 1645 + } 1646 + 1647 + static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) 1648 + { 1649 + return 0; 1890 1650 } 1891 1651 #endif 1892 1652 ··· 1916 1668 khugepaged_scan.mm_slot = mm_slot; 1917 1669 } 1918 
1670 spin_unlock(&khugepaged_mm_lock); 1671 + khugepaged_collapse_pte_mapped_thps(mm_slot); 1919 1672 1920 1673 mm = mm_slot->mm; 1921 1674 /* ··· 1962 1713 VM_BUG_ON(khugepaged_scan.address < hstart || 1963 1714 khugepaged_scan.address + HPAGE_PMD_SIZE > 1964 1715 hend); 1965 - if (shmem_file(vma->vm_file)) { 1716 + if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { 1966 1717 struct file *file; 1967 1718 pgoff_t pgoff = linear_page_index(vma, 1968 1719 khugepaged_scan.address); 1969 - if (!shmem_huge_enabled(vma)) 1720 + 1721 + if (shmem_file(vma->vm_file) 1722 + && !shmem_huge_enabled(vma)) 1970 1723 goto skip; 1971 1724 file = get_file(vma->vm_file); 1972 1725 up_read(&mm->mmap_sem); 1973 1726 ret = 1; 1974 - khugepaged_scan_shmem(mm, file->f_mapping, 1975 - pgoff, hpage); 1727 + khugepaged_scan_file(mm, file, pgoff, hpage); 1976 1728 fput(file); 1977 1729 } else { 1978 1730 ret = khugepaged_scan_pmd(mm, vma,
+84 -242
mm/kmemleak.c
··· 168 168 #define OBJECT_REPORTED (1 << 1) 169 169 /* flag set to not scan the object */ 170 170 #define OBJECT_NO_SCAN (1 << 2) 171 + /* flag set to fully scan the object when scan_area allocation failed */ 172 + #define OBJECT_FULL_SCAN (1 << 3) 171 173 172 174 #define HEX_PREFIX " " 173 175 /* number of bytes to print per line; must be 16 or 32 */ ··· 185 183 static LIST_HEAD(object_list); 186 184 /* the list of gray-colored objects (see color_gray comment below) */ 187 185 static LIST_HEAD(gray_list); 186 + /* memory pool allocation */ 187 + static struct kmemleak_object mem_pool[CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE]; 188 + static int mem_pool_free_count = ARRAY_SIZE(mem_pool); 189 + static LIST_HEAD(mem_pool_free_list); 188 190 /* search tree for object boundaries */ 189 191 static struct rb_root object_tree_root = RB_ROOT; 190 192 /* rw_lock protecting the access to object_list and object_tree_root */ ··· 199 193 static struct kmem_cache *scan_area_cache; 200 194 201 195 /* set if tracing memory operations is enabled */ 202 - static int kmemleak_enabled; 196 + static int kmemleak_enabled = 1; 203 197 /* same as above but only for the kmemleak_free() callback */ 204 - static int kmemleak_free_enabled; 198 + static int kmemleak_free_enabled = 1; 205 199 /* set in the late_initcall if there were no errors */ 206 200 static int kmemleak_initialized; 207 - /* enables or disables early logging of the memory operations */ 208 - static int kmemleak_early_log = 1; 209 201 /* set if a kmemleak warning was issued */ 210 202 static int kmemleak_warning; 211 203 /* set if a fatal kmemleak error has occurred */ ··· 230 226 231 227 static bool kmemleak_verbose; 232 228 module_param_named(verbose, kmemleak_verbose, bool, 0600); 233 - 234 - /* 235 - * Early object allocation/freeing logging. Kmemleak is initialized after the 236 - * kernel allocator. However, both the kernel allocator and kmemleak may 237 - * allocate memory blocks which need to be tracked. 
Kmemleak defines an 238 - * arbitrary buffer to hold the allocation/freeing information before it is 239 - * fully initialized. 240 - */ 241 - 242 - /* kmemleak operation type for early logging */ 243 - enum { 244 - KMEMLEAK_ALLOC, 245 - KMEMLEAK_ALLOC_PERCPU, 246 - KMEMLEAK_FREE, 247 - KMEMLEAK_FREE_PART, 248 - KMEMLEAK_FREE_PERCPU, 249 - KMEMLEAK_NOT_LEAK, 250 - KMEMLEAK_IGNORE, 251 - KMEMLEAK_SCAN_AREA, 252 - KMEMLEAK_NO_SCAN, 253 - KMEMLEAK_SET_EXCESS_REF 254 - }; 255 - 256 - /* 257 - * Structure holding the information passed to kmemleak callbacks during the 258 - * early logging. 259 - */ 260 - struct early_log { 261 - int op_type; /* kmemleak operation type */ 262 - int min_count; /* minimum reference count */ 263 - const void *ptr; /* allocated/freed memory block */ 264 - union { 265 - size_t size; /* memory block size */ 266 - unsigned long excess_ref; /* surplus reference passing */ 267 - }; 268 - unsigned long trace[MAX_TRACE]; /* stack trace */ 269 - unsigned int trace_len; /* stack trace length */ 270 - }; 271 - 272 - /* early logging buffer and current position */ 273 - static struct early_log 274 - early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata; 275 - static int crt_early_log __initdata; 276 229 277 230 static void kmemleak_disable(void); 278 231 ··· 411 450 } 412 451 413 452 /* 453 + * Memory pool allocation and freeing. kmemleak_lock must not be held. 
454 + */ 455 + static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) 456 + { 457 + unsigned long flags; 458 + struct kmemleak_object *object; 459 + 460 + /* try the slab allocator first */ 461 + if (object_cache) { 462 + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); 463 + if (object) 464 + return object; 465 + } 466 + 467 + /* slab allocation failed, try the memory pool */ 468 + write_lock_irqsave(&kmemleak_lock, flags); 469 + object = list_first_entry_or_null(&mem_pool_free_list, 470 + typeof(*object), object_list); 471 + if (object) 472 + list_del(&object->object_list); 473 + else if (mem_pool_free_count) 474 + object = &mem_pool[--mem_pool_free_count]; 475 + else 476 + pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n"); 477 + write_unlock_irqrestore(&kmemleak_lock, flags); 478 + 479 + return object; 480 + } 481 + 482 + /* 483 + * Return the object to either the slab allocator or the memory pool. 484 + */ 485 + static void mem_pool_free(struct kmemleak_object *object) 486 + { 487 + unsigned long flags; 488 + 489 + if (object < mem_pool || object >= mem_pool + ARRAY_SIZE(mem_pool)) { 490 + kmem_cache_free(object_cache, object); 491 + return; 492 + } 493 + 494 + /* add the object to the memory pool free list */ 495 + write_lock_irqsave(&kmemleak_lock, flags); 496 + list_add(&object->object_list, &mem_pool_free_list); 497 + write_unlock_irqrestore(&kmemleak_lock, flags); 498 + } 499 + 500 + /* 414 501 * RCU callback to free a kmemleak_object. 
415 502 */ 416 503 static void free_object_rcu(struct rcu_head *rcu) ··· 476 467 hlist_del(&area->node); 477 468 kmem_cache_free(scan_area_cache, area); 478 469 } 479 - kmem_cache_free(object_cache, object); 470 + mem_pool_free(object); 480 471 } 481 472 482 473 /* ··· 494 485 /* should only get here after delete_object was called */ 495 486 WARN_ON(object->flags & OBJECT_ALLOCATED); 496 487 497 - call_rcu(&object->rcu, free_object_rcu); 488 + /* 489 + * It may be too early for the RCU callbacks, however, there is no 490 + * concurrent object_list traversal when !object_cache and all objects 491 + * came from the memory pool. Free the object directly. 492 + */ 493 + if (object_cache) 494 + call_rcu(&object->rcu, free_object_rcu); 495 + else 496 + free_object_rcu(&object->rcu); 498 497 } 499 498 500 499 /* ··· 567 550 struct rb_node **link, *rb_parent; 568 551 unsigned long untagged_ptr; 569 552 570 - object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); 553 + object = mem_pool_alloc(gfp); 571 554 if (!object) { 572 555 pr_warn("Cannot allocate a kmemleak_object structure\n"); 573 556 kmemleak_disable(); ··· 706 689 /* 707 690 * Create one or two objects that may result from the memory block 708 691 * split. Note that partial freeing is only done by free_bootmem() and 709 - * this happens before kmemleak_init() is called. The path below is 710 - * only executed during early log recording in kmemleak_init(), so 711 - * GFP_KERNEL is enough. 692 + * this happens before kmemleak_init() is called. 
712 693 */ 713 694 start = object->pointer; 714 695 end = object->pointer + object->size; ··· 778 763 { 779 764 unsigned long flags; 780 765 struct kmemleak_object *object; 781 - struct kmemleak_scan_area *area; 766 + struct kmemleak_scan_area *area = NULL; 782 767 783 768 object = find_and_get_object(ptr, 1); 784 769 if (!object) { ··· 787 772 return; 788 773 } 789 774 790 - area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); 791 - if (!area) { 792 - pr_warn("Cannot allocate a scan area\n"); 793 - goto out; 794 - } 775 + if (scan_area_cache) 776 + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); 795 777 796 778 spin_lock_irqsave(&object->lock, flags); 779 + if (!area) { 780 + pr_warn_once("Cannot allocate a scan area, scanning the full object\n"); 781 + /* mark the object for full scan to avoid false positives */ 782 + object->flags |= OBJECT_FULL_SCAN; 783 + goto out_unlock; 784 + } 797 785 if (size == SIZE_MAX) { 798 786 size = object->pointer + object->size - ptr; 799 787 } else if (ptr + size > object->pointer + object->size) { ··· 813 795 hlist_add_head(&area->node, &object->area_list); 814 796 out_unlock: 815 797 spin_unlock_irqrestore(&object->lock, flags); 816 - out: 817 798 put_object(object); 818 799 } 819 800 ··· 862 845 put_object(object); 863 846 } 864 847 865 - /* 866 - * Log an early kmemleak_* call to the early_log buffer. These calls will be 867 - * processed later once kmemleak is fully initialized. 
868 - */ 869 - static void __init log_early(int op_type, const void *ptr, size_t size, 870 - int min_count) 871 - { 872 - unsigned long flags; 873 - struct early_log *log; 874 - 875 - if (kmemleak_error) { 876 - /* kmemleak stopped recording, just count the requests */ 877 - crt_early_log++; 878 - return; 879 - } 880 - 881 - if (crt_early_log >= ARRAY_SIZE(early_log)) { 882 - crt_early_log++; 883 - kmemleak_disable(); 884 - return; 885 - } 886 - 887 - /* 888 - * There is no need for locking since the kernel is still in UP mode 889 - * at this stage. Disabling the IRQs is enough. 890 - */ 891 - local_irq_save(flags); 892 - log = &early_log[crt_early_log]; 893 - log->op_type = op_type; 894 - log->ptr = ptr; 895 - log->size = size; 896 - log->min_count = min_count; 897 - log->trace_len = __save_stack_trace(log->trace); 898 - crt_early_log++; 899 - local_irq_restore(flags); 900 - } 901 - 902 - /* 903 - * Log an early allocated block and populate the stack trace. 904 - */ 905 - static void early_alloc(struct early_log *log) 906 - { 907 - struct kmemleak_object *object; 908 - unsigned long flags; 909 - int i; 910 - 911 - if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr)) 912 - return; 913 - 914 - /* 915 - * RCU locking needed to ensure object is not freed via put_object(). 916 - */ 917 - rcu_read_lock(); 918 - object = create_object((unsigned long)log->ptr, log->size, 919 - log->min_count, GFP_ATOMIC); 920 - if (!object) 921 - goto out; 922 - spin_lock_irqsave(&object->lock, flags); 923 - for (i = 0; i < log->trace_len; i++) 924 - object->trace[i] = log->trace[i]; 925 - object->trace_len = log->trace_len; 926 - spin_unlock_irqrestore(&object->lock, flags); 927 - out: 928 - rcu_read_unlock(); 929 - } 930 - 931 - /* 932 - * Log an early allocated block and populate the stack trace. 
933 - */ 934 - static void early_alloc_percpu(struct early_log *log) 935 - { 936 - unsigned int cpu; 937 - const void __percpu *ptr = log->ptr; 938 - 939 - for_each_possible_cpu(cpu) { 940 - log->ptr = per_cpu_ptr(ptr, cpu); 941 - early_alloc(log); 942 - } 943 - } 944 - 945 848 /** 946 849 * kmemleak_alloc - register a newly allocated object 947 850 * @ptr: pointer to beginning of the object ··· 883 946 884 947 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 885 948 create_object((unsigned long)ptr, size, min_count, gfp); 886 - else if (kmemleak_early_log) 887 - log_early(KMEMLEAK_ALLOC, ptr, size, min_count); 888 949 } 889 950 EXPORT_SYMBOL_GPL(kmemleak_alloc); 890 951 ··· 910 975 for_each_possible_cpu(cpu) 911 976 create_object((unsigned long)per_cpu_ptr(ptr, cpu), 912 977 size, 0, gfp); 913 - else if (kmemleak_early_log) 914 - log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); 915 978 } 916 979 EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); 917 980 ··· 934 1001 create_object((unsigned long)area->addr, size, 2, gfp); 935 1002 object_set_excess_ref((unsigned long)area, 936 1003 (unsigned long)area->addr); 937 - } else if (kmemleak_early_log) { 938 - log_early(KMEMLEAK_ALLOC, area->addr, size, 2); 939 - /* reusing early_log.size for storing area->addr */ 940 - log_early(KMEMLEAK_SET_EXCESS_REF, 941 - area, (unsigned long)area->addr, 0); 942 1004 } 943 1005 } 944 1006 EXPORT_SYMBOL_GPL(kmemleak_vmalloc); ··· 951 1023 952 1024 if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) 953 1025 delete_object_full((unsigned long)ptr); 954 - else if (kmemleak_early_log) 955 - log_early(KMEMLEAK_FREE, ptr, 0, 0); 956 1026 } 957 1027 EXPORT_SYMBOL_GPL(kmemleak_free); 958 1028 ··· 969 1043 970 1044 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 971 1045 delete_object_part((unsigned long)ptr, size); 972 - else if (kmemleak_early_log) 973 - log_early(KMEMLEAK_FREE_PART, ptr, size, 0); 974 1046 } 975 1047 EXPORT_SYMBOL_GPL(kmemleak_free_part); 976 1048 ··· 989 1065 for_each_possible_cpu(cpu) 990 
1066 delete_object_full((unsigned long)per_cpu_ptr(ptr, 991 1067 cpu)); 992 - else if (kmemleak_early_log) 993 - log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0); 994 1068 } 995 1069 EXPORT_SYMBOL_GPL(kmemleak_free_percpu); 996 1070 ··· 1039 1117 1040 1118 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 1041 1119 make_gray_object((unsigned long)ptr); 1042 - else if (kmemleak_early_log) 1043 - log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); 1044 1120 } 1045 1121 EXPORT_SYMBOL(kmemleak_not_leak); 1046 1122 ··· 1057 1137 1058 1138 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 1059 1139 make_black_object((unsigned long)ptr); 1060 - else if (kmemleak_early_log) 1061 - log_early(KMEMLEAK_IGNORE, ptr, 0, 0); 1062 1140 } 1063 1141 EXPORT_SYMBOL(kmemleak_ignore); 1064 1142 ··· 1077 1159 1078 1160 if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) 1079 1161 add_scan_area((unsigned long)ptr, size, gfp); 1080 - else if (kmemleak_early_log) 1081 - log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); 1082 1162 } 1083 1163 EXPORT_SYMBOL(kmemleak_scan_area); 1084 1164 ··· 1095 1179 1096 1180 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 1097 1181 object_no_scan((unsigned long)ptr); 1098 - else if (kmemleak_early_log) 1099 - log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); 1100 1182 } 1101 1183 EXPORT_SYMBOL(kmemleak_no_scan); 1102 1184 ··· 1322 1408 if (!(object->flags & OBJECT_ALLOCATED)) 1323 1409 /* already freed object */ 1324 1410 goto out; 1325 - if (hlist_empty(&object->area_list)) { 1411 + if (hlist_empty(&object->area_list) || 1412 + object->flags & OBJECT_FULL_SCAN) { 1326 1413 void *start = (void *)object->pointer; 1327 1414 void *end = (void *)(object->pointer + object->size); 1328 1415 void *next; ··· 1881 1966 1882 1967 /* stop any memory operation tracing */ 1883 1968 kmemleak_enabled = 0; 1884 - kmemleak_early_log = 0; 1885 1969 1886 1970 /* check whether it is too early for a kernel thread */ 1887 1971 if (kmemleak_initialized) ··· 1908 1994 } 1909 1995 early_param("kmemleak", 
kmemleak_boot_config); 1910 1996 1911 - static void __init print_log_trace(struct early_log *log) 1912 - { 1913 - pr_notice("Early log backtrace:\n"); 1914 - stack_trace_print(log->trace, log->trace_len, 2); 1915 - } 1916 - 1917 1997 /* 1918 1998 * Kmemleak initialization. 1919 1999 */ 1920 2000 void __init kmemleak_init(void) 1921 2001 { 1922 - int i; 1923 - unsigned long flags; 1924 - 1925 2002 #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF 1926 2003 if (!kmemleak_skip_disable) { 1927 2004 kmemleak_disable(); ··· 1920 2015 } 1921 2016 #endif 1922 2017 2018 + if (kmemleak_error) 2019 + return; 2020 + 1923 2021 jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); 1924 2022 jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); 1925 2023 1926 2024 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); 1927 2025 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); 1928 - 1929 - if (crt_early_log > ARRAY_SIZE(early_log)) 1930 - pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", 1931 - crt_early_log); 1932 - 1933 - /* the kernel is still in UP mode, so disabling the IRQs is enough */ 1934 - local_irq_save(flags); 1935 - kmemleak_early_log = 0; 1936 - if (kmemleak_error) { 1937 - local_irq_restore(flags); 1938 - return; 1939 - } else { 1940 - kmemleak_enabled = 1; 1941 - kmemleak_free_enabled = 1; 1942 - } 1943 - local_irq_restore(flags); 1944 2026 1945 2027 /* register the data/bss sections */ 1946 2028 create_object((unsigned long)_sdata, _edata - _sdata, ··· 1939 2047 create_object((unsigned long)__start_ro_after_init, 1940 2048 __end_ro_after_init - __start_ro_after_init, 1941 2049 KMEMLEAK_GREY, GFP_ATOMIC); 1942 - 1943 - /* 1944 - * This is the point where tracking allocations is safe. Automatic 1945 - * scanning is started during the late initcall. Add the early logged 1946 - * callbacks to the kmemleak infrastructure. 
1947 - */ 1948 - for (i = 0; i < crt_early_log; i++) { 1949 - struct early_log *log = &early_log[i]; 1950 - 1951 - switch (log->op_type) { 1952 - case KMEMLEAK_ALLOC: 1953 - early_alloc(log); 1954 - break; 1955 - case KMEMLEAK_ALLOC_PERCPU: 1956 - early_alloc_percpu(log); 1957 - break; 1958 - case KMEMLEAK_FREE: 1959 - kmemleak_free(log->ptr); 1960 - break; 1961 - case KMEMLEAK_FREE_PART: 1962 - kmemleak_free_part(log->ptr, log->size); 1963 - break; 1964 - case KMEMLEAK_FREE_PERCPU: 1965 - kmemleak_free_percpu(log->ptr); 1966 - break; 1967 - case KMEMLEAK_NOT_LEAK: 1968 - kmemleak_not_leak(log->ptr); 1969 - break; 1970 - case KMEMLEAK_IGNORE: 1971 - kmemleak_ignore(log->ptr); 1972 - break; 1973 - case KMEMLEAK_SCAN_AREA: 1974 - kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL); 1975 - break; 1976 - case KMEMLEAK_NO_SCAN: 1977 - kmemleak_no_scan(log->ptr); 1978 - break; 1979 - case KMEMLEAK_SET_EXCESS_REF: 1980 - object_set_excess_ref((unsigned long)log->ptr, 1981 - log->excess_ref); 1982 - break; 1983 - default: 1984 - kmemleak_warn("Unknown early log operation: %d\n", 1985 - log->op_type); 1986 - } 1987 - 1988 - if (kmemleak_warning) { 1989 - print_log_trace(log); 1990 - kmemleak_warning = 0; 1991 - } 1992 - } 1993 2050 } 1994 2051 1995 2052 /* ··· 1967 2126 mutex_unlock(&scan_mutex); 1968 2127 } 1969 2128 1970 - pr_info("Kernel memory leak detector initialized\n"); 2129 + pr_info("Kernel memory leak detector initialized (mem pool available: %d)\n", 2130 + mem_pool_free_count); 1971 2131 1972 2132 return 0; 1973 2133 }
-18
mm/ksm.c
··· 1029 1029 return checksum; 1030 1030 } 1031 1031 1032 - static int memcmp_pages(struct page *page1, struct page *page2) 1033 - { 1034 - char *addr1, *addr2; 1035 - int ret; 1036 - 1037 - addr1 = kmap_atomic(page1); 1038 - addr2 = kmap_atomic(page2); 1039 - ret = memcmp(addr1, addr2, PAGE_SIZE); 1040 - kunmap_atomic(addr2); 1041 - kunmap_atomic(addr1); 1042 - return ret; 1043 - } 1044 - 1045 - static inline int pages_identical(struct page *page1, struct page *page2) 1046 - { 1047 - return !memcmp_pages(page1, page2); 1048 - } 1049 - 1050 1032 static int write_protect_page(struct vm_area_struct *vma, struct page *page, 1051 1033 pte_t *orig_pte) 1052 1034 {
+16 -36
mm/madvise.c
··· 107 107 case MADV_MERGEABLE: 108 108 case MADV_UNMERGEABLE: 109 109 error = ksm_madvise(vma, start, end, behavior, &new_flags); 110 - if (error) { 111 - /* 112 - * madvise() returns EAGAIN if kernel resources, such as 113 - * slab, are temporarily unavailable. 114 - */ 115 - if (error == -ENOMEM) 116 - error = -EAGAIN; 117 - goto out; 118 - } 110 + if (error) 111 + goto out_convert_errno; 119 112 break; 120 113 case MADV_HUGEPAGE: 121 114 case MADV_NOHUGEPAGE: 122 115 error = hugepage_madvise(vma, &new_flags, behavior); 123 - if (error) { 124 - /* 125 - * madvise() returns EAGAIN if kernel resources, such as 126 - * slab, are temporarily unavailable. 127 - */ 128 - if (error == -ENOMEM) 129 - error = -EAGAIN; 130 - goto out; 131 - } 116 + if (error) 117 + goto out_convert_errno; 132 118 break; 133 119 } 134 120 ··· 140 154 goto out; 141 155 } 142 156 error = __split_vma(mm, vma, start, 1); 143 - if (error) { 144 - /* 145 - * madvise() returns EAGAIN if kernel resources, such as 146 - * slab, are temporarily unavailable. 147 - */ 148 - if (error == -ENOMEM) 149 - error = -EAGAIN; 150 - goto out; 151 - } 157 + if (error) 158 + goto out_convert_errno; 152 159 } 153 160 154 161 if (end != vma->vm_end) { ··· 150 171 goto out; 151 172 } 152 173 error = __split_vma(mm, vma, end, 0); 153 - if (error) { 154 - /* 155 - * madvise() returns EAGAIN if kernel resources, such as 156 - * slab, are temporarily unavailable. 157 - */ 158 - if (error == -ENOMEM) 159 - error = -EAGAIN; 160 - goto out; 161 - } 174 + if (error) 175 + goto out_convert_errno; 162 176 } 163 177 164 178 success: ··· 159 187 * vm_flags is protected by the mmap_sem held in write mode. 160 188 */ 161 189 vma->vm_flags = new_flags; 190 + 191 + out_convert_errno: 192 + /* 193 + * madvise() returns EAGAIN if kernel resources, such as 194 + * slab, are temporarily unavailable. 195 + */ 196 + if (error == -ENOMEM) 197 + error = -EAGAIN; 162 198 out: 163 199 return error; 164 200 }
+164 -24
mm/memcontrol.c
··· 57 57 #include <linux/lockdep.h> 58 58 #include <linux/file.h> 59 59 #include <linux/tracehook.h> 60 + #include <linux/psi.h> 60 61 #include <linux/seq_buf.h> 61 62 #include "internal.h" 62 63 #include <net/sock.h> ··· 318 317 EXPORT_SYMBOL(memcg_kmem_enabled_key); 319 318 320 319 struct workqueue_struct *memcg_kmem_cache_wq; 320 + #endif 321 321 322 322 static int memcg_shrinker_map_size; 323 323 static DEFINE_MUTEX(memcg_shrinker_map_mutex); ··· 441 439 rcu_read_unlock(); 442 440 } 443 441 } 444 - 445 - #else /* CONFIG_MEMCG_KMEM */ 446 - static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) 447 - { 448 - return 0; 449 - } 450 - static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { } 451 - #endif /* CONFIG_MEMCG_KMEM */ 452 442 453 443 /** 454 444 * mem_cgroup_css_from_page - css of the memcg associated with a page ··· 2264 2270 for_each_online_cpu(cpu) { 2265 2271 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2266 2272 struct mem_cgroup *memcg; 2273 + bool flush = false; 2267 2274 2275 + rcu_read_lock(); 2268 2276 memcg = stock->cached; 2269 - if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css)) 2270 - continue; 2271 - if (!mem_cgroup_is_descendant(memcg, root_memcg)) { 2272 - css_put(&memcg->css); 2273 - continue; 2274 - } 2275 - if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2277 + if (memcg && stock->nr_pages && 2278 + mem_cgroup_is_descendant(memcg, root_memcg)) 2279 + flush = true; 2280 + rcu_read_unlock(); 2281 + 2282 + if (flush && 2283 + !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2276 2284 if (cpu == curcpu) 2277 2285 drain_local_stock(&stock->work); 2278 2286 else 2279 2287 schedule_work_on(cpu, &stock->work); 2280 2288 } 2281 - css_put(&memcg->css); 2282 2289 } 2283 2290 put_cpu(); 2284 2291 mutex_unlock(&percpu_charge_mutex); ··· 2354 2359 } 2355 2360 2356 2361 /* 2362 + * Clamp the maximum sleep time per allocation batch to 2 seconds. 
This is 2363 + * enough to still cause a significant slowdown in most cases, while still 2364 + * allowing diagnostics and tracing to proceed without becoming stuck. 2365 + */ 2366 + #define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ) 2367 + 2368 + /* 2369 + * When calculating the delay, we use these either side of the exponentiation to 2370 + * maintain precision and scale to a reasonable number of jiffies (see the table 2371 + * below. 2372 + * 2373 + * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the 2374 + * overage ratio to a delay. 2375 + * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the 2376 + * proposed penalty in order to reduce to a reasonable number of jiffies, and 2377 + * to produce a reasonable delay curve. 2378 + * 2379 + * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a 2380 + * reasonable delay curve compared to precision-adjusted overage, not 2381 + * penalising heavily at first, but still making sure that growth beyond the 2382 + * limit penalises misbehaviour cgroups by slowing them down exponentially. 
For 2383 + * example, with a high of 100 megabytes: 2384 + * 2385 + * +-------+------------------------+ 2386 + * | usage | time to allocate in ms | 2387 + * +-------+------------------------+ 2388 + * | 100M | 0 | 2389 + * | 101M | 6 | 2390 + * | 102M | 25 | 2391 + * | 103M | 57 | 2392 + * | 104M | 102 | 2393 + * | 105M | 159 | 2394 + * | 106M | 230 | 2395 + * | 107M | 313 | 2396 + * | 108M | 409 | 2397 + * | 109M | 518 | 2398 + * | 110M | 639 | 2399 + * | 111M | 774 | 2400 + * | 112M | 921 | 2401 + * | 113M | 1081 | 2402 + * | 114M | 1254 | 2403 + * | 115M | 1439 | 2404 + * | 116M | 1638 | 2405 + * | 117M | 1849 | 2406 + * | 118M | 2000 | 2407 + * | 119M | 2000 | 2408 + * | 120M | 2000 | 2409 + * +-------+------------------------+ 2410 + */ 2411 + #define MEMCG_DELAY_PRECISION_SHIFT 20 2412 + #define MEMCG_DELAY_SCALING_SHIFT 14 2413 + 2414 + /* 2357 2415 * Scheduled by try_charge() to be executed from the userland return path 2358 2416 * and reclaims memory over the high limit. 2359 2417 */ 2360 2418 void mem_cgroup_handle_over_high(void) 2361 2419 { 2420 + unsigned long usage, high, clamped_high; 2421 + unsigned long pflags; 2422 + unsigned long penalty_jiffies, overage; 2362 2423 unsigned int nr_pages = current->memcg_nr_pages_over_high; 2363 2424 struct mem_cgroup *memcg; 2364 2425 ··· 2423 2372 2424 2373 memcg = get_mem_cgroup_from_mm(current->mm); 2425 2374 reclaim_high(memcg, nr_pages, GFP_KERNEL); 2426 - css_put(&memcg->css); 2427 2375 current->memcg_nr_pages_over_high = 0; 2376 + 2377 + /* 2378 + * memory.high is breached and reclaim is unable to keep up. Throttle 2379 + * allocators proactively to slow down excessive growth. 2380 + * 2381 + * We use overage compared to memory.high to calculate the number of 2382 + * jiffies to sleep (penalty_jiffies). 
Ideally this value should be 2383 + * fairly lenient on small overages, and increasingly harsh when the 2384 + * memcg in question makes it clear that it has no intention of stopping 2385 + * its crazy behaviour, so we exponentially increase the delay based on 2386 + * overage amount. 2387 + */ 2388 + 2389 + usage = page_counter_read(&memcg->memory); 2390 + high = READ_ONCE(memcg->high); 2391 + 2392 + if (usage <= high) 2393 + goto out; 2394 + 2395 + /* 2396 + * Prevent division by 0 in overage calculation by acting as if it was a 2397 + * threshold of 1 page 2398 + */ 2399 + clamped_high = max(high, 1UL); 2400 + 2401 + overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT, 2402 + clamped_high); 2403 + 2404 + penalty_jiffies = ((u64)overage * overage * HZ) 2405 + >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT); 2406 + 2407 + /* 2408 + * Factor in the task's own contribution to the overage, such that four 2409 + * N-sized allocations are throttled approximately the same as one 2410 + * 4N-sized allocation. 2411 + * 2412 + * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or 2413 + * larger the current charge patch is than that. 2414 + */ 2415 + penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; 2416 + 2417 + /* 2418 + * Clamp the max delay per usermode return so as to still keep the 2419 + * application moving forwards and also permit diagnostics, albeit 2420 + * extremely slowly. 2421 + */ 2422 + penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); 2423 + 2424 + /* 2425 + * Don't sleep if the amount of jiffies this memcg owes us is so low 2426 + * that it's not even worth doing, in an attempt to be nice to those who 2427 + * go only a small amount over their memory.high value and maybe haven't 2428 + * been aggressively reclaimed enough yet. 
2429 + */ 2430 + if (penalty_jiffies <= HZ / 100) 2431 + goto out; 2432 + 2433 + /* 2434 + * If we exit early, we're guaranteed to die (since 2435 + * schedule_timeout_killable sets TASK_KILLABLE). This means we don't 2436 + * need to account for any ill-begotten jiffies to pay them off later. 2437 + */ 2438 + psi_memstall_enter(&pflags); 2439 + schedule_timeout_killable(penalty_jiffies); 2440 + psi_memstall_leave(&pflags); 2441 + 2442 + out: 2443 + css_put(&memcg->css); 2428 2444 } 2429 2445 2430 2446 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, ··· 3630 3512 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 3631 3513 break; 3632 3514 case _KMEM: 3515 + pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " 3516 + "Please report your usecase to linux-mm@kvack.org if you " 3517 + "depend on this functionality.\n"); 3633 3518 ret = memcg_update_kmem_max(memcg, nr_pages); 3634 3519 break; 3635 3520 case _TCP: ··· 4926 4805 } 4927 4806 } 4928 4807 4929 - static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) 4930 - { 4931 - mem_cgroup_id_get_many(memcg, 1); 4932 - } 4933 - 4934 4808 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 4935 4809 { 4936 4810 mem_cgroup_id_put_many(memcg, 1); ··· 5070 4954 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5071 4955 memcg->cgwb_frn[i].done = 5072 4956 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 4957 + #endif 4958 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4959 + spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); 4960 + INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 4961 + memcg->deferred_split_queue.split_queue_len = 0; 5073 4962 #endif 5074 4963 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 5075 4964 return memcg; ··· 5454 5333 __mod_memcg_state(to, NR_WRITEBACK, nr_pages); 5455 5334 } 5456 5335 5336 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5337 + if (compound && !list_empty(page_deferred_list(page))) { 5338 + 
spin_lock(&from->deferred_split_queue.split_queue_lock); 5339 + list_del_init(page_deferred_list(page)); 5340 + from->deferred_split_queue.split_queue_len--; 5341 + spin_unlock(&from->deferred_split_queue.split_queue_lock); 5342 + } 5343 + #endif 5457 5344 /* 5458 5345 * It is safe to change page->mem_cgroup here because the page 5459 5346 * is referenced, charged, and isolated - we can't race with ··· 5470 5341 5471 5342 /* caller should have done css_get */ 5472 5343 page->mem_cgroup = to; 5344 + 5345 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5346 + if (compound && list_empty(page_deferred_list(page))) { 5347 + spin_lock(&to->deferred_split_queue.split_queue_lock); 5348 + list_add_tail(page_deferred_list(page), 5349 + &to->deferred_split_queue.split_queue); 5350 + to->deferred_split_queue.split_queue_len++; 5351 + spin_unlock(&to->deferred_split_queue.split_queue_lock); 5352 + } 5353 + #endif 5354 + 5473 5355 spin_unlock_irqrestore(&from->move_lock, flags); 5474 5356 5475 5357 ret = 0; ··· 6651 6511 unsigned int nr_pages = 1; 6652 6512 6653 6513 if (PageTransHuge(page)) { 6654 - nr_pages <<= compound_order(page); 6514 + nr_pages = compound_nr(page); 6655 6515 ug->nr_huge += nr_pages; 6656 6516 } 6657 6517 if (PageAnon(page)) ··· 6663 6523 } 6664 6524 ug->pgpgout++; 6665 6525 } else { 6666 - ug->nr_kmem += 1 << compound_order(page); 6526 + ug->nr_kmem += compound_nr(page); 6667 6527 __ClearPageKmemcg(page); 6668 6528 } 6669 6529
+2
mm/memfd.c
··· 39 39 xas_for_each(xas, page, ULONG_MAX) { 40 40 if (xa_is_value(page)) 41 41 continue; 42 + page = find_subpage(page, xas->xa_index); 42 43 if (page_count(page) - page_mapcount(page) > 1) 43 44 xas_set_mark(xas, MEMFD_TAG_PINNED); 44 45 ··· 89 88 bool clear = true; 90 89 if (xa_is_value(page)) 91 90 continue; 91 + page = find_subpage(page, xas.xa_index); 92 92 if (page_count(page) - page_mapcount(page) != 1) { 93 93 /* 94 94 * On the last scan, we clean up all those tags
+9 -4
mm/memory.c
··· 518 518 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 519 519 if (page) 520 520 dump_page(page, "bad pte"); 521 - pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 521 + pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", 522 522 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 523 523 pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", 524 524 vma->vm_file, ··· 1026 1026 if (pte_none(ptent)) 1027 1027 continue; 1028 1028 1029 + if (need_resched()) 1030 + break; 1031 + 1029 1032 if (pte_present(ptent)) { 1030 1033 struct page *page; 1031 1034 ··· 1096 1093 if (unlikely(details)) 1097 1094 continue; 1098 1095 1099 - entry = pte_to_swp_entry(ptent); 1100 1096 if (!non_swap_entry(entry)) 1101 1097 rss[MM_SWAPENTS]--; 1102 1098 else if (is_migration_entry(entry)) { ··· 1126 1124 if (force_flush) { 1127 1125 force_flush = 0; 1128 1126 tlb_flush_mmu(tlb); 1129 - if (addr != end) 1130 - goto again; 1127 + } 1128 + 1129 + if (addr != end) { 1130 + cond_resched(); 1131 + goto again; 1131 1132 } 1132 1133 1133 1134 return addr;
+48 -55
mm/memory_hotplug.c
··· 632 632 #endif 633 633 } 634 634 635 - static int online_pages_blocks(unsigned long start, unsigned long nr_pages) 636 - { 637 - unsigned long end = start + nr_pages; 638 - int order, onlined_pages = 0; 639 - 640 - while (start < end) { 641 - order = min(MAX_ORDER - 1, 642 - get_order(PFN_PHYS(end) - PFN_PHYS(start))); 643 - (*online_page_callback)(pfn_to_page(start), order); 644 - 645 - onlined_pages += (1UL << order); 646 - start += (1UL << order); 647 - } 648 - return onlined_pages; 649 - } 650 - 651 635 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 652 636 void *arg) 653 637 { 654 - unsigned long onlined_pages = *(unsigned long *)arg; 638 + const unsigned long end_pfn = start_pfn + nr_pages; 639 + unsigned long pfn; 640 + int order; 655 641 656 - if (PageReserved(pfn_to_page(start_pfn))) 657 - onlined_pages += online_pages_blocks(start_pfn, nr_pages); 642 + /* 643 + * Online the pages. The callback might decide to keep some pages 644 + * PG_reserved (to add them to the buddy later), but we still account 645 + * them as being online/belonging to this zone ("present"). 
646 + */ 647 + for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) { 648 + order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn))); 649 + /* __free_pages_core() wants pfns to be aligned to the order */ 650 + if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order))) 651 + order = 0; 652 + (*online_page_callback)(pfn_to_page(pfn), order); 653 + } 658 654 659 - online_mem_sections(start_pfn, start_pfn + nr_pages); 655 + /* mark all involved sections as online */ 656 + online_mem_sections(start_pfn, end_pfn); 660 657 661 - *(unsigned long *)arg = onlined_pages; 658 + *(unsigned long *)arg += nr_pages; 662 659 return 0; 663 660 } 664 661 ··· 711 714 pgdat->node_start_pfn = start_pfn; 712 715 713 716 pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; 714 - } 715 717 718 + } 719 + /* 720 + * Associate the pfn range with the given zone, initializing the memmaps 721 + * and resizing the pgdat/zone data to span the added pages. After this 722 + * call, all affected pages are PG_reserved. 723 + */ 716 724 void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, 717 725 unsigned long nr_pages, struct vmem_altmap *altmap) 718 726 { ··· 806 804 return default_zone_for_pfn(nid, start_pfn, nr_pages); 807 805 } 808 806 809 - /* 810 - * Associates the given pfn range with the given node and the zone appropriate 811 - * for the given online type. 
812 - */ 813 - static struct zone * __meminit move_pfn_range(int online_type, int nid, 814 - unsigned long start_pfn, unsigned long nr_pages) 815 - { 816 - struct zone *zone; 817 - 818 - zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); 819 - move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL); 820 - return zone; 821 - } 822 - 823 807 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 824 808 { 825 809 unsigned long flags; ··· 828 840 put_device(&mem->dev); 829 841 830 842 /* associate pfn range with the zone */ 831 - zone = move_pfn_range(online_type, nid, pfn, nr_pages); 843 + zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); 844 + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL); 832 845 833 846 arg.start_pfn = pfn; 834 847 arg.nr_pages = nr_pages; ··· 853 864 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 854 865 online_pages_range); 855 866 if (ret) { 867 + /* not a single memory resource was applicable */ 856 868 if (need_zonelists_rebuild) 857 869 zone_pcp_reset(zone); 858 870 goto failed_addition; ··· 867 877 868 878 shuffle_zone(zone); 869 879 870 - if (onlined_pages) { 871 - node_states_set_node(nid, &arg); 872 - if (need_zonelists_rebuild) 873 - build_all_zonelists(NULL); 874 - else 875 - zone_pcp_update(zone); 876 - } 880 + node_states_set_node(nid, &arg); 881 + if (need_zonelists_rebuild) 882 + build_all_zonelists(NULL); 883 + else 884 + zone_pcp_update(zone); 877 885 878 886 init_per_zone_wmark_min(); 879 887 880 - if (onlined_pages) { 881 - kswapd_run(nid); 882 - kcompactd_run(nid); 883 - } 888 + kswapd_run(nid); 889 + kcompactd_run(nid); 884 890 885 891 vm_total_pages = nr_free_pagecache_pages(); 886 892 887 893 writeback_set_ratelimit(); 888 894 889 - if (onlined_pages) 890 - memory_notify(MEM_ONLINE, &arg); 895 + memory_notify(MEM_ONLINE, &arg); 891 896 mem_hotplug_done(); 892 897 return 0; 893 898 ··· 918 933 if (!pgdat) 919 934 return NULL; 920 935 936 + 
pgdat->per_cpu_nodestats = 937 + alloc_percpu(struct per_cpu_nodestat); 921 938 arch_refresh_nodedata(nid, pgdat); 922 939 } else { 940 + int cpu; 923 941 /* 924 942 * Reset the nr_zones, order and classzone_idx before reuse. 925 943 * Note that kswapd will init kswapd_classzone_idx properly ··· 931 943 pgdat->nr_zones = 0; 932 944 pgdat->kswapd_order = 0; 933 945 pgdat->kswapd_classzone_idx = 0; 946 + for_each_online_cpu(cpu) { 947 + struct per_cpu_nodestat *p; 948 + 949 + p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); 950 + memset(p, 0, sizeof(*p)); 951 + } 934 952 } 935 953 936 954 /* we can use NODE_DATA(nid) from here */ ··· 946 952 947 953 /* init node's zones as empty zones, we don't have any present pages.*/ 948 954 free_area_init_core_hotplug(nid); 949 - pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); 950 955 951 956 /* 952 957 * The node we allocated has no zone fallback lists. For avoiding ··· 1302 1309 head = compound_head(page); 1303 1310 if (page_huge_active(head)) 1304 1311 return pfn; 1305 - skip = (1 << compound_order(head)) - (page - head); 1312 + skip = compound_nr(head) - (page - head); 1306 1313 pfn += skip - 1; 1307 1314 } 1308 1315 return 0; ··· 1340 1347 1341 1348 if (PageHuge(page)) { 1342 1349 struct page *head = compound_head(page); 1343 - pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1350 + pfn = page_to_pfn(head) + compound_nr(head) - 1; 1344 1351 isolate_huge_page(head, &source); 1345 1352 continue; 1346 1353 } else if (PageTransHuge(page)) ··· 1655 1662 phys_addr_t beginpa, endpa; 1656 1663 1657 1664 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); 1658 - endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; 1665 + endpa = beginpa + memory_block_size_bytes() - 1; 1659 1666 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", 1660 1667 &beginpa, &endpa); 1661 1668 ··· 1793 1800 { 1794 1801 1795 1802 /* 1796 - * trigger BUG() is some memory is not offlined prior to 
calling this 1803 + * trigger BUG() if some memory is not offlined prior to calling this 1797 1804 * function 1798 1805 */ 1799 1806 if (try_remove_memory(nid, start, size))
-4
mm/mempolicy.c
··· 1512 1512 if (nodes_empty(*new)) 1513 1513 goto out_put; 1514 1514 1515 - nodes_and(*new, *new, node_states[N_MEMORY]); 1516 - if (nodes_empty(*new)) 1517 - goto out_put; 1518 - 1519 1515 err = security_task_movememory(task); 1520 1516 if (err) 1521 1517 goto out_put;
+5 -8
mm/migrate.c
··· 460 460 461 461 for (i = 1; i < HPAGE_PMD_NR; i++) { 462 462 xas_next(&xas); 463 - xas_store(&xas, newpage + i); 463 + xas_store(&xas, newpage); 464 464 } 465 465 } 466 466 ··· 1892 1892 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); 1893 1893 1894 1894 /* Avoid migrating to a node that is nearly full */ 1895 - if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) 1895 + if (!migrate_balanced_pgdat(pgdat, compound_nr(page))) 1896 1896 return 0; 1897 1897 1898 1898 if (isolate_lru_page(page)) ··· 2218 2218 pte_t pte; 2219 2219 2220 2220 pte = *ptep; 2221 - pfn = pte_pfn(pte); 2222 2221 2223 2222 if (pte_none(pte)) { 2224 2223 mpfn = MIGRATE_PFN_MIGRATE; 2225 2224 migrate->cpages++; 2226 - pfn = 0; 2227 2225 goto next; 2228 2226 } 2229 2227 2230 2228 if (!pte_present(pte)) { 2231 - mpfn = pfn = 0; 2229 + mpfn = 0; 2232 2230 2233 2231 /* 2234 2232 * Only care about unaddressable device page special ··· 2243 2245 if (is_write_device_private_entry(entry)) 2244 2246 mpfn |= MIGRATE_PFN_WRITE; 2245 2247 } else { 2248 + pfn = pte_pfn(pte); 2246 2249 if (is_zero_pfn(pfn)) { 2247 2250 mpfn = MIGRATE_PFN_MIGRATE; 2248 2251 migrate->cpages++; 2249 - pfn = 0; 2250 2252 goto next; 2251 2253 } 2252 2254 page = vm_normal_page(migrate->vma, addr, pte); ··· 2256 2258 2257 2259 /* FIXME support THP */ 2258 2260 if (!page || !page->mapping || PageTransCompound(page)) { 2259 - mpfn = pfn = 0; 2261 + mpfn = 0; 2260 2262 goto next; 2261 2263 } 2262 - pfn = page_to_pfn(page); 2263 2264 2264 2265 /* 2265 2266 * By getting a reference on the page we pin it and that blocks
+6 -6
mm/mmap.c
··· 1358 1358 if (S_ISBLK(inode->i_mode)) 1359 1359 return MAX_LFS_FILESIZE; 1360 1360 1361 + if (S_ISSOCK(inode->i_mode)) 1362 + return MAX_LFS_FILESIZE; 1363 + 1361 1364 /* Special "we do even unsigned file positions" case */ 1362 1365 if (file->f_mode & FMODE_UNSIGNED_OFFSET) 1363 1366 return 0; ··· 2277 2274 if (vma) { 2278 2275 *pprev = vma->vm_prev; 2279 2276 } else { 2280 - struct rb_node *rb_node = mm->mm_rb.rb_node; 2281 - *pprev = NULL; 2282 - while (rb_node) { 2283 - *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); 2284 - rb_node = rb_node->rb_right; 2285 - } 2277 + struct rb_node *rb_node = rb_last(&mm->mm_rb); 2278 + 2279 + *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL; 2286 2280 } 2287 2281 return vma; 2288 2282 }
-2
mm/mmu_gather.c
··· 271 271 272 272 tlb_flush_mmu(tlb); 273 273 274 - /* keep the page table cache within bounds */ 275 - check_pgt_cache(); 276 274 #ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER 277 275 tlb_batch_list_free(tlb); 278 276 #endif
+1 -1
mm/nommu.c
··· 108 108 * The ksize() function is only guaranteed to work for pointers 109 109 * returned by kmalloc(). So handle arbitrary pointers here. 110 110 */ 111 - return PAGE_SIZE << compound_order(page); 111 + return page_size(page); 112 112 } 113 113 114 114 /**
+13 -11
mm/oom_kill.c
··· 73 73 /** 74 74 * oom_cpuset_eligible() - check task eligiblity for kill 75 75 * @start: task struct of which task to consider 76 - * @mask: nodemask passed to page allocator for mempolicy ooms 76 + * @oc: pointer to struct oom_control 77 77 * 78 78 * Task eligibility is determined by whether or not a candidate task, @tsk, 79 79 * shares the same mempolicy nodes as current if it is bound by such a policy ··· 287 287 !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { 288 288 oc->totalpages = total_swap_pages; 289 289 for_each_node_mask(nid, *oc->nodemask) 290 - oc->totalpages += node_spanned_pages(nid); 290 + oc->totalpages += node_present_pages(nid); 291 291 return CONSTRAINT_MEMORY_POLICY; 292 292 } 293 293 ··· 300 300 if (cpuset_limited) { 301 301 oc->totalpages = total_swap_pages; 302 302 for_each_node_mask(nid, cpuset_current_mems_allowed) 303 - oc->totalpages += node_spanned_pages(nid); 303 + oc->totalpages += node_present_pages(nid); 304 304 return CONSTRAINT_CPUSET; 305 305 } 306 306 return CONSTRAINT_NONE; ··· 884 884 */ 885 885 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); 886 886 mark_oom_victim(victim); 887 - pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", 888 - message, task_pid_nr(victim), victim->comm, 889 - K(victim->mm->total_vm), 890 - K(get_mm_counter(victim->mm, MM_ANONPAGES)), 891 - K(get_mm_counter(victim->mm, MM_FILEPAGES)), 892 - K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); 887 + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n", 888 + message, task_pid_nr(victim), victim->comm, K(mm->total_vm), 889 + K(get_mm_counter(mm, MM_ANONPAGES)), 890 + K(get_mm_counter(mm, MM_FILEPAGES)), 891 + K(get_mm_counter(mm, MM_SHMEMPAGES)), 892 + from_kuid(&init_user_ns, task_uid(victim)), 893 + mm_pgtables_bytes(mm), victim->signal->oom_score_adj); 893 894 task_unlock(victim); 894 895 
895 896 /* ··· 1069 1068 * The OOM killer does not compensate for IO-less reclaim. 1070 1069 * pagefault_out_of_memory lost its gfp context so we have to 1071 1070 * make sure exclude 0 mask - all other users should have at least 1072 - * ___GFP_DIRECT_RECLAIM to get here. 1071 + * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to 1072 + * invoke the OOM killer even if it is a GFP_NOFS allocation. 1073 1073 */ 1074 - if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) 1074 + if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) 1075 1075 return true; 1076 1076 1077 1077 /*
+20 -9
mm/page_alloc.c
··· 670 670 671 671 void free_compound_page(struct page *page) 672 672 { 673 + mem_cgroup_uncharge(page); 673 674 __free_pages_ok(page, compound_order(page)); 674 675 } 675 676 ··· 3956 3955 goto check_priority; 3957 3956 3958 3957 /* 3959 - * make sure the compaction wasn't deferred or didn't bail out early 3960 - * due to locks contention before we declare that we should give up. 3961 - * But do not retry if the given zonelist is not suitable for 3962 - * compaction. 3958 + * compaction was skipped because there are not enough order-0 pages 3959 + * to work with, so we retry only if it looks like reclaim can help. 3963 3960 */ 3964 - if (compaction_withdrawn(compact_result)) { 3961 + if (compaction_needs_reclaim(compact_result)) { 3965 3962 ret = compaction_zonelist_suitable(ac, order, alloc_flags); 3966 3963 goto out; 3964 + } 3965 + 3966 + /* 3967 + * make sure the compaction wasn't deferred or didn't bail out early 3968 + * due to locks contention before we declare that we should give up. 3969 + * But the next retry should use a higher priority if allowed, so 3970 + * we don't just keep bailing out endlessly. 
3971 + */ 3972 + if (compaction_withdrawn(compact_result)) { 3973 + goto check_priority; 3967 3974 } 3968 3975 3969 3976 /* ··· 6647 6638 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6648 6639 static void pgdat_init_split_queue(struct pglist_data *pgdat) 6649 6640 { 6650 - spin_lock_init(&pgdat->split_queue_lock); 6651 - INIT_LIST_HEAD(&pgdat->split_queue); 6652 - pgdat->split_queue_len = 0; 6641 + struct deferred_split *ds_queue = &pgdat->deferred_split_queue; 6642 + 6643 + spin_lock_init(&ds_queue->split_queue_lock); 6644 + INIT_LIST_HEAD(&ds_queue->split_queue); 6645 + ds_queue->split_queue_len = 0; 6653 6646 } 6654 6647 #else 6655 6648 static void pgdat_init_split_queue(struct pglist_data *pgdat) {} ··· 8207 8196 if (!hugepage_migration_supported(page_hstate(head))) 8208 8197 goto unmovable; 8209 8198 8210 - skip_pages = (1 << compound_order(head)) - (page - head); 8199 + skip_pages = compound_nr(head) - (page - head); 8211 8200 iter += skip_pages - 1; 8212 8201 continue; 8213 8202 }
+90 -33
mm/page_owner.c
··· 24 24 short last_migrate_reason; 25 25 gfp_t gfp_mask; 26 26 depot_stack_handle_t handle; 27 + #ifdef CONFIG_DEBUG_PAGEALLOC 28 + depot_stack_handle_t free_handle; 29 + #endif 27 30 }; 28 31 29 32 static bool page_owner_disabled = true; ··· 105 102 return (void *)page_ext + page_owner_ops.offset; 106 103 } 107 104 108 - void __reset_page_owner(struct page *page, unsigned int order) 109 - { 110 - int i; 111 - struct page_ext *page_ext; 112 - 113 - for (i = 0; i < (1 << order); i++) { 114 - page_ext = lookup_page_ext(page + i); 115 - if (unlikely(!page_ext)) 116 - continue; 117 - __clear_bit(PAGE_EXT_OWNER, &page_ext->flags); 118 - } 119 - } 120 - 121 105 static inline bool check_recursive_alloc(unsigned long *entries, 122 106 unsigned int nr_entries, 123 107 unsigned long ip) ··· 144 154 return handle; 145 155 } 146 156 147 - static inline void __set_page_owner_handle(struct page_ext *page_ext, 148 - depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask) 157 + void __reset_page_owner(struct page *page, unsigned int order) 149 158 { 159 + int i; 160 + struct page_ext *page_ext; 161 + #ifdef CONFIG_DEBUG_PAGEALLOC 162 + depot_stack_handle_t handle = 0; 150 163 struct page_owner *page_owner; 151 164 152 - page_owner = get_page_owner(page_ext); 153 - page_owner->handle = handle; 154 - page_owner->order = order; 155 - page_owner->gfp_mask = gfp_mask; 156 - page_owner->last_migrate_reason = -1; 165 + if (debug_pagealloc_enabled()) 166 + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); 167 + #endif 157 168 158 - __set_bit(PAGE_EXT_OWNER, &page_ext->flags); 169 + for (i = 0; i < (1 << order); i++) { 170 + page_ext = lookup_page_ext(page + i); 171 + if (unlikely(!page_ext)) 172 + continue; 173 + __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); 174 + #ifdef CONFIG_DEBUG_PAGEALLOC 175 + if (debug_pagealloc_enabled()) { 176 + page_owner = get_page_owner(page_ext); 177 + page_owner->free_handle = handle; 178 + } 179 + #endif 180 + } 181 + } 182 + 183 + static 
inline void __set_page_owner_handle(struct page *page, 184 + struct page_ext *page_ext, depot_stack_handle_t handle, 185 + unsigned int order, gfp_t gfp_mask) 186 + { 187 + struct page_owner *page_owner; 188 + int i; 189 + 190 + for (i = 0; i < (1 << order); i++) { 191 + page_owner = get_page_owner(page_ext); 192 + page_owner->handle = handle; 193 + page_owner->order = order; 194 + page_owner->gfp_mask = gfp_mask; 195 + page_owner->last_migrate_reason = -1; 196 + __set_bit(PAGE_EXT_OWNER, &page_ext->flags); 197 + __set_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); 198 + 199 + page_ext = lookup_page_ext(page + i); 200 + } 159 201 } 160 202 161 203 noinline void __set_page_owner(struct page *page, unsigned int order, ··· 200 178 return; 201 179 202 180 handle = save_stack(gfp_mask); 203 - __set_page_owner_handle(page_ext, handle, order, gfp_mask); 181 + __set_page_owner_handle(page, page_ext, handle, order, gfp_mask); 204 182 } 205 183 206 184 void __set_page_owner_migrate_reason(struct page *page, int reason) ··· 226 204 227 205 page_owner = get_page_owner(page_ext); 228 206 page_owner->order = 0; 229 - for (i = 1; i < (1 << order); i++) 230 - __copy_page_owner(page, page + i); 207 + for (i = 1; i < (1 << order); i++) { 208 + page_ext = lookup_page_ext(page + i); 209 + page_owner = get_page_owner(page_ext); 210 + page_owner->order = 0; 211 + } 231 212 } 232 213 233 214 void __copy_page_owner(struct page *oldpage, struct page *newpage) ··· 260 235 * the new page, which will be freed. 
261 236 */ 262 237 __set_bit(PAGE_EXT_OWNER, &new_ext->flags); 238 + __set_bit(PAGE_EXT_OWNER_ACTIVE, &new_ext->flags); 263 239 } 264 240 265 241 void pagetypeinfo_showmixedcount_print(struct seq_file *m, ··· 320 294 if (unlikely(!page_ext)) 321 295 continue; 322 296 323 - if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) 297 + if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) 324 298 continue; 325 299 326 300 page_owner = get_page_owner(page_ext); ··· 431 405 mt = gfpflags_to_migratetype(gfp_mask); 432 406 433 407 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { 434 - pr_alert("page_owner info is not active (free page?)\n"); 408 + pr_alert("page_owner info is not present (never set?)\n"); 435 409 return; 436 410 } 411 + 412 + if (test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) 413 + pr_alert("page_owner tracks the page as allocated\n"); 414 + else 415 + pr_alert("page_owner tracks the page as freed\n"); 416 + 417 + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", 418 + page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); 437 419 438 420 handle = READ_ONCE(page_owner->handle); 439 421 if (!handle) { 440 - pr_alert("page_owner info is not active (free page?)\n"); 441 - return; 422 + pr_alert("page_owner allocation stack trace missing\n"); 423 + } else { 424 + nr_entries = stack_depot_fetch(handle, &entries); 425 + stack_trace_print(entries, nr_entries, 0); 442 426 } 443 427 444 - nr_entries = stack_depot_fetch(handle, &entries); 445 - pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", 446 - page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); 447 - stack_trace_print(entries, nr_entries, 0); 428 + #ifdef CONFIG_DEBUG_PAGEALLOC 429 + handle = READ_ONCE(page_owner->free_handle); 430 + if (!handle) { 431 + pr_alert("page_owner free stack trace missing\n"); 432 + } else { 433 + nr_entries = stack_depot_fetch(handle, &entries); 434 + pr_alert("page last free stack trace:\n"); 435 
+ stack_trace_print(entries, nr_entries, 0); 436 + } 437 + #endif 448 438 449 439 if (page_owner->last_migrate_reason != -1) 450 440 pr_alert("page has been migrated, last migrate reason: %s\n", ··· 523 481 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) 524 482 continue; 525 483 484 + /* 485 + * Although we do have the info about past allocation of free 486 + * pages, it's not relevant for current memory usage. 487 + */ 488 + if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) 489 + continue; 490 + 526 491 page_owner = get_page_owner(page_ext); 492 + 493 + /* 494 + * Don't print "tail" pages of high-order allocations as that 495 + * would inflate the stats. 496 + */ 497 + if (!IS_ALIGNED(pfn, 1 << page_owner->order)) 498 + continue; 527 499 528 500 /* 529 501 * Access to page_ext->handle isn't synchronous so we should ··· 618 562 continue; 619 563 620 564 /* Found early allocated page */ 621 - __set_page_owner_handle(page_ext, early_handle, 0, 0); 565 + __set_page_owner_handle(page, page_ext, early_handle, 566 + 0, 0); 622 567 count++; 623 568 } 624 569 cond_resched();
+1 -1
mm/page_poison.c
··· 101 101 /* 102 102 * Page poisoning when enabled poisons each and every page 103 103 * that is freed to buddy. Thus no extra check is done to 104 - * see if a page was posioned. 104 + * see if a page was poisoned. 105 105 */ 106 106 check_poison_mem(addr, PAGE_SIZE); 107 107 kunmap_atomic(addr);
+1 -2
mm/page_vma_mapped.c
··· 153 153 154 154 if (unlikely(PageHuge(pvmw->page))) { 155 155 /* when pud is not present, pte will be NULL */ 156 - pvmw->pte = huge_pte_offset(mm, pvmw->address, 157 - PAGE_SIZE << compound_order(page)); 156 + pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page)); 158 157 if (!pvmw->pte) 159 158 return false; 160 159
-103
mm/quicklist.c
··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - /* 3 - * Quicklist support. 4 - * 5 - * Quicklists are light weight lists of pages that have a defined state 6 - * on alloc and free. Pages must be in the quicklist specific defined state 7 - * (zero by default) when the page is freed. It seems that the initial idea 8 - * for such lists first came from Dave Miller and then various other people 9 - * improved on it. 10 - * 11 - * Copyright (C) 2007 SGI, 12 - * Christoph Lameter <cl@linux.com> 13 - * Generalized, added support for multiple lists and 14 - * constructors / destructors. 15 - */ 16 - #include <linux/kernel.h> 17 - 18 - #include <linux/gfp.h> 19 - #include <linux/mm.h> 20 - #include <linux/mmzone.h> 21 - #include <linux/quicklist.h> 22 - 23 - DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); 24 - 25 - #define FRACTION_OF_NODE_MEM 16 26 - 27 - static unsigned long max_pages(unsigned long min_pages) 28 - { 29 - unsigned long node_free_pages, max; 30 - int node = numa_node_id(); 31 - struct zone *zones = NODE_DATA(node)->node_zones; 32 - int num_cpus_on_node; 33 - 34 - node_free_pages = 35 - #ifdef CONFIG_ZONE_DMA 36 - zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) + 37 - #endif 38 - #ifdef CONFIG_ZONE_DMA32 39 - zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) + 40 - #endif 41 - zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); 42 - 43 - max = node_free_pages / FRACTION_OF_NODE_MEM; 44 - 45 - num_cpus_on_node = cpumask_weight(cpumask_of_node(node)); 46 - max /= num_cpus_on_node; 47 - 48 - return max(max, min_pages); 49 - } 50 - 51 - static long min_pages_to_free(struct quicklist *q, 52 - unsigned long min_pages, long max_free) 53 - { 54 - long pages_to_free; 55 - 56 - pages_to_free = q->nr_pages - max_pages(min_pages); 57 - 58 - return min(pages_to_free, max_free); 59 - } 60 - 61 - /* 62 - * Trim down the number of pages in the quicklist 63 - */ 64 - void quicklist_trim(int nr, void (*dtor)(void *), 65 - unsigned long min_pages, unsigned 
long max_free) 66 - { 67 - long pages_to_free; 68 - struct quicklist *q; 69 - 70 - q = &get_cpu_var(quicklist)[nr]; 71 - if (q->nr_pages > min_pages) { 72 - pages_to_free = min_pages_to_free(q, min_pages, max_free); 73 - 74 - while (pages_to_free > 0) { 75 - /* 76 - * We pass a gfp_t of 0 to quicklist_alloc here 77 - * because we will never call into the page allocator. 78 - */ 79 - void *p = quicklist_alloc(nr, 0, NULL); 80 - 81 - if (dtor) 82 - dtor(p); 83 - free_page((unsigned long)p); 84 - pages_to_free--; 85 - } 86 - } 87 - put_cpu_var(quicklist); 88 - } 89 - 90 - unsigned long quicklist_total_size(void) 91 - { 92 - unsigned long count = 0; 93 - int cpu; 94 - struct quicklist *ql, *q; 95 - 96 - for_each_online_cpu(cpu) { 97 - ql = per_cpu(quicklist, cpu); 98 - for (q = ql; q < ql + CONFIG_NR_QUICK; q++) 99 - count += q->nr_pages; 100 - } 101 - return count; 102 - } 103 -
+12 -13
mm/rmap.c
··· 898 898 */ 899 899 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 900 900 0, vma, vma->vm_mm, address, 901 - min(vma->vm_end, address + 902 - (PAGE_SIZE << compound_order(page)))); 901 + min(vma->vm_end, address + page_size(page))); 903 902 mmu_notifier_invalidate_range_start(&range); 904 903 905 904 while (page_vma_mapped_walk(&pvmw)) { 906 - unsigned long cstart; 907 905 int ret = 0; 908 906 909 - cstart = address = pvmw.address; 907 + address = pvmw.address; 910 908 if (pvmw.pte) { 911 909 pte_t entry; 912 910 pte_t *pte = pvmw.pte; ··· 931 933 entry = pmd_wrprotect(entry); 932 934 entry = pmd_mkclean(entry); 933 935 set_pmd_at(vma->vm_mm, address, pmd, entry); 934 - cstart &= PMD_MASK; 935 936 ret = 1; 936 937 #else 937 938 /* unexpected pmd-mapped page? */ ··· 1189 1192 } 1190 1193 if (!atomic_inc_and_test(compound_mapcount_ptr(page))) 1191 1194 goto out; 1192 - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 1193 - __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); 1195 + if (PageSwapBacked(page)) 1196 + __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); 1197 + else 1198 + __inc_node_page_state(page, NR_FILE_PMDMAPPED); 1194 1199 } else { 1195 1200 if (PageTransCompound(page) && page_mapping(page)) { 1196 1201 VM_WARN_ON_ONCE(!PageLocked(page)); ··· 1231 1232 } 1232 1233 if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) 1233 1234 goto out; 1234 - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 1235 - __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); 1235 + if (PageSwapBacked(page)) 1236 + __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); 1237 + else 1238 + __dec_node_page_state(page, NR_FILE_PMDMAPPED); 1236 1239 } else { 1237 1240 if (!atomic_add_negative(-1, &page->_mapcount)) 1238 1241 goto out; ··· 1375 1374 */ 1376 1375 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 1377 1376 address, 1378 - min(vma->vm_end, address + 1379 - (PAGE_SIZE << compound_order(page)))); 1377 + min(vma->vm_end, address + page_size(page))); 
1380 1378 if (PageHuge(page)) { 1381 1379 /* 1382 1380 * If sharing is possible, start and end will be adjusted ··· 1524 1524 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 1525 1525 pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); 1526 1526 if (PageHuge(page)) { 1527 - int nr = 1 << compound_order(page); 1528 - hugetlb_count_sub(nr, mm); 1527 + hugetlb_count_sub(compound_nr(page), mm); 1529 1528 set_huge_swap_pte_at(mm, address, 1530 1529 pvmw.pte, pteval, 1531 1530 vma_mmu_pagesize(vma));
+6 -6
mm/shmem.c
··· 609 609 { 610 610 XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); 611 611 unsigned long i = 0; 612 - unsigned long nr = 1UL << compound_order(page); 612 + unsigned long nr = compound_nr(page); 613 613 614 614 VM_BUG_ON_PAGE(PageTail(page), page); 615 615 VM_BUG_ON_PAGE(index != round_down(index, nr), page); ··· 631 631 if (xas_error(&xas)) 632 632 goto unlock; 633 633 next: 634 - xas_store(&xas, page + i); 634 + xas_store(&xas, page); 635 635 if (++i < nr) { 636 636 xas_next(&xas); 637 637 goto next; ··· 1734 1734 * vm. If we swap it in we mark it dirty since we also free the swap 1735 1735 * entry since a page cannot live in both the swap and page cache. 1736 1736 * 1737 - * fault_mm and fault_type are only supplied by shmem_fault: 1737 + * vmf and fault_type are only supplied by shmem_fault: 1738 1738 * otherwise they are NULL. 1739 1739 */ 1740 1740 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, ··· 1884 1884 lru_cache_add_anon(page); 1885 1885 1886 1886 spin_lock_irq(&info->lock); 1887 - info->alloced += 1 << compound_order(page); 1887 + info->alloced += compound_nr(page); 1888 1888 inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); 1889 1889 shmem_recalc_inode(inode); 1890 1890 spin_unlock_irq(&info->lock); ··· 1925 1925 struct page *head = compound_head(page); 1926 1926 int i; 1927 1927 1928 - for (i = 0; i < (1 << compound_order(head)); i++) { 1928 + for (i = 0; i < compound_nr(head); i++) { 1929 1929 clear_highpage(head + i); 1930 1930 flush_dcache_page(head + i); 1931 1931 } ··· 1952 1952 * Error recovery. 1953 1953 */ 1954 1954 unacct: 1955 - shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); 1955 + shmem_inode_unacct_blocks(inode, compound_nr(page)); 1956 1956 1957 1957 if (PageTransHuge(page)) { 1958 1958 unlock_page(page);
+64
mm/slab.h
··· 30 30 struct list_head list; /* List of all slab caches on the system */ 31 31 }; 32 32 33 + #else /* !CONFIG_SLOB */ 34 + 35 + struct memcg_cache_array { 36 + struct rcu_head rcu; 37 + struct kmem_cache *entries[0]; 38 + }; 39 + 40 + /* 41 + * This is the main placeholder for memcg-related information in kmem caches. 42 + * Both the root cache and the child caches will have it. For the root cache, 43 + * this will hold a dynamically allocated array large enough to hold 44 + * information about the currently limited memcgs in the system. To allow the 45 + * array to be accessed without taking any locks, on relocation we free the old 46 + * version only after a grace period. 47 + * 48 + * Root and child caches hold different metadata. 49 + * 50 + * @root_cache: Common to root and child caches. NULL for root, pointer to 51 + * the root cache for children. 52 + * 53 + * The following fields are specific to root caches. 54 + * 55 + * @memcg_caches: kmemcg ID indexed table of child caches. This table is 56 + * used to index child cachces during allocation and cleared 57 + * early during shutdown. 58 + * 59 + * @root_caches_node: List node for slab_root_caches list. 60 + * 61 + * @children: List of all child caches. While the child caches are also 62 + * reachable through @memcg_caches, a child cache remains on 63 + * this list until it is actually destroyed. 64 + * 65 + * The following fields are specific to child caches. 66 + * 67 + * @memcg: Pointer to the memcg this cache belongs to. 68 + * 69 + * @children_node: List node for @root_cache->children list. 70 + * 71 + * @kmem_caches_node: List node for @memcg->kmem_caches list. 
72 + */ 73 + struct memcg_cache_params { 74 + struct kmem_cache *root_cache; 75 + union { 76 + struct { 77 + struct memcg_cache_array __rcu *memcg_caches; 78 + struct list_head __root_caches_node; 79 + struct list_head children; 80 + bool dying; 81 + }; 82 + struct { 83 + struct mem_cgroup *memcg; 84 + struct list_head children_node; 85 + struct list_head kmem_caches_node; 86 + struct percpu_ref refcnt; 87 + 88 + void (*work_fn)(struct kmem_cache *); 89 + union { 90 + struct rcu_head rcu_head; 91 + struct work_struct work; 92 + }; 93 + }; 94 + }; 95 + }; 33 96 #endif /* CONFIG_SLOB */ 34 97 35 98 #ifdef CONFIG_SLAB ··· 237 174 void __kmemcg_cache_deactivate(struct kmem_cache *s); 238 175 void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s); 239 176 void slab_kmem_cache_release(struct kmem_cache *); 177 + void kmem_cache_shrink_all(struct kmem_cache *s); 240 178 241 179 struct seq_file; 242 180 struct file;
+37
mm/slab_common.c
··· 981 981 } 982 982 EXPORT_SYMBOL(kmem_cache_shrink); 983 983 984 + /** 985 + * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache 986 + * @s: The cache pointer 987 + */ 988 + void kmem_cache_shrink_all(struct kmem_cache *s) 989 + { 990 + struct kmem_cache *c; 991 + 992 + if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) { 993 + kmem_cache_shrink(s); 994 + return; 995 + } 996 + 997 + get_online_cpus(); 998 + get_online_mems(); 999 + kasan_cache_shrink(s); 1000 + __kmem_cache_shrink(s); 1001 + 1002 + /* 1003 + * We have to take the slab_mutex to protect from the memcg list 1004 + * modification. 1005 + */ 1006 + mutex_lock(&slab_mutex); 1007 + for_each_memcg_cache(c, s) { 1008 + /* 1009 + * Don't need to shrink deactivated memcg caches. 1010 + */ 1011 + if (s->flags & SLAB_DEACTIVATED) 1012 + continue; 1013 + kasan_cache_shrink(c); 1014 + __kmem_cache_shrink(c); 1015 + } 1016 + mutex_unlock(&slab_mutex); 1017 + put_online_mems(); 1018 + put_online_cpus(); 1019 + } 1020 + 984 1021 bool slab_is_available(void) 985 1022 { 986 1023 return slab_state >= UP;
+1 -1
mm/slob.c
··· 539 539 540 540 sp = virt_to_page(block); 541 541 if (unlikely(!PageSlab(sp))) 542 - return PAGE_SIZE << compound_order(sp); 542 + return page_size(sp); 543 543 544 544 align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 545 545 m = (unsigned int *)(block - align);
+12 -10
mm/slub.c
··· 829 829 return 1; 830 830 831 831 start = page_address(page); 832 - length = PAGE_SIZE << compound_order(page); 832 + length = page_size(page); 833 833 end = start + length; 834 834 remainder = length % s->size; 835 835 if (!remainder) ··· 1074 1074 init_tracking(s, object); 1075 1075 } 1076 1076 1077 - static void setup_page_debug(struct kmem_cache *s, void *addr, int order) 1077 + static 1078 + void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) 1078 1079 { 1079 1080 if (!(s->flags & SLAB_POISON)) 1080 1081 return; 1081 1082 1082 1083 metadata_access_enable(); 1083 - memset(addr, POISON_INUSE, PAGE_SIZE << order); 1084 + memset(addr, POISON_INUSE, page_size(page)); 1084 1085 metadata_access_disable(); 1085 1086 } 1086 1087 ··· 1341 1340 #else /* !CONFIG_SLUB_DEBUG */ 1342 1341 static inline void setup_object_debug(struct kmem_cache *s, 1343 1342 struct page *page, void *object) {} 1344 - static inline void setup_page_debug(struct kmem_cache *s, 1345 - void *addr, int order) {} 1343 + static inline 1344 + void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {} 1346 1345 1347 1346 static inline int alloc_debug_processing(struct kmem_cache *s, 1348 1347 struct page *page, void *object, unsigned long addr) { return 0; } ··· 1640 1639 struct kmem_cache_order_objects oo = s->oo; 1641 1640 gfp_t alloc_gfp; 1642 1641 void *start, *p, *next; 1643 - int idx, order; 1642 + int idx; 1644 1643 bool shuffle; 1645 1644 1646 1645 flags &= gfp_allowed_mask; ··· 1674 1673 1675 1674 page->objects = oo_objects(oo); 1676 1675 1677 - order = compound_order(page); 1678 1676 page->slab_cache = s; 1679 1677 __SetPageSlab(page); 1680 1678 if (page_is_pfmemalloc(page)) ··· 1683 1683 1684 1684 start = page_address(page); 1685 1685 1686 - setup_page_debug(s, start, order); 1686 + setup_page_debug(s, page, start); 1687 1687 1688 1688 shuffle = shuffle_freelist(s, page); 1689 1689 ··· 2004 2004 return tid + TID_STEP; 2005 2005 } 2006 2006 2007 + 
#ifdef SLUB_DEBUG_CMPXCHG 2007 2008 static inline unsigned int tid_to_cpu(unsigned long tid) 2008 2009 { 2009 2010 return tid % TID_STEP; ··· 2014 2013 { 2015 2014 return tid / TID_STEP; 2016 2015 } 2016 + #endif 2017 2017 2018 2018 static inline unsigned int init_tid(int cpu) 2019 2019 { ··· 3932 3930 3933 3931 if (unlikely(!PageSlab(page))) { 3934 3932 WARN_ON(!PageCompound(page)); 3935 - return PAGE_SIZE << compound_order(page); 3933 + return page_size(page); 3936 3934 } 3937 3935 3938 3936 return slab_ksize(page->slab_cache); ··· 5300 5298 const char *buf, size_t length) 5301 5299 { 5302 5300 if (buf[0] == '1') 5303 - kmem_cache_shrink(s); 5301 + kmem_cache_shrink_all(s); 5304 5302 else 5305 5303 return -EINVAL; 5306 5304 return length;
+17 -8
mm/sparse.c
··· 11 11 #include <linux/export.h> 12 12 #include <linux/spinlock.h> 13 13 #include <linux/vmalloc.h> 14 + #include <linux/swap.h> 15 + #include <linux/swapops.h> 14 16 15 17 #include "internal.h" 16 18 #include <asm/dma.h> ··· 472 470 static void *sparsemap_buf __meminitdata; 473 471 static void *sparsemap_buf_end __meminitdata; 474 472 473 + static inline void __meminit sparse_buffer_free(unsigned long size) 474 + { 475 + WARN_ON(!sparsemap_buf || size == 0); 476 + memblock_free_early(__pa(sparsemap_buf), size); 477 + } 478 + 475 479 static void __init sparse_buffer_init(unsigned long size, int nid) 476 480 { 477 481 phys_addr_t addr = __pa(MAX_DMA_ADDRESS); ··· 494 486 unsigned long size = sparsemap_buf_end - sparsemap_buf; 495 487 496 488 if (sparsemap_buf && size > 0) 497 - memblock_free_early(__pa(sparsemap_buf), size); 489 + sparse_buffer_free(size); 498 490 sparsemap_buf = NULL; 499 491 } 500 492 ··· 503 495 void *ptr = NULL; 504 496 505 497 if (sparsemap_buf) { 506 - ptr = PTR_ALIGN(sparsemap_buf, size); 498 + ptr = (void *) roundup((unsigned long)sparsemap_buf, size); 507 499 if (ptr + size > sparsemap_buf_end) 508 500 ptr = NULL; 509 - else 501 + else { 502 + /* Free redundant aligned space */ 503 + if ((unsigned long)(ptr - sparsemap_buf) > 0) 504 + sparse_buffer_free((unsigned long)(ptr - sparsemap_buf)); 510 505 sparsemap_buf = ptr + size; 506 + } 511 507 } 512 508 return ptr; 513 509 } ··· 879 867 */ 880 868 page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); 881 869 882 - ms = __pfn_to_section(start_pfn); 870 + ms = __nr_to_section(section_nr); 883 871 set_section_nid(section_nr, nid); 884 872 section_mark_present(ms); 885 873 ··· 896 884 { 897 885 int i; 898 886 899 - if (!memmap) 900 - return; 901 - 902 887 /* 903 888 * A further optimization is to have per section refcounted 904 889 * num_poisoned_pages. 
But that would need more space per memmap, so ··· 907 898 908 899 for (i = 0; i < nr_pages; i++) { 909 900 if (PageHWPoison(&memmap[i])) { 910 - atomic_long_sub(1, &num_poisoned_pages); 901 + num_poisoned_pages_dec(); 911 902 ClearPageHWPoison(&memmap[i]); 912 903 } 913 904 }
+7 -9
mm/swap.c
··· 71 71 spin_unlock_irqrestore(&pgdat->lru_lock, flags); 72 72 } 73 73 __ClearPageWaiters(page); 74 - mem_cgroup_uncharge(page); 75 74 } 76 75 77 76 static void __put_single_page(struct page *page) 78 77 { 79 78 __page_cache_release(page); 79 + mem_cgroup_uncharge(page); 80 80 free_unref_page(page); 81 81 } 82 82 ··· 515 515 del_page_from_lru_list(page, lruvec, lru + active); 516 516 ClearPageActive(page); 517 517 ClearPageReferenced(page); 518 - add_page_to_lru_list(page, lruvec, lru); 519 518 520 519 if (PageWriteback(page) || PageDirty(page)) { 521 520 /* ··· 522 523 * It can make readahead confusing. But race window 523 524 * is _really_ small and it's non-critical problem. 524 525 */ 526 + add_page_to_lru_list(page, lruvec, lru); 525 527 SetPageReclaim(page); 526 528 } else { 527 529 /* 528 530 * The page's writeback ends up during pagevec 529 531 * We moves tha page into tail of inactive. 530 532 */ 531 - list_move_tail(&page->lru, &lruvec->lists[lru]); 533 + add_page_to_lru_list_tail(page, lruvec, lru); 532 534 __count_vm_event(PGROTATED); 533 535 } 534 536 ··· 844 844 get_page(page_tail); 845 845 list_add_tail(&page_tail->lru, list); 846 846 } else { 847 - struct list_head *list_head; 848 847 /* 849 848 * Head page has not yet been counted, as an hpage, 850 849 * so we must account for each subpage individually. 851 850 * 852 - * Use the standard add function to put page_tail on the list, 853 - * but then correct its position so they all end up in order. 851 + * Put page_tail on the list at the correct position 852 + * so they all end up in order. 854 853 */ 855 - add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail)); 856 - list_head = page_tail->lru.prev; 857 - list_move_tail(&page_tail->lru, list_head); 854 + add_page_to_lru_list_tail(page_tail, lruvec, 855 + page_lru(page_tail)); 858 856 } 859 857 860 858 if (!PageUnevictable(page))
+3 -3
mm/swap_state.c
··· 116 116 struct address_space *address_space = swap_address_space(entry); 117 117 pgoff_t idx = swp_offset(entry); 118 118 XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); 119 - unsigned long i, nr = 1UL << compound_order(page); 119 + unsigned long i, nr = compound_nr(page); 120 120 121 121 VM_BUG_ON_PAGE(!PageLocked(page), page); 122 122 VM_BUG_ON_PAGE(PageSwapCache(page), page); ··· 133 133 for (i = 0; i < nr; i++) { 134 134 VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); 135 135 set_page_private(page + i, entry.val + i); 136 - xas_store(&xas, page + i); 136 + xas_store(&xas, page); 137 137 xas_next(&xas); 138 138 } 139 139 address_space->nrpages += nr; ··· 168 168 169 169 for (i = 0; i < nr; i++) { 170 170 void *entry = xas_store(&xas, NULL); 171 - VM_BUG_ON_PAGE(entry != page + i, entry); 171 + VM_BUG_ON_PAGE(entry != page, entry); 172 172 set_page_private(page + i, 0); 173 173 xas_next(&xas); 174 174 }
+120 -2
mm/util.c
··· 16 16 #include <linux/hugetlb.h> 17 17 #include <linux/vmalloc.h> 18 18 #include <linux/userfaultfd_k.h> 19 + #include <linux/elf.h> 20 + #include <linux/elf-randomize.h> 21 + #include <linux/personality.h> 22 + #include <linux/random.h> 23 + #include <linux/processor.h> 24 + #include <linux/sizes.h> 25 + #include <linux/compat.h> 19 26 20 27 #include <linux/uaccess.h> 21 28 ··· 300 293 return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); 301 294 } 302 295 303 - #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 296 + #ifndef STACK_RND_MASK 297 + #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ 298 + #endif 299 + 300 + unsigned long randomize_stack_top(unsigned long stack_top) 301 + { 302 + unsigned long random_variable = 0; 303 + 304 + if (current->flags & PF_RANDOMIZE) { 305 + random_variable = get_random_long(); 306 + random_variable &= STACK_RND_MASK; 307 + random_variable <<= PAGE_SHIFT; 308 + } 309 + #ifdef CONFIG_STACK_GROWSUP 310 + return PAGE_ALIGN(stack_top) + random_variable; 311 + #else 312 + return PAGE_ALIGN(stack_top) - random_variable; 313 + #endif 314 + } 315 + 316 + #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT 317 + unsigned long arch_randomize_brk(struct mm_struct *mm) 318 + { 319 + /* Is the current task 32bit ? 
*/ 320 + if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) 321 + return randomize_page(mm->brk, SZ_32M); 322 + 323 + return randomize_page(mm->brk, SZ_1G); 324 + } 325 + 326 + unsigned long arch_mmap_rnd(void) 327 + { 328 + unsigned long rnd; 329 + 330 + #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS 331 + if (is_compat_task()) 332 + rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); 333 + else 334 + #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ 335 + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); 336 + 337 + return rnd << PAGE_SHIFT; 338 + } 339 + 340 + static int mmap_is_legacy(struct rlimit *rlim_stack) 341 + { 342 + if (current->personality & ADDR_COMPAT_LAYOUT) 343 + return 1; 344 + 345 + if (rlim_stack->rlim_cur == RLIM_INFINITY) 346 + return 1; 347 + 348 + return sysctl_legacy_va_layout; 349 + } 350 + 351 + /* 352 + * Leave enough space between the mmap area and the stack to honour ulimit in 353 + * the face of randomisation. 354 + */ 355 + #define MIN_GAP (SZ_128M) 356 + #define MAX_GAP (STACK_TOP / 6 * 5) 357 + 358 + static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) 359 + { 360 + unsigned long gap = rlim_stack->rlim_cur; 361 + unsigned long pad = stack_guard_gap; 362 + 363 + /* Account for stack randomization if necessary */ 364 + if (current->flags & PF_RANDOMIZE) 365 + pad += (STACK_RND_MASK << PAGE_SHIFT); 366 + 367 + /* Values close to RLIM_INFINITY can overflow. 
*/ 368 + if (gap + pad > gap) 369 + gap += pad; 370 + 371 + if (gap < MIN_GAP) 372 + gap = MIN_GAP; 373 + else if (gap > MAX_GAP) 374 + gap = MAX_GAP; 375 + 376 + return PAGE_ALIGN(STACK_TOP - gap - rnd); 377 + } 378 + 379 + void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) 380 + { 381 + unsigned long random_factor = 0UL; 382 + 383 + if (current->flags & PF_RANDOMIZE) 384 + random_factor = arch_mmap_rnd(); 385 + 386 + if (mmap_is_legacy(rlim_stack)) { 387 + mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; 388 + mm->get_unmapped_area = arch_get_unmapped_area; 389 + } else { 390 + mm->mmap_base = mmap_base(random_factor, rlim_stack); 391 + mm->get_unmapped_area = arch_get_unmapped_area_topdown; 392 + } 393 + } 394 + #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 304 395 void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) 305 396 { 306 397 mm->mmap_base = TASK_UNMAPPED_BASE; ··· 626 521 return true; 627 522 if (PageHuge(page)) 628 523 return false; 629 - for (i = 0; i < (1 << compound_order(page)); i++) { 524 + for (i = 0; i < compound_nr(page); i++) { 630 525 if (atomic_read(&page[i]._mapcount) >= 0) 631 526 return true; 632 527 } ··· 887 782 mmput(mm); 888 783 out: 889 784 return res; 785 + } 786 + 787 + int memcmp_pages(struct page *page1, struct page *page2) 788 + { 789 + char *addr1, *addr2; 790 + int ret; 791 + 792 + addr1 = kmap_atomic(page1); 793 + addr2 = kmap_atomic(page2); 794 + ret = memcmp(addr1, addr2, PAGE_SIZE); 795 + kunmap_atomic(addr2); 796 + kunmap_atomic(addr1); 797 + return ret; 890 798 }
+59 -25
mm/vmalloc.c
··· 329 329 #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 330 330 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 331 331 332 - #define VM_LAZY_FREE 0x02 333 - #define VM_VM_AREA 0x04 334 332 335 333 static DEFINE_SPINLOCK(vmap_area_lock); 336 334 /* Export for kexec only */ ··· 1114 1116 1115 1117 va->va_start = addr; 1116 1118 va->va_end = addr + size; 1117 - va->flags = 0; 1119 + va->vm = NULL; 1118 1120 insert_vmap_area(va, &vmap_area_root, &vmap_area_list); 1119 1121 1120 1122 spin_unlock(&vmap_area_lock); ··· 1280 1282 llist_for_each_entry_safe(va, n_va, valist, purge_list) { 1281 1283 unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; 1282 1284 1283 - __free_vmap_area(va); 1285 + /* 1286 + * Finally insert or merge lazily-freed area. It is 1287 + * detached and there is no need to "unlink" it from 1288 + * anything. 1289 + */ 1290 + merge_or_add_vmap_area(va, 1291 + &free_vmap_area_root, &free_vmap_area_list); 1292 + 1284 1293 atomic_long_sub(nr, &vmap_lazy_nr); 1285 1294 1286 1295 if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) ··· 1328 1323 static void free_vmap_area_noflush(struct vmap_area *va) 1329 1324 { 1330 1325 unsigned long nr_lazy; 1326 + 1327 + spin_lock(&vmap_area_lock); 1328 + unlink_va(va, &vmap_area_root); 1329 + spin_unlock(&vmap_area_lock); 1331 1330 1332 1331 nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> 1333 1332 PAGE_SHIFT, &vmap_lazy_nr); ··· 1927 1918 if (WARN_ON_ONCE(!va)) 1928 1919 continue; 1929 1920 1930 - va->flags = VM_VM_AREA; 1931 1921 va->va_start = (unsigned long)tmp->addr; 1932 1922 va->va_end = va->va_start + tmp->size; 1933 1923 va->vm = tmp; ··· 2024 2016 vm->size = va->va_end - va->va_start; 2025 2017 vm->caller = caller; 2026 2018 va->vm = vm; 2027 - va->flags |= VM_VM_AREA; 2028 2019 spin_unlock(&vmap_area_lock); 2029 2020 } 2030 2021 ··· 2128 2121 struct vmap_area *va; 2129 2122 2130 2123 va = find_vmap_area((unsigned long)addr); 2131 - if (va && va->flags & VM_VM_AREA) 2132 - return va->vm; 
2124 + if (!va) 2125 + return NULL; 2133 2126 2134 - return NULL; 2127 + return va->vm; 2135 2128 } 2136 2129 2137 2130 /** ··· 2150 2143 2151 2144 might_sleep(); 2152 2145 2153 - va = find_vmap_area((unsigned long)addr); 2154 - if (va && va->flags & VM_VM_AREA) { 2146 + spin_lock(&vmap_area_lock); 2147 + va = __find_vmap_area((unsigned long)addr); 2148 + if (va && va->vm) { 2155 2149 struct vm_struct *vm = va->vm; 2156 2150 2157 - spin_lock(&vmap_area_lock); 2158 2151 va->vm = NULL; 2159 - va->flags &= ~VM_VM_AREA; 2160 - va->flags |= VM_LAZY_FREE; 2161 2152 spin_unlock(&vmap_area_lock); 2162 2153 2163 2154 kasan_free_shadow(vm); ··· 2163 2158 2164 2159 return vm; 2165 2160 } 2161 + 2162 + spin_unlock(&vmap_area_lock); 2166 2163 return NULL; 2167 2164 } 2168 2165 ··· 2409 2402 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; 2410 2403 array_size = (nr_pages * sizeof(struct page *)); 2411 2404 2412 - area->nr_pages = nr_pages; 2413 2405 /* Please note that the recursion is strictly bounded. 
*/ 2414 2406 if (array_size > PAGE_SIZE) { 2415 2407 pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, ··· 2416 2410 } else { 2417 2411 pages = kmalloc_node(array_size, nested_gfp, node); 2418 2412 } 2419 - area->pages = pages; 2420 - if (!area->pages) { 2413 + 2414 + if (!pages) { 2421 2415 remove_vm_area(area->addr); 2422 2416 kfree(area); 2423 2417 return NULL; 2424 2418 } 2419 + 2420 + area->pages = pages; 2421 + area->nr_pages = nr_pages; 2425 2422 2426 2423 for (i = 0; i < area->nr_pages; i++) { 2427 2424 struct page *page; ··· 2860 2851 if (!count) 2861 2852 break; 2862 2853 2863 - if (!(va->flags & VM_VM_AREA)) 2854 + if (!va->vm) 2864 2855 continue; 2865 2856 2866 2857 vm = va->vm; ··· 2940 2931 if (!count) 2941 2932 break; 2942 2933 2943 - if (!(va->flags & VM_VM_AREA)) 2934 + if (!va->vm) 2944 2935 continue; 2945 2936 2946 2937 vm = va->vm; ··· 3459 3450 } 3460 3451 } 3461 3452 3453 + static void show_purge_info(struct seq_file *m) 3454 + { 3455 + struct llist_node *head; 3456 + struct vmap_area *va; 3457 + 3458 + head = READ_ONCE(vmap_purge_list.first); 3459 + if (head == NULL) 3460 + return; 3461 + 3462 + llist_for_each_entry(va, head, purge_list) { 3463 + seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", 3464 + (void *)va->va_start, (void *)va->va_end, 3465 + va->va_end - va->va_start); 3466 + } 3467 + } 3468 + 3462 3469 static int s_show(struct seq_file *m, void *p) 3463 3470 { 3464 3471 struct vmap_area *va; ··· 3483 3458 va = list_entry(p, struct vmap_area, list); 3484 3459 3485 3460 /* 3486 - * s_show can encounter race with remove_vm_area, !VM_VM_AREA on 3487 - * behalf of vmap area is being tear down or vm_map_ram allocation. 3461 + * s_show can encounter race with remove_vm_area, !vm on behalf 3462 + * of vmap area is being tear down or vm_map_ram allocation. 
3488 3463 */ 3489 - if (!(va->flags & VM_VM_AREA)) { 3490 - seq_printf(m, "0x%pK-0x%pK %7ld %s\n", 3464 + if (!va->vm) { 3465 + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", 3491 3466 (void *)va->va_start, (void *)va->va_end, 3492 - va->va_end - va->va_start, 3493 - va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram"); 3467 + va->va_end - va->va_start); 3494 3468 3495 3469 return 0; 3496 3470 } ··· 3528 3504 3529 3505 show_numa_info(m, v); 3530 3506 seq_putc(m, '\n'); 3507 + 3508 + /* 3509 + * As a final step, dump "unpurged" areas. Note, 3510 + * that entire "/proc/vmallocinfo" output will not 3511 + * be address sorted, because the purge list is not 3512 + * sorted. 3513 + */ 3514 + if (list_is_last(&va->list, &vmap_area_list)) 3515 + show_purge_info(m); 3516 + 3531 3517 return 0; 3532 3518 } 3533 3519
+59 -90
mm/vmscan.c
··· 171 171 */ 172 172 unsigned long vm_total_pages; 173 173 174 + static void set_task_reclaim_state(struct task_struct *task, 175 + struct reclaim_state *rs) 176 + { 177 + /* Check for an overwrite */ 178 + WARN_ON_ONCE(rs && task->reclaim_state); 179 + 180 + /* Check for the nulling of an already-nulled member */ 181 + WARN_ON_ONCE(!rs && !task->reclaim_state); 182 + 183 + task->reclaim_state = rs; 184 + } 185 + 174 186 static LIST_HEAD(shrinker_list); 175 187 static DECLARE_RWSEM(shrinker_rwsem); 176 188 177 - #ifdef CONFIG_MEMCG_KMEM 178 - 189 + #ifdef CONFIG_MEMCG 179 190 /* 180 191 * We allow subsystems to populate their shrinker-related 181 192 * LRU lists before register_shrinker_prepared() is called ··· 238 227 idr_remove(&shrinker_idr, id); 239 228 up_write(&shrinker_rwsem); 240 229 } 241 - #else /* CONFIG_MEMCG_KMEM */ 242 - static int prealloc_memcg_shrinker(struct shrinker *shrinker) 243 - { 244 - return 0; 245 - } 246 230 247 - static void unregister_memcg_shrinker(struct shrinker *shrinker) 248 - { 249 - } 250 - #endif /* CONFIG_MEMCG_KMEM */ 251 - 252 - static void set_task_reclaim_state(struct task_struct *task, 253 - struct reclaim_state *rs) 254 - { 255 - /* Check for an overwrite */ 256 - WARN_ON_ONCE(rs && task->reclaim_state); 257 - 258 - /* Check for the nulling of an already-nulled member */ 259 - WARN_ON_ONCE(!rs && !task->reclaim_state); 260 - 261 - task->reclaim_state = rs; 262 - } 263 - 264 - #ifdef CONFIG_MEMCG 265 231 static bool global_reclaim(struct scan_control *sc) 266 232 { 267 233 return !sc->target_mem_cgroup; ··· 293 305 294 306 } 295 307 #else 308 + static int prealloc_memcg_shrinker(struct shrinker *shrinker) 309 + { 310 + return 0; 311 + } 312 + 313 + static void unregister_memcg_shrinker(struct shrinker *shrinker) 314 + { 315 + } 316 + 296 317 static bool global_reclaim(struct scan_control *sc) 297 318 { 298 319 return true; ··· 588 591 return freed; 589 592 } 590 593 591 - #ifdef CONFIG_MEMCG_KMEM 594 + #ifdef 
CONFIG_MEMCG 592 595 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, 593 596 struct mem_cgroup *memcg, int priority) 594 597 { ··· 596 599 unsigned long ret, freed = 0; 597 600 int i; 598 601 599 - if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) 602 + if (!mem_cgroup_online(memcg)) 600 603 return 0; 601 604 602 605 if (!down_read_trylock(&shrinker_rwsem)) ··· 621 624 clear_bit(i, map->map); 622 625 continue; 623 626 } 627 + 628 + /* Call non-slab shrinkers even though kmem is disabled */ 629 + if (!memcg_kmem_enabled() && 630 + !(shrinker->flags & SHRINKER_NONSLAB)) 631 + continue; 624 632 625 633 ret = do_shrink_slab(&sc, shrinker, priority); 626 634 if (ret == SHRINK_EMPTY) { ··· 663 661 up_read(&shrinker_rwsem); 664 662 return freed; 665 663 } 666 - #else /* CONFIG_MEMCG_KMEM */ 664 + #else /* CONFIG_MEMCG */ 667 665 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, 668 666 struct mem_cgroup *memcg, int priority) 669 667 { 670 668 return 0; 671 669 } 672 - #endif /* CONFIG_MEMCG_KMEM */ 670 + #endif /* CONFIG_MEMCG */ 673 671 674 672 /** 675 673 * shrink_slab - shrink slab caches ··· 1151 1149 1152 1150 VM_BUG_ON_PAGE(PageActive(page), page); 1153 1151 1154 - nr_pages = 1 << compound_order(page); 1152 + nr_pages = compound_nr(page); 1155 1153 1156 1154 /* Account the number of base pages even though THP */ 1157 1155 sc->nr_scanned += nr_pages; ··· 1489 1487 * Is there need to periodically free_page_list? 
It would 1490 1488 * appear not as the counts should be low 1491 1489 */ 1492 - if (unlikely(PageTransHuge(page))) { 1493 - mem_cgroup_uncharge(page); 1490 + if (unlikely(PageTransHuge(page))) 1494 1491 (*get_compound_page_dtor(page))(page); 1495 - } else 1492 + else 1496 1493 list_add(&page->lru, &free_pages); 1497 1494 continue; 1498 1495 ··· 1706 1705 1707 1706 VM_BUG_ON_PAGE(!PageLRU(page), page); 1708 1707 1709 - nr_pages = 1 << compound_order(page); 1708 + nr_pages = compound_nr(page); 1710 1709 total_scan += nr_pages; 1711 1710 1712 1711 if (page_zonenum(page) > sc->reclaim_idx) { ··· 1912 1911 1913 1912 if (unlikely(PageCompound(page))) { 1914 1913 spin_unlock_irq(&pgdat->lru_lock); 1915 - mem_cgroup_uncharge(page); 1916 1914 (*get_compound_page_dtor(page))(page); 1917 1915 spin_lock_irq(&pgdat->lru_lock); 1918 1916 } else ··· 2586 2586 */ 2587 2587 static inline bool should_continue_reclaim(struct pglist_data *pgdat, 2588 2588 unsigned long nr_reclaimed, 2589 - unsigned long nr_scanned, 2590 2589 struct scan_control *sc) 2591 2590 { 2592 2591 unsigned long pages_for_compaction; ··· 2596 2597 if (!in_reclaim_compaction(sc)) 2597 2598 return false; 2598 2599 2599 - /* Consider stopping depending on scan and reclaim activity */ 2600 - if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) { 2601 - /* 2602 - * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the 2603 - * full LRU list has been scanned and we are still failing 2604 - * to reclaim pages. This full LRU scan is potentially 2605 - * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed 2606 - */ 2607 - if (!nr_reclaimed && !nr_scanned) 2608 - return false; 2609 - } else { 2610 - /* 2611 - * For non-__GFP_RETRY_MAYFAIL allocations which can presumably 2612 - * fail without consequence, stop if we failed to reclaim 2613 - * any pages from the last SWAP_CLUSTER_MAX number of 2614 - * pages that were scanned. 
This will return to the 2615 - * caller faster at the risk reclaim/compaction and 2616 - * the resulting allocation attempt fails 2617 - */ 2618 - if (!nr_reclaimed) 2619 - return false; 2620 - } 2621 - 2622 2600 /* 2623 - * If we have not reclaimed enough pages for compaction and the 2624 - * inactive lists are large enough, continue reclaiming 2601 + * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX 2602 + * number of pages that were scanned. This will return to the caller 2603 + * with the risk reclaim/compaction and the resulting allocation attempt 2604 + * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL 2605 + * allocations through requiring that the full LRU list has been scanned 2606 + * first, by assuming that zero delta of sc->nr_scanned means full LRU 2607 + * scan, but that approximation was wrong, and there were corner cases 2608 + * where always a non-zero amount of pages were scanned. 2625 2609 */ 2626 - pages_for_compaction = compact_gap(sc->order); 2627 - inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); 2628 - if (get_nr_swap_pages() > 0) 2629 - inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); 2630 - if (sc->nr_reclaimed < pages_for_compaction && 2631 - inactive_lru_pages > pages_for_compaction) 2632 - return true; 2610 + if (!nr_reclaimed) 2611 + return false; 2633 2612 2634 2613 /* If compaction would go ahead or the allocation would succeed, stop */ 2635 2614 for (z = 0; z <= sc->reclaim_idx; z++) { ··· 2624 2647 ; 2625 2648 } 2626 2649 } 2627 - return true; 2650 + 2651 + /* 2652 + * If we have not reclaimed enough pages for compaction and the 2653 + * inactive lists are large enough, continue reclaiming 2654 + */ 2655 + pages_for_compaction = compact_gap(sc->order); 2656 + inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); 2657 + if (get_nr_swap_pages() > 0) 2658 + inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); 2659 + 2660 + return inactive_lru_pages 
> pages_for_compaction; 2628 2661 } 2629 2662 2630 2663 static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) ··· 2651 2664 2652 2665 do { 2653 2666 struct mem_cgroup *root = sc->target_mem_cgroup; 2654 - struct mem_cgroup_reclaim_cookie reclaim = { 2655 - .pgdat = pgdat, 2656 - .priority = sc->priority, 2657 - }; 2658 2667 unsigned long node_lru_pages = 0; 2659 2668 struct mem_cgroup *memcg; 2660 2669 ··· 2659 2676 nr_reclaimed = sc->nr_reclaimed; 2660 2677 nr_scanned = sc->nr_scanned; 2661 2678 2662 - memcg = mem_cgroup_iter(root, NULL, &reclaim); 2679 + memcg = mem_cgroup_iter(root, NULL, NULL); 2663 2680 do { 2664 2681 unsigned long lru_pages; 2665 2682 unsigned long reclaimed; ··· 2702 2719 sc->nr_scanned - scanned, 2703 2720 sc->nr_reclaimed - reclaimed); 2704 2721 2705 - /* 2706 - * Kswapd have to scan all memory cgroups to fulfill 2707 - * the overall scan target for the node. 2708 - * 2709 - * Limit reclaim, on the other hand, only cares about 2710 - * nr_to_reclaim pages to be reclaimed and it will 2711 - * retry with decreasing priority if one round over the 2712 - * whole hierarchy is not sufficient. 2713 - */ 2714 - if (!current_is_kswapd() && 2715 - sc->nr_reclaimed >= sc->nr_to_reclaim) { 2716 - mem_cgroup_iter_break(root, memcg); 2717 - break; 2718 - } 2719 - } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); 2722 + } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); 2720 2723 2721 2724 if (reclaim_state) { 2722 2725 sc->nr_reclaimed += reclaim_state->reclaimed_slab; ··· 2779 2810 wait_iff_congested(BLK_RW_ASYNC, HZ/10); 2780 2811 2781 2812 } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, 2782 - sc->nr_scanned - nr_scanned, sc)); 2813 + sc)); 2783 2814 2784 2815 /* 2785 2816 * Kswapd gives up on balancing particular nodes after too
+2
mm/vmstat.c
··· 1158 1158 "nr_shmem", 1159 1159 "nr_shmem_hugepages", 1160 1160 "nr_shmem_pmdmapped", 1161 + "nr_file_hugepages", 1162 + "nr_file_pmdmapped", 1161 1163 "nr_anon_transparent_hugepages", 1162 1164 "nr_unstable", 1163 1165 "nr_vmscan_write",
+43 -111
mm/z3fold.c
··· 41 41 #include <linux/workqueue.h> 42 42 #include <linux/slab.h> 43 43 #include <linux/spinlock.h> 44 - #include <linux/wait.h> 45 44 #include <linux/zpool.h> 46 45 #include <linux/magic.h> 47 46 ··· 145 146 * @release_wq: workqueue for safe page release 146 147 * @work: work_struct for safe page release 147 148 * @inode: inode for z3fold pseudo filesystem 148 - * @destroying: bool to stop migration once we start destruction 149 - * @isolated: int to count the number of pages currently in isolation 150 149 * 151 150 * This structure is allocated at pool creation time and maintains metadata 152 151 * pertaining to a particular z3fold pool. ··· 163 166 const struct zpool_ops *zpool_ops; 164 167 struct workqueue_struct *compact_wq; 165 168 struct workqueue_struct *release_wq; 166 - struct wait_queue_head isolate_wait; 167 169 struct work_struct work; 168 170 struct inode *inode; 169 - bool destroying; 170 - int isolated; 171 171 }; 172 172 173 173 /* ··· 295 301 } 296 302 297 303 /* Initializes the z3fold header of a newly allocated z3fold page */ 298 - static struct z3fold_header *init_z3fold_page(struct page *page, 304 + static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, 299 305 struct z3fold_pool *pool, gfp_t gfp) 300 306 { 301 307 struct z3fold_header *zhdr = page_address(page); 302 - struct z3fold_buddy_slots *slots = alloc_slots(pool, gfp); 303 - 304 - if (!slots) 305 - return NULL; 308 + struct z3fold_buddy_slots *slots; 306 309 307 310 INIT_LIST_HEAD(&page->lru); 308 311 clear_bit(PAGE_HEADLESS, &page->private); ··· 307 316 clear_bit(NEEDS_COMPACTING, &page->private); 308 317 clear_bit(PAGE_STALE, &page->private); 309 318 clear_bit(PAGE_CLAIMED, &page->private); 319 + if (headless) 320 + return zhdr; 321 + 322 + slots = alloc_slots(pool, gfp); 323 + if (!slots) 324 + return NULL; 310 325 311 326 spin_lock_init(&zhdr->page_lock); 312 327 kref_init(&zhdr->refcount); ··· 369 372 * Encodes the handle of a particular buddy within a 
z3fold page 370 373 * Pool lock should be held as this function accesses first_num 371 374 */ 372 - static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) 375 + static unsigned long __encode_handle(struct z3fold_header *zhdr, 376 + struct z3fold_buddy_slots *slots, 377 + enum buddy bud) 373 378 { 374 - struct z3fold_buddy_slots *slots; 375 379 unsigned long h = (unsigned long)zhdr; 376 380 int idx = 0; 377 381 ··· 389 391 if (bud == LAST) 390 392 h |= (zhdr->last_chunks << BUDDY_SHIFT); 391 393 392 - slots = zhdr->slots; 393 394 slots->slot[idx] = h; 394 395 return (unsigned long)&slots->slot[idx]; 396 + } 397 + 398 + static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) 399 + { 400 + return __encode_handle(zhdr, zhdr->slots, bud); 395 401 } 396 402 397 403 /* Returns the z3fold page where a given handle is stored */ ··· 632 630 } 633 631 634 632 if (unlikely(PageIsolated(page) || 633 + test_bit(PAGE_CLAIMED, &page->private) || 635 634 test_bit(PAGE_STALE, &page->private))) { 636 635 z3fold_page_unlock(zhdr); 637 636 return; ··· 778 775 goto out_c; 779 776 spin_lock_init(&pool->lock); 780 777 spin_lock_init(&pool->stale_lock); 781 - init_waitqueue_head(&pool->isolate_wait); 782 778 pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); 783 779 if (!pool->unbuddied) 784 780 goto out_pool; ··· 817 815 return NULL; 818 816 } 819 817 820 - static bool pool_isolated_are_drained(struct z3fold_pool *pool) 821 - { 822 - bool ret; 823 - 824 - spin_lock(&pool->lock); 825 - ret = pool->isolated == 0; 826 - spin_unlock(&pool->lock); 827 - return ret; 828 - } 829 818 /** 830 819 * z3fold_destroy_pool() - destroys an existing z3fold pool 831 820 * @pool: the z3fold pool to be destroyed ··· 826 833 static void z3fold_destroy_pool(struct z3fold_pool *pool) 827 834 { 828 835 kmem_cache_destroy(pool->c_handle); 829 - /* 830 - * We set pool-> destroying under lock to ensure that 831 - * z3fold_page_isolate() sees any 
changes to destroying. This way we 832 - * avoid the need for any memory barriers. 833 - */ 834 - 835 - spin_lock(&pool->lock); 836 - pool->destroying = true; 837 - spin_unlock(&pool->lock); 838 - 839 - /* 840 - * We need to ensure that no pages are being migrated while we destroy 841 - * these workqueues, as migration can queue work on either of the 842 - * workqueues. 843 - */ 844 - wait_event(pool->isolate_wait, !pool_isolated_are_drained(pool)); 845 836 846 837 /* 847 838 * We need to destroy pool->compact_wq before pool->release_wq, ··· 933 956 if (!page) 934 957 return -ENOMEM; 935 958 936 - zhdr = init_z3fold_page(page, pool, gfp); 959 + zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp); 937 960 if (!zhdr) { 938 961 __free_page(page); 939 962 return -ENOMEM; ··· 1109 1132 struct z3fold_header *zhdr = NULL; 1110 1133 struct page *page = NULL; 1111 1134 struct list_head *pos; 1135 + struct z3fold_buddy_slots slots; 1112 1136 unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; 1113 1137 1114 1138 spin_lock(&pool->lock); ··· 1128 1150 /* this bit could have been set by free, in which case 1129 1151 * we pass over to the next page in the pool. 
1130 1152 */ 1131 - if (test_and_set_bit(PAGE_CLAIMED, &page->private)) 1153 + if (test_and_set_bit(PAGE_CLAIMED, &page->private)) { 1154 + page = NULL; 1132 1155 continue; 1156 + } 1133 1157 1134 - if (unlikely(PageIsolated(page))) 1158 + if (unlikely(PageIsolated(page))) { 1159 + clear_bit(PAGE_CLAIMED, &page->private); 1160 + page = NULL; 1135 1161 continue; 1162 + } 1163 + zhdr = page_address(page); 1136 1164 if (test_bit(PAGE_HEADLESS, &page->private)) 1137 1165 break; 1138 1166 1139 - zhdr = page_address(page); 1140 1167 if (!z3fold_page_trylock(zhdr)) { 1168 + clear_bit(PAGE_CLAIMED, &page->private); 1141 1169 zhdr = NULL; 1142 1170 continue; /* can't evict at this point */ 1143 1171 } ··· 1161 1177 1162 1178 if (!test_bit(PAGE_HEADLESS, &page->private)) { 1163 1179 /* 1164 - * We need encode the handles before unlocking, since 1165 - * we can race with free that will set 1166 - * (first|last)_chunks to 0 1180 + * We need encode the handles before unlocking, and 1181 + * use our local slots structure because z3fold_free 1182 + * can zero out zhdr->slots and we can't do much 1183 + * about that 1167 1184 */ 1168 1185 first_handle = 0; 1169 1186 last_handle = 0; 1170 1187 middle_handle = 0; 1171 1188 if (zhdr->first_chunks) 1172 - first_handle = encode_handle(zhdr, FIRST); 1189 + first_handle = __encode_handle(zhdr, &slots, 1190 + FIRST); 1173 1191 if (zhdr->middle_chunks) 1174 - middle_handle = encode_handle(zhdr, MIDDLE); 1192 + middle_handle = __encode_handle(zhdr, &slots, 1193 + MIDDLE); 1175 1194 if (zhdr->last_chunks) 1176 - last_handle = encode_handle(zhdr, LAST); 1195 + last_handle = __encode_handle(zhdr, &slots, 1196 + LAST); 1177 1197 /* 1178 1198 * it's safe to unlock here because we hold a 1179 1199 * reference to this page 1180 1200 */ 1181 1201 z3fold_page_unlock(zhdr); 1182 1202 } else { 1183 - first_handle = encode_handle(zhdr, HEADLESS); 1203 + first_handle = __encode_handle(zhdr, &slots, HEADLESS); 1184 1204 last_handle = middle_handle = 0; 
1185 1205 } 1186 1206 ··· 1214 1226 spin_lock(&pool->lock); 1215 1227 list_add(&page->lru, &pool->lru); 1216 1228 spin_unlock(&pool->lock); 1229 + clear_bit(PAGE_CLAIMED, &page->private); 1217 1230 } else { 1218 1231 z3fold_page_lock(zhdr); 1219 - clear_bit(PAGE_CLAIMED, &page->private); 1220 1232 if (kref_put(&zhdr->refcount, 1221 1233 release_z3fold_page_locked)) { 1222 1234 atomic64_dec(&pool->pages_nr); ··· 1231 1243 list_add(&page->lru, &pool->lru); 1232 1244 spin_unlock(&pool->lock); 1233 1245 z3fold_page_unlock(zhdr); 1246 + clear_bit(PAGE_CLAIMED, &page->private); 1234 1247 } 1235 1248 1236 1249 /* We started off locked to we need to lock the pool back */ ··· 1328 1339 return atomic64_read(&pool->pages_nr); 1329 1340 } 1330 1341 1331 - /* 1332 - * z3fold_dec_isolated() expects to be called while pool->lock is held. 1333 - */ 1334 - static void z3fold_dec_isolated(struct z3fold_pool *pool) 1335 - { 1336 - assert_spin_locked(&pool->lock); 1337 - VM_BUG_ON(pool->isolated <= 0); 1338 - pool->isolated--; 1339 - 1340 - /* 1341 - * If we have no more isolated pages, we have to see if 1342 - * z3fold_destroy_pool() is waiting for a signal. 
1343 - */ 1344 - if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait)) 1345 - wake_up_all(&pool->isolate_wait); 1346 - } 1347 - 1348 - static void z3fold_inc_isolated(struct z3fold_pool *pool) 1349 - { 1350 - pool->isolated++; 1351 - } 1352 - 1353 1342 static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) 1354 1343 { 1355 1344 struct z3fold_header *zhdr; ··· 1336 1369 VM_BUG_ON_PAGE(!PageMovable(page), page); 1337 1370 VM_BUG_ON_PAGE(PageIsolated(page), page); 1338 1371 1339 - if (test_bit(PAGE_HEADLESS, &page->private)) 1372 + if (test_bit(PAGE_HEADLESS, &page->private) || 1373 + test_bit(PAGE_CLAIMED, &page->private)) 1340 1374 return false; 1341 1375 1342 1376 zhdr = page_address(page); ··· 1355 1387 spin_lock(&pool->lock); 1356 1388 if (!list_empty(&page->lru)) 1357 1389 list_del(&page->lru); 1358 - /* 1359 - * We need to check for destruction while holding pool->lock, as 1360 - * otherwise destruction could see 0 isolated pages, and 1361 - * proceed. 1362 - */ 1363 - if (unlikely(pool->destroying)) { 1364 - spin_unlock(&pool->lock); 1365 - /* 1366 - * If this page isn't stale, somebody else holds a 1367 - * reference to it. Let't drop our refcount so that they 1368 - * can call the release logic. 1369 - */ 1370 - if (unlikely(kref_put(&zhdr->refcount, 1371 - release_z3fold_page_locked))) { 1372 - /* 1373 - * If we get here we have kref problems, so we 1374 - * should freak out. 
1375 - */ 1376 - WARN(1, "Z3fold is experiencing kref problems\n"); 1377 - z3fold_page_unlock(zhdr); 1378 - return false; 1379 - } 1380 - z3fold_page_unlock(zhdr); 1381 - return false; 1382 - } 1383 - 1384 - 1385 - z3fold_inc_isolated(pool); 1386 1390 spin_unlock(&pool->lock); 1387 1391 z3fold_page_unlock(zhdr); 1388 1392 return true; ··· 1423 1483 1424 1484 queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); 1425 1485 1426 - spin_lock(&pool->lock); 1427 - z3fold_dec_isolated(pool); 1428 - spin_unlock(&pool->lock); 1429 - 1430 1486 page_mapcount_reset(page); 1431 1487 put_page(page); 1432 1488 return 0; ··· 1442 1506 INIT_LIST_HEAD(&page->lru); 1443 1507 if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { 1444 1508 atomic64_dec(&pool->pages_nr); 1445 - spin_lock(&pool->lock); 1446 - z3fold_dec_isolated(pool); 1447 - spin_unlock(&pool->lock); 1448 1509 return; 1449 1510 } 1450 1511 spin_lock(&pool->lock); 1451 1512 list_add(&page->lru, &pool->lru); 1452 - z3fold_dec_isolated(pool); 1453 1513 spin_unlock(&pool->lock); 1454 1514 z3fold_page_unlock(zhdr); 1455 1515 }
+16
mm/zpool.c
··· 239 239 } 240 240 241 241 /** 242 + * zpool_malloc_support_movable() - Check if the zpool support 243 + * allocate movable memory 244 + * @zpool: The zpool to check 245 + * 246 + * This returns if the zpool support allocate movable memory. 247 + * 248 + * Implementations must guarantee this to be thread-safe. 249 + * 250 + * Returns: true if if the zpool support allocate movable memory, false if not 251 + */ 252 + bool zpool_malloc_support_movable(struct zpool *zpool) 253 + { 254 + return zpool->driver->malloc_support_movable; 255 + } 256 + 257 + /** 242 258 * zpool_malloc() - Allocate memory 243 259 * @zpool: The zpool to allocate from. 244 260 * @size: The amount of memory to allocate.
+10 -13
mm/zsmalloc.c
··· 443 443 } 444 444 445 445 static struct zpool_driver zs_zpool_driver = { 446 - .type = "zsmalloc", 447 - .owner = THIS_MODULE, 448 - .create = zs_zpool_create, 449 - .destroy = zs_zpool_destroy, 450 - .malloc = zs_zpool_malloc, 451 - .free = zs_zpool_free, 452 - .map = zs_zpool_map, 453 - .unmap = zs_zpool_unmap, 454 - .total_size = zs_zpool_total_size, 446 + .type = "zsmalloc", 447 + .owner = THIS_MODULE, 448 + .create = zs_zpool_create, 449 + .destroy = zs_zpool_destroy, 450 + .malloc_support_movable = true, 451 + .malloc = zs_zpool_malloc, 452 + .free = zs_zpool_free, 453 + .map = zs_zpool_map, 454 + .unmap = zs_zpool_unmap, 455 + .total_size = zs_zpool_total_size, 455 456 }; 456 457 457 458 MODULE_ALIAS("zpool-zsmalloc"); ··· 477 476 return zspage->inuse; 478 477 } 479 478 480 - static inline void set_zspage_inuse(struct zspage *zspage, int val) 481 - { 482 - zspage->inuse = val; 483 - } 484 479 485 480 static inline void mod_zspage_inuse(struct zspage *zspage, int val) 486 481 {
+8 -7
mm/zswap.c
··· 856 856 /* extract swpentry from data */ 857 857 zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); 858 858 swpentry = zhdr->swpentry; /* here */ 859 - zpool_unmap_handle(pool, handle); 860 859 tree = zswap_trees[swp_type(swpentry)]; 861 860 offset = swp_offset(swpentry); 862 861 ··· 865 866 if (!entry) { 866 867 /* entry was invalidated */ 867 868 spin_unlock(&tree->lock); 869 + zpool_unmap_handle(pool, handle); 868 870 return 0; 869 871 } 870 872 spin_unlock(&tree->lock); ··· 886 886 case ZSWAP_SWAPCACHE_NEW: /* page is locked */ 887 887 /* decompress */ 888 888 dlen = PAGE_SIZE; 889 - src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, 890 - ZPOOL_MM_RO) + sizeof(struct zswap_header); 889 + src = (u8 *)zhdr + sizeof(struct zswap_header); 891 890 dst = kmap_atomic(page); 892 891 tfm = *get_cpu_ptr(entry->pool->tfm); 893 892 ret = crypto_comp_decompress(tfm, src, entry->length, 894 893 dst, &dlen); 895 894 put_cpu_ptr(entry->pool->tfm); 896 895 kunmap_atomic(dst); 897 - zpool_unmap_handle(entry->pool->zpool, entry->handle); 898 896 BUG_ON(ret); 899 897 BUG_ON(dlen != PAGE_SIZE); 900 898 ··· 938 940 spin_unlock(&tree->lock); 939 941 940 942 end: 943 + zpool_unmap_handle(pool, handle); 941 944 return ret; 942 945 } 943 946 ··· 996 997 char *buf; 997 998 u8 *src, *dst; 998 999 struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; 1000 + gfp_t gfp; 999 1001 1000 1002 /* THP isn't supported */ 1001 1003 if (PageTransHuge(page)) { ··· 1070 1070 1071 1071 /* store */ 1072 1072 hlen = zpool_evictable(entry->pool->zpool) ? 
sizeof(zhdr) : 0; 1073 - ret = zpool_malloc(entry->pool->zpool, hlen + dlen, 1074 - __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, 1075 - &handle); 1073 + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; 1074 + if (zpool_malloc_support_movable(entry->pool->zpool)) 1075 + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; 1076 + ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle); 1076 1077 if (ret == -ENOSPC) { 1077 1078 zswap_reject_compress_poor++; 1078 1079 goto put_dstmem;
+1 -8
net/xdp/xdp_umem.c
··· 206 206 207 207 static void xdp_umem_unpin_pages(struct xdp_umem *umem) 208 208 { 209 - unsigned int i; 210 - 211 - for (i = 0; i < umem->npgs; i++) { 212 - struct page *page = umem->pgs[i]; 213 - 214 - set_page_dirty_lock(page); 215 - put_page(page); 216 - } 209 + put_user_pages_dirty_lock(umem->pgs, umem->npgs, true); 217 210 218 211 kfree(umem->pgs); 219 212 umem->pgs = NULL;
+1 -1
net/xdp/xsk.c
··· 977 977 /* Matches the smp_wmb() in xsk_init_queue */ 978 978 smp_rmb(); 979 979 qpg = virt_to_head_page(q->ring); 980 - if (size > (PAGE_SIZE << compound_order(qpg))) 980 + if (size > page_size(qpg)) 981 981 return -EINVAL; 982 982 983 983 pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
+3
usr/Makefile
··· 11 11 datafile_d_y = .$(datafile_y).d 12 12 AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/$(datafile_y)" 13 13 14 + # clean rules do not have CONFIG_INITRAMFS_COMPRESSION. So clean up after all 15 + # possible compression formats. 16 + clean-files += initramfs_data.cpio* 14 17 15 18 # Generate builtin.o based on initramfs_data.o 16 19 obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o