Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'akpm' (patches from Andrew)

Merge misc updates from Andrew Morton:

- a few misc things

- ocfs2 updates

- most of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (159 commits)
tools/testing/selftests/proc/proc-self-syscall.c: remove duplicate include
proc: more robust bulk read test
proc: test /proc/*/maps, smaps, smaps_rollup, statm
proc: use seq_puts() everywhere
proc: read kernel cpu stat pointer once
proc: remove unused argument in proc_pid_lookup()
fs/proc/thread_self.c: code cleanup for proc_setup_thread_self()
fs/proc/self.c: code cleanup for proc_setup_self()
proc: return exit code 4 for skipped tests
mm,mremap: bail out earlier in mremap_to under map pressure
mm/sparse: fix a bad comparison
mm/memory.c: do_fault: avoid usage of stale vm_area_struct
writeback: fix inode cgroup switching comment
mm/huge_memory.c: fix "orig_pud" set but not used
mm/hotplug: fix an imbalance with DEBUG_PAGEALLOC
mm/memcontrol.c: fix bad line in comment
mm/cma.c: cma_declare_contiguous: correct err handling
mm/page_ext.c: fix an imbalance with kmemleak
mm/compaction: pass pgdat to too_many_isolated() instead of zone
mm: remove zone_lru_lock() function, access ->lru_lock directly
...

+4924 -2321
+16
Documentation/admin-guide/cgroup-v2.rst
··· 1189 1189 Amount of cached filesystem data that was modified and 1190 1190 is currently being written back to disk 1191 1191 1192 + anon_thp 1193 + Amount of memory used in anonymous mappings backed by 1194 + transparent hugepages 1195 + 1192 1196 inactive_anon, active_anon, inactive_file, active_file, unevictable 1193 1197 Amount of memory, swap-backed and filesystem-backed, 1194 1198 on the internal memory management lists used by the ··· 1251 1247 pglazyfreed 1252 1248 1253 1249 Amount of reclaimed lazyfree pages 1250 + 1251 + thp_fault_alloc 1252 + 1253 + Number of transparent hugepages which were allocated to satisfy 1254 + a page fault, including COW faults. This counter is not present 1255 + when CONFIG_TRANSPARENT_HUGEPAGE is not set. 1256 + 1257 + thp_collapse_alloc 1258 + 1259 + Number of transparent hugepages which were allocated to allow 1260 + collapsing an existing range of pages. This counter is not 1261 + present when CONFIG_TRANSPARENT_HUGEPAGE is not set. 1254 1262 1255 1263 memory.swap.current 1256 1264 A read-only single value file which exists on non-root
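The new memory.stat entries above use cgroup v2's flat keyed format, one "key value" pair per line, so they can be read like any existing entry. A minimal userspace sketch, assuming a v2 cgroup mounted at the hypothetical path /sys/fs/cgroup/mygroup:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            /* hypothetical cgroup path; adjust to the cgroup of interest */
            FILE *f = fopen("/sys/fs/cgroup/mygroup/memory.stat", "r");
            char key[64];
            unsigned long long val;

            if (!f)
                    return 1;
            while (fscanf(f, "%63s %llu", key, &val) == 2) {
                    if (!strcmp(key, "anon_thp") ||
                        !strcmp(key, "thp_fault_alloc") ||
                        !strcmp(key, "thp_collapse_alloc"))
                            printf("%s = %llu\n", key, val);
            }
            fclose(f);
            return 0;
    }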
+6 -3
Documentation/admin-guide/mm/pagemap.rst
··· 75 75 20. NOPAGE 76 76 21. KSM 77 77 22. THP 78 - 23. BALLOON 78 + 23. OFFLINE 79 79 24. ZERO_PAGE 80 80 25. IDLE 81 + 26. PGTABLE 81 82 82 83 * ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the 83 84 memory cgroup each page is charged to, indexed by PFN. Only available when ··· 119 118 identical memory pages dynamically shared between one or more processes 120 119 22 - THP 121 120 contiguous pages which construct transparent hugepages 122 - 23 - BALLOON 123 - balloon compaction page 121 + 23 - OFFLINE 122 + page is logically offline 124 123 24 - ZERO_PAGE 125 124 zero page for pfn_zero or huge_zero page 126 125 25 - IDLE ··· 129 128 Note that this flag may be stale in case the page was accessed via 130 129 a PTE. To make sure the flag is up-to-date one has to read 131 130 ``/sys/kernel/mm/page_idle/bitmap`` first. 131 + 26 - PGTABLE 132 + page is in use as a page table 132 133 133 134 IO related page flags 134 135 ---------------------
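Bits 23 (now OFFLINE, formerly BALLOON) and 26 (PGTABLE, new in this series) are exposed through /proc/kpageflags, which holds one 64-bit flags word per page frame. A minimal sketch of checking them for a given PFN (requires root; the PFN below is an arbitrary example):

    #include <stdio.h>
    #include <stdint.h>

    #define KPF_OFFLINE 23  /* was BALLOON before this series */
    #define KPF_PGTABLE 26  /* added by this series */

    int main(void)
    {
            unsigned long pfn = 0x1000;     /* arbitrary example PFN */
            uint64_t flags;
            FILE *f = fopen("/proc/kpageflags", "rb");

            if (!f)
                    return 1;
            if (fseek(f, pfn * sizeof(flags), SEEK_SET) == 0 &&
                fread(&flags, sizeof(flags), 1, f) == 1)
                    printf("pfn 0x%lx: offline=%d pgtable=%d\n", pfn,
                           !!(flags & (1ULL << KPF_OFFLINE)),
                           !!(flags & (1ULL << KPF_PGTABLE)));
            fclose(f);
            return 0;
    }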
+2 -2
Documentation/cgroup-v1/memcg_test.txt
··· 107 107 108 108 8. LRU 109 109 Each memcg has its own private LRU. Now, its handling is under global 110 - VM's control (means that it's handled under global zone_lru_lock). 110 + VM's control (means that it's handled under global pgdat->lru_lock). 111 111 Almost all routines around memcg's LRU is called by global LRU's 112 - list management functions under zone_lru_lock(). 112 + list management functions under pgdat->lru_lock. 113 113 114 114 A special function is mem_cgroup_isolate_pages(). This scans 115 115 memcg's private LRU and call __isolate_lru_page() to extract a page
+2 -2
Documentation/cgroup-v1/memory.txt
··· 267 267 Other lock order is following: 268 268 PG_locked. 269 269 mm->page_table_lock 270 - zone_lru_lock 270 + pgdat->lru_lock 271 271 lock_page_cgroup. 272 272 In many cases, just lock_page_cgroup() is called. 273 273 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by 274 - zone_lru_lock, it has no lock of its own. 274 + pgdat->lru_lock, it has no lock of its own. 275 275 276 276 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) 277 277
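Both memcg documents are being updated for the "mm: remove zone_lru_lock() function, access ->lru_lock directly" patch in this series: the LRU lock lives only in the node's pglist_data now, so callers take it straight from the pgdat. A minimal sketch of the resulting idiom (illustrative, not a specific call site from the series):

    static void example_move_page(struct page *page)
    {
            struct pglist_data *pgdat = page_pgdat(page);

            /* formerly: spin_lock_irq(zone_lru_lock(page_zone(page))); */
            spin_lock_irq(&pgdat->lru_lock);
            /* ... move the page between the node's LRU lists ... */
            spin_unlock_irq(&pgdat->lru_lock);
    }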
+8
MAINTAINERS
··· 9835 9835 F: include/uapi/linux/membarrier.h 9836 9836 F: arch/powerpc/include/asm/membarrier.h 9837 9837 9838 + MEMBLOCK 9839 + M: Mike Rapoport <rppt@linux.ibm.com> 9840 + L: linux-mm@kvack.org 9841 + S: Maintained 9842 + F: include/linux/memblock.h 9843 + F: mm/memblock.c 9844 + F: Documentation/core-api/boot-time-mm.rst 9845 + 9838 9846 MEMORY MANAGEMENT 9839 9847 L: linux-mm@kvack.org 9840 9848 W: http://www.linux-mm.org
+2 -1
arch/alpha/include/asm/topology.h
··· 4 4 5 5 #include <linux/smp.h> 6 6 #include <linux/threads.h> 7 + #include <linux/numa.h> 7 8 #include <asm/machvec.h> 8 9 9 10 #ifdef CONFIG_NUMA ··· 30 29 { 31 30 int cpu; 32 31 33 - if (node == -1) 32 + if (node == NUMA_NO_NODE) 34 33 return cpu_all_mask; 35 34 36 35 cpumask_clear(&node_to_cpumask_map[node]);
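This alpha hunk is the first of many identical conversions in this merge that replace a literal -1 node id with NUMA_NO_NODE from <linux/numa.h>; the constant is just the named form of the same value. A tiny sketch of the idiom the series standardizes on (the function name and fallback are illustrative):

    #include <linux/device.h>
    #include <linux/numa.h>         /* NUMA_NO_NODE */

    static int example_pick_node(struct device *dev)
    {
            int nid = dev_to_node(dev);

            if (nid == NUMA_NO_NODE)
                    nid = numa_mem_id();    /* fall back to the local node */
            return nid;
    }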
+4
arch/arm64/Kconfig
··· 1467 1467 def_bool y 1468 1468 depends on COMPAT && SYSVIPC 1469 1469 1470 + config ARCH_ENABLE_HUGEPAGE_MIGRATION 1471 + def_bool y 1472 + depends on HUGETLB_PAGE && MIGRATION 1473 + 1470 1474 menu "Power management options" 1471 1475 1472 1476 source "kernel/power/Kconfig"
+5
arch/arm64/include/asm/hugetlb.h
··· 20 20 21 21 #include <asm/page.h> 22 22 23 + #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION 24 + #define arch_hugetlb_migration_supported arch_hugetlb_migration_supported 25 + extern bool arch_hugetlb_migration_supported(struct hstate *h); 26 + #endif 27 + 23 28 #define __HAVE_ARCH_HUGE_PTEP_GET 24 29 static inline pte_t huge_ptep_get(pte_t *ptep) 25 30 {
-4
arch/arm64/include/asm/memory.h
··· 80 80 */ 81 81 #ifdef CONFIG_KASAN 82 82 #define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - KASAN_SHADOW_SCALE_SHIFT)) 83 - #ifdef CONFIG_KASAN_EXTRA 84 - #define KASAN_THREAD_SHIFT 2 85 - #else 86 83 #define KASAN_THREAD_SHIFT 1 87 - #endif /* CONFIG_KASAN_EXTRA */ 88 84 #else 89 85 #define KASAN_SHADOW_SIZE (0) 90 86 #define KASAN_THREAD_SHIFT 0
+1 -2
arch/arm64/kernel/machine_kexec.c
··· 321 321 * but does not hold any data of loaded kernel image. 322 322 * 323 323 * Note that all the pages in crash dump kernel memory have been initially 324 - * marked as Reserved in kexec_reserve_crashkres_pages(). 324 + * marked as Reserved as memory was allocated via memblock_reserve(). 325 325 * 326 326 * In hibernation, the pages which are Reserved and yet "nosave" are excluded 327 327 * from the hibernation iamge. crash_is_nosave() does thich check for crash ··· 361 361 362 362 for (addr = begin; addr < end; addr += PAGE_SIZE) { 363 363 page = phys_to_page(addr); 364 - ClearPageReserved(page); 365 364 free_reserved_page(page); 366 365 } 367 366 }
+20
arch/arm64/mm/hugetlbpage.c
··· 27 27 #include <asm/tlbflush.h> 28 28 #include <asm/pgalloc.h> 29 29 30 + #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION 31 + bool arch_hugetlb_migration_supported(struct hstate *h) 32 + { 33 + size_t pagesize = huge_page_size(h); 34 + 35 + switch (pagesize) { 36 + #ifdef CONFIG_ARM64_4K_PAGES 37 + case PUD_SIZE: 38 + #endif 39 + case PMD_SIZE: 40 + case CONT_PMD_SIZE: 41 + case CONT_PTE_SIZE: 42 + return true; 43 + } 44 + pr_warn("%s: unrecognized huge page size 0x%lx\n", 45 + __func__, pagesize); 46 + return false; 47 + } 48 + #endif 49 + 30 50 int pmd_huge(pmd_t pmd) 31 51 { 32 52 return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
-27
arch/arm64/mm/init.c
··· 118 118 crashk_res.start = crash_base; 119 119 crashk_res.end = crash_base + crash_size - 1; 120 120 } 121 - 122 - static void __init kexec_reserve_crashkres_pages(void) 123 - { 124 - #ifdef CONFIG_HIBERNATION 125 - phys_addr_t addr; 126 - struct page *page; 127 - 128 - if (!crashk_res.end) 129 - return; 130 - 131 - /* 132 - * To reduce the size of hibernation image, all the pages are 133 - * marked as Reserved initially. 134 - */ 135 - for (addr = crashk_res.start; addr < (crashk_res.end + 1); 136 - addr += PAGE_SIZE) { 137 - page = phys_to_page(addr); 138 - SetPageReserved(page); 139 - } 140 - #endif 141 - } 142 121 #else 143 122 static void __init reserve_crashkernel(void) 144 - { 145 - } 146 - 147 - static void __init kexec_reserve_crashkres_pages(void) 148 123 { 149 124 } 150 125 #endif /* CONFIG_KEXEC_CORE */ ··· 560 585 #endif 561 586 /* this will put all unused low memory onto the freelists */ 562 587 memblock_free_all(); 563 - 564 - kexec_reserve_crashkres_pages(); 565 588 566 589 mem_init_print_info(NULL); 567 590
+1 -1
arch/arm64/mm/numa.c
··· 120 120 } 121 121 122 122 /* cpumask_of_node() will now work */ 123 - pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 123 + pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); 124 124 } 125 125 126 126 /*
+1 -1
arch/ia64/kernel/numa.c
··· 74 74 cpumask_clear(&node_to_cpu_mask[node]); 75 75 76 76 for_each_possible_early_cpu(cpu) { 77 - node = -1; 77 + node = NUMA_NO_NODE; 78 78 for (i = 0; i < NR_CPUS; ++i) 79 79 if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) { 80 80 node = node_cpuid[i].nid;
+4 -55
arch/ia64/kernel/perfmon.c
··· 583 583 if (task != current) put_task_struct(task); 584 584 } 585 585 586 - static inline void 587 - pfm_reserve_page(unsigned long a) 588 - { 589 - SetPageReserved(vmalloc_to_page((void *)a)); 590 - } 591 - static inline void 592 - pfm_unreserve_page(unsigned long a) 593 - { 594 - ClearPageReserved(vmalloc_to_page((void*)a)); 595 - } 596 - 597 586 static inline unsigned long 598 587 pfm_protect_ctx_ctxsw(pfm_context_t *x) 599 588 { ··· 803 814 { 804 815 ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0; 805 816 DPRINT(("ctx=%p msgq reset\n", ctx)); 806 - } 807 - 808 - static void * 809 - pfm_rvmalloc(unsigned long size) 810 - { 811 - void *mem; 812 - unsigned long addr; 813 - 814 - size = PAGE_ALIGN(size); 815 - mem = vzalloc(size); 816 - if (mem) { 817 - //printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem); 818 - addr = (unsigned long)mem; 819 - while (size > 0) { 820 - pfm_reserve_page(addr); 821 - addr+=PAGE_SIZE; 822 - size-=PAGE_SIZE; 823 - } 824 - } 825 - return mem; 826 - } 827 - 828 - static void 829 - pfm_rvfree(void *mem, unsigned long size) 830 - { 831 - unsigned long addr; 832 - 833 - if (mem) { 834 - DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size)); 835 - addr = (unsigned long) mem; 836 - while ((long) size > 0) { 837 - pfm_unreserve_page(addr); 838 - addr+=PAGE_SIZE; 839 - size-=PAGE_SIZE; 840 - } 841 - vfree(mem); 842 - } 843 - return; 844 817 } 845 818 846 819 static pfm_context_t * ··· 1449 1498 /* 1450 1499 * free the buffer 1451 1500 */ 1452 - pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size); 1501 + vfree(ctx->ctx_smpl_hdr); 1453 1502 1454 1503 ctx->ctx_smpl_hdr = NULL; 1455 1504 ctx->ctx_smpl_size = 0UL; ··· 2088 2137 * All memory free operations (especially for vmalloc'ed memory) 2089 2138 * MUST be done with interrupts ENABLED. 2090 2139 */ 2091 - if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size); 2140 + vfree(smpl_buf_addr); 2092 2141 2093 2142 /* 2094 2143 * return the memory used by the context ··· 2217 2266 2218 2267 /* 2219 2268 * We do the easy to undo allocations first. 2220 - * 2221 - * pfm_rvmalloc(), clears the buffer, so there is no leak 2222 2269 */ 2223 - smpl_buf = pfm_rvmalloc(size); 2270 + smpl_buf = vzalloc(size); 2224 2271 if (smpl_buf == NULL) { 2225 2272 DPRINT(("Can't allocate sampling buffer\n")); 2226 2273 return -ENOMEM; ··· 2295 2346 error: 2296 2347 vm_area_free(vma); 2297 2348 error_kmem: 2298 - pfm_rvfree(smpl_buf, size); 2349 + vfree(smpl_buf); 2299 2350 2300 2351 return -ENOMEM; 2301 2352 }
+3 -3
arch/ia64/mm/discontig.c
··· 227 227 * CPUs are put into groups according to node. Walk cpu_map 228 228 * and create new groups at node boundaries. 229 229 */ 230 - prev_node = -1; 230 + prev_node = NUMA_NO_NODE; 231 231 ai->nr_groups = 0; 232 232 for (unit = 0; unit < nr_units; unit++) { 233 233 cpu = cpu_map[unit]; ··· 435 435 { 436 436 void *ptr = NULL; 437 437 u8 best = 0xff; 438 - int bestnode = -1, node, anynode = 0; 438 + int bestnode = NUMA_NO_NODE, node, anynode = 0; 439 439 440 440 for_each_online_node(node) { 441 441 if (node_isset(node, memory_less_mask)) ··· 447 447 anynode = node; 448 448 } 449 449 450 - if (bestnode == -1) 450 + if (bestnode == NUMA_NO_NODE) 451 451 bestnode = anynode; 452 452 453 453 ptr = memblock_alloc_try_nid(pernodesize, PERCPU_PAGE_SIZE,
+1 -1
arch/m68k/mm/memory.c
··· 51 51 pr_debug("init_pointer_table: %lx, %x\n", ptable, PD_MARKBITS(dp)); 52 52 53 53 /* unreserve the page so it's possible to free that page */ 54 - PD_PAGE(dp)->flags &= ~(1 << PG_reserved); 54 + __ClearPageReserved(PD_PAGE(dp)); 55 55 init_page_count(PD_PAGE(dp)); 56 56 57 57 return;
+12
arch/powerpc/include/asm/book3s/64/hugetlb.h
··· 13 13 unsigned long len, unsigned long pgoff, 14 14 unsigned long flags); 15 15 16 + extern void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, 17 + unsigned long addr, pte_t *ptep, 18 + pte_t old_pte, pte_t pte); 19 + 16 20 static inline int hstate_get_psize(struct hstate *hstate) 17 21 { 18 22 unsigned long shift; ··· 46 42 /* hugepd entry valid bit */ 47 43 #define HUGEPD_VAL_BITS (0x8000000000000000UL) 48 44 45 + #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start 46 + extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, 47 + unsigned long addr, pte_t *ptep); 48 + 49 + #define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit 50 + extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, 51 + unsigned long addr, pte_t *ptep, 52 + pte_t old_pte, pte_t new_pte); 49 53 #endif
+18
arch/powerpc/include/asm/book3s/64/pgtable.h
··· 1306 1306 BUILD_BUG(); 1307 1307 return 0; 1308 1308 } 1309 + #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION 1310 + pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); 1311 + void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, 1312 + pte_t *, pte_t, pte_t); 1313 + 1314 + /* 1315 + * Returns true for a R -> RW upgrade of pte 1316 + */ 1317 + static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_val) 1318 + { 1319 + if (!(old_val & _PAGE_READ)) 1320 + return false; 1321 + 1322 + if ((!(old_val & _PAGE_WRITE)) && (new_val & _PAGE_WRITE)) 1323 + return true; 1324 + 1325 + return false; 1326 + } 1309 1327 1310 1328 #endif /* __ASSEMBLY__ */ 1311 1329 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
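powerpc, s390 and the x86 paravirt/Xen code in this merge all move ptep_modify_prot_start()/ptep_modify_prot_commit() from taking an mm_struct to taking the vm_area_struct, with the commit side also receiving the old pte so arch code (see is_pte_rw_upgrade() above and the radix commit hooks below) can decide whether a TLB flush is needed for a R->RW upgrade. A minimal sketch of how a generic protection-change path is expected to drive the pair (illustrative, not quoted from the series):

    static void example_change_prot(struct vm_area_struct *vma, unsigned long addr,
                                    pte_t *ptep, pgprot_t newprot)
    {
            pte_t oldpte, newpte;

            oldpte = ptep_modify_prot_start(vma, addr, ptep);
            newpte = pte_modify(oldpte, newprot);
            /* the arch hook may flush the TLB before installing a R->RW upgrade */
            ptep_modify_prot_commit(vma, addr, ptep, oldpte, newpte);
    }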
+4
arch/powerpc/include/asm/book3s/64/radix.h
··· 127 127 pte_t entry, unsigned long address, 128 128 int psize); 129 129 130 + extern void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, 131 + unsigned long addr, pte_t *ptep, 132 + pte_t old_pte, pte_t pte); 133 + 130 134 static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr, 131 135 unsigned long set) 132 136 {
+2 -1
arch/powerpc/include/asm/pci-bridge.h
··· 10 10 #include <linux/pci.h> 11 11 #include <linux/list.h> 12 12 #include <linux/ioport.h> 13 + #include <linux/numa.h> 13 14 14 15 struct device_node; 15 16 ··· 266 265 #ifdef CONFIG_NUMA 267 266 #define PHB_SET_NODE(PHB, NODE) ((PHB)->node = (NODE)) 268 267 #else 269 - #define PHB_SET_NODE(PHB, NODE) ((PHB)->node = -1) 268 + #define PHB_SET_NODE(PHB, NODE) ((PHB)->node = NUMA_NO_NODE) 270 269 #endif 271 270 272 271 #endif /* CONFIG_PPC64 */
+2 -1
arch/powerpc/kernel/paca.c
··· 11 11 #include <linux/export.h> 12 12 #include <linux/memblock.h> 13 13 #include <linux/sched/task.h> 14 + #include <linux/numa.h> 14 15 15 16 #include <asm/lppaca.h> 16 17 #include <asm/paca.h> ··· 37 36 * which will put its paca in the right place. 38 37 */ 39 38 if (cpu == boot_cpuid) { 40 - nid = -1; 39 + nid = NUMA_NO_NODE; 41 40 memblock_set_bottom_up(true); 42 41 } else { 43 42 nid = early_cpu_to_node(cpu);
+2 -1
arch/powerpc/kernel/pci-common.c
··· 32 32 #include <linux/vmalloc.h> 33 33 #include <linux/slab.h> 34 34 #include <linux/vgaarb.h> 35 + #include <linux/numa.h> 35 36 36 37 #include <asm/processor.h> 37 38 #include <asm/io.h> ··· 133 132 int nid = of_node_to_nid(dev); 134 133 135 134 if (nid < 0 || !node_online(nid)) 136 - nid = -1; 135 + nid = NUMA_NO_NODE; 137 136 138 137 PHB_SET_NODE(phb, nid); 139 138 }
-2
arch/powerpc/kernel/vdso.c
··· 798 798 BUG_ON(vdso32_pagelist == NULL); 799 799 for (i = 0; i < vdso32_pages; i++) { 800 800 struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); 801 - ClearPageReserved(pg); 802 801 get_page(pg); 803 802 vdso32_pagelist[i] = pg; 804 803 } ··· 811 812 BUG_ON(vdso64_pagelist == NULL); 812 813 for (i = 0; i < vdso64_pages; i++) { 813 814 struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); 814 - ClearPageReserved(pg); 815 815 get_page(pg); 816 816 vdso64_pagelist[i] = pg; 817 817 }
+25
arch/powerpc/mm/hugetlbpage-hash64.c
··· 121 121 *ptep = __pte(new_pte & ~H_PAGE_BUSY); 122 122 return 0; 123 123 } 124 + 125 + pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, 126 + unsigned long addr, pte_t *ptep) 127 + { 128 + unsigned long pte_val; 129 + /* 130 + * Clear the _PAGE_PRESENT so that no hardware parallel update is 131 + * possible. Also keep the pte_present true so that we don't take 132 + * wrong fault. 133 + */ 134 + pte_val = pte_update(vma->vm_mm, addr, ptep, 135 + _PAGE_PRESENT, _PAGE_INVALID, 1); 136 + 137 + return __pte(pte_val); 138 + } 139 + 140 + void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, 141 + pte_t *ptep, pte_t old_pte, pte_t pte) 142 + { 143 + 144 + if (radix_enabled()) 145 + return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, 146 + old_pte, pte); 147 + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); 148 + }
+17
arch/powerpc/mm/hugetlbpage-radix.c
··· 90 90 91 91 return vm_unmapped_area(&info); 92 92 } 93 + 94 + void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, 95 + unsigned long addr, pte_t *ptep, 96 + pte_t old_pte, pte_t pte) 97 + { 98 + struct mm_struct *mm = vma->vm_mm; 99 + 100 + /* 101 + * To avoid NMMU hang while relaxing access we need to flush the tlb before 102 + * we set the new value. 103 + */ 104 + if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && 105 + (atomic_read(&mm->context.copros) > 0)) 106 + radix__flush_hugetlb_page(vma, addr); 107 + 108 + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); 109 + }
+42 -101
arch/powerpc/mm/mmu_context_iommu.c
··· 21 21 #include <linux/sizes.h> 22 22 #include <asm/mmu_context.h> 23 23 #include <asm/pte-walk.h> 24 + #include <linux/mm_inline.h> 24 25 25 26 static DEFINE_MUTEX(mem_list_mutex); 26 27 ··· 35 34 atomic64_t mapped; 36 35 unsigned int pageshift; 37 36 u64 ua; /* userspace address */ 38 - u64 entries; /* number of entries in hpas[] */ 39 - u64 *hpas; /* vmalloc'ed */ 37 + u64 entries; /* number of entries in hpas/hpages[] */ 38 + /* 39 + * in mm_iommu_get we temporarily use this to store 40 + * struct page address. 41 + * 42 + * We need to convert ua to hpa in real mode. Make it 43 + * simpler by storing physical address. 44 + */ 45 + union { 46 + struct page **hpages; /* vmalloc'ed */ 47 + phys_addr_t *hpas; 48 + }; 40 49 #define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1) 41 50 u64 dev_hpa; /* Device memory base address */ 42 51 }; ··· 91 80 } 92 81 EXPORT_SYMBOL_GPL(mm_iommu_preregistered); 93 82 94 - /* 95 - * Taken from alloc_migrate_target with changes to remove CMA allocations 96 - */ 97 - struct page *new_iommu_non_cma_page(struct page *page, unsigned long private) 98 - { 99 - gfp_t gfp_mask = GFP_USER; 100 - struct page *new_page; 101 - 102 - if (PageCompound(page)) 103 - return NULL; 104 - 105 - if (PageHighMem(page)) 106 - gfp_mask |= __GFP_HIGHMEM; 107 - 108 - /* 109 - * We don't want the allocation to force an OOM if possibe 110 - */ 111 - new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN); 112 - return new_page; 113 - } 114 - 115 - static int mm_iommu_move_page_from_cma(struct page *page) 116 - { 117 - int ret = 0; 118 - LIST_HEAD(cma_migrate_pages); 119 - 120 - /* Ignore huge pages for now */ 121 - if (PageCompound(page)) 122 - return -EBUSY; 123 - 124 - lru_add_drain(); 125 - ret = isolate_lru_page(page); 126 - if (ret) 127 - return ret; 128 - 129 - list_add(&page->lru, &cma_migrate_pages); 130 - put_page(page); /* Drop the gup reference */ 131 - 132 - ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page, 133 - NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE); 134 - if (ret) { 135 - if (!list_empty(&cma_migrate_pages)) 136 - putback_movable_pages(&cma_migrate_pages); 137 - } 138 - 139 - return 0; 140 - } 141 - 142 83 static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, 143 - unsigned long entries, unsigned long dev_hpa, 144 - struct mm_iommu_table_group_mem_t **pmem) 84 + unsigned long entries, unsigned long dev_hpa, 85 + struct mm_iommu_table_group_mem_t **pmem) 145 86 { 146 87 struct mm_iommu_table_group_mem_t *mem; 147 - long i, j, ret = 0, locked_entries = 0; 88 + long i, ret, locked_entries = 0; 148 89 unsigned int pageshift; 149 - unsigned long flags; 150 - unsigned long cur_ua; 151 - struct page *page = NULL; 152 90 153 91 mutex_lock(&mem_list_mutex); 154 92 ··· 147 187 goto unlock_exit; 148 188 } 149 189 190 + down_read(&mm->mmap_sem); 191 + ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); 192 + up_read(&mm->mmap_sem); 193 + if (ret != entries) { 194 + /* free the reference taken */ 195 + for (i = 0; i < ret; i++) 196 + put_page(mem->hpages[i]); 197 + 198 + vfree(mem->hpas); 199 + kfree(mem); 200 + ret = -EFAULT; 201 + goto unlock_exit; 202 + } 203 + 204 + pageshift = PAGE_SHIFT; 150 205 for (i = 0; i < entries; ++i) { 151 - cur_ua = ua + (i << PAGE_SHIFT); 152 - if (1 != get_user_pages_fast(cur_ua, 153 - 1/* pages */, 1/* iswrite */, &page)) { 154 - ret = -EFAULT; 155 - for (j = 0; j < i; ++j) 156 - put_page(pfn_to_page(mem->hpas[j] >> 157 - PAGE_SHIFT)); 158 - vfree(mem->hpas); 159 - kfree(mem); 160 - goto unlock_exit; 161 - } 206 + struct page *page = mem->hpages[i]; 207 + 162 208 /* 163 - * If we get a page from the CMA zone, since we are going to 164 - * be pinning these entries, we might as well move them out 165 - * of the CMA zone if possible. NOTE: faulting in + migration 166 - * can be expensive. Batching can be considered later 209 + * Allow to use larger than 64k IOMMU pages. Only do that 210 + * if we are backed by hugetlb. 167 211 */ 168 - if (is_migrate_cma_page(page)) { 169 - if (mm_iommu_move_page_from_cma(page)) 170 - goto populate; 171 - if (1 != get_user_pages_fast(cur_ua, 172 - 1/* pages */, 1/* iswrite */, 173 - &page)) { 174 - ret = -EFAULT; 175 - for (j = 0; j < i; ++j) 176 - put_page(pfn_to_page(mem->hpas[j] >> 177 - PAGE_SHIFT)); 178 - vfree(mem->hpas); 179 - kfree(mem); 180 - goto unlock_exit; 181 - } 182 - } 183 - populate: 184 - pageshift = PAGE_SHIFT; 185 - if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) { 186 - pte_t *pte; 212 + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { 187 213 struct page *head = compound_head(page); 188 - unsigned int compshift = compound_order(head); 189 - unsigned int pteshift; 190 214 191 - local_irq_save(flags); /* disables as well */ 192 - pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift); 193 - 194 - /* Double check it is still the same pinned page */ 195 - if (pte && pte_page(*pte) == head && 196 - pteshift == compshift + PAGE_SHIFT) 197 - pageshift = max_t(unsigned int, pteshift, 198 - PAGE_SHIFT); 199 - local_irq_restore(flags); 215 + pageshift = compound_order(head) + PAGE_SHIFT; 200 216 } 201 217 mem->pageshift = min(mem->pageshift, pageshift); 218 + /* 219 + * We don't need struct page reference any more, switch 220 + * to physical address. 221 + */ 202 222 mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; 203 223 } 204 224 205 225 good_exit: 226 + ret = 0; 206 227 atomic64_set(&mem->mapped, 1); 207 228 mem->used = 1; 208 229 mem->ua = ua;
+8 -8
arch/powerpc/mm/numa.c
··· 84 84 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); 85 85 86 86 /* cpumask_of_node() will now work */ 87 - dbg("Node to cpumask map for %d nodes\n", nr_node_ids); 87 + dbg("Node to cpumask map for %u nodes\n", nr_node_ids); 88 88 } 89 89 90 90 static int __init fake_numa_create_new_node(unsigned long end_pfn, ··· 215 215 */ 216 216 static int associativity_to_nid(const __be32 *associativity) 217 217 { 218 - int nid = -1; 218 + int nid = NUMA_NO_NODE; 219 219 220 220 if (min_common_depth == -1) 221 221 goto out; ··· 225 225 226 226 /* POWER4 LPAR uses 0xffff as invalid node */ 227 227 if (nid == 0xffff || nid >= MAX_NUMNODES) 228 - nid = -1; 228 + nid = NUMA_NO_NODE; 229 229 230 230 if (nid > 0 && 231 231 of_read_number(associativity, 1) >= distance_ref_points_depth) { ··· 244 244 */ 245 245 static int of_node_to_nid_single(struct device_node *device) 246 246 { 247 - int nid = -1; 247 + int nid = NUMA_NO_NODE; 248 248 const __be32 *tmp; 249 249 250 250 tmp = of_get_associativity(device); ··· 256 256 /* Walk the device tree upwards, looking for an associativity id */ 257 257 int of_node_to_nid(struct device_node *device) 258 258 { 259 - int nid = -1; 259 + int nid = NUMA_NO_NODE; 260 260 261 261 of_node_get(device); 262 262 while (device) { ··· 454 454 */ 455 455 static int numa_setup_cpu(unsigned long lcpu) 456 456 { 457 - int nid = -1; 457 + int nid = NUMA_NO_NODE; 458 458 struct device_node *cpu; 459 459 460 460 /* ··· 930 930 { 931 931 struct drmem_lmb *lmb; 932 932 unsigned long lmb_size; 933 - int nid = -1; 933 + int nid = NUMA_NO_NODE; 934 934 935 935 lmb_size = drmem_lmb_size(); 936 936 ··· 960 960 static int hot_add_node_scn_to_nid(unsigned long scn_addr) 961 961 { 962 962 struct device_node *memory; 963 - int nid = -1; 963 + int nid = NUMA_NO_NODE; 964 964 965 965 for_each_node_by_type(memory, "memory") { 966 966 unsigned long start, size;
+25
arch/powerpc/mm/pgtable-book3s64.c
··· 401 401 } 402 402 #endif /* CONFIG_PROC_FS */ 403 403 404 + pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, 405 + pte_t *ptep) 406 + { 407 + unsigned long pte_val; 408 + 409 + /* 410 + * Clear the _PAGE_PRESENT so that no hardware parallel update is 411 + * possible. Also keep the pte_present true so that we don't take 412 + * wrong fault. 413 + */ 414 + pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0); 415 + 416 + return __pte(pte_val); 417 + 418 + } 419 + 420 + void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, 421 + pte_t *ptep, pte_t old_pte, pte_t pte) 422 + { 423 + if (radix_enabled()) 424 + return radix__ptep_modify_prot_commit(vma, addr, 425 + ptep, old_pte, pte); 426 + set_pte_at(vma->vm_mm, addr, ptep, pte); 427 + } 428 + 404 429 /* 405 430 * For hash translation mode, we use the deposited table to store hash slot 406 431 * information and they are stored at PTRS_PER_PMD offset from related pmd
+18
arch/powerpc/mm/pgtable-radix.c
··· 1063 1063 } 1064 1064 /* See ptesync comment in radix__set_pte_at */ 1065 1065 } 1066 + 1067 + void radix__ptep_modify_prot_commit(struct vm_area_struct *vma, 1068 + unsigned long addr, pte_t *ptep, 1069 + pte_t old_pte, pte_t pte) 1070 + { 1071 + struct mm_struct *mm = vma->vm_mm; 1072 + 1073 + /* 1074 + * To avoid NMMU hang while relaxing access we need to flush the tlb before 1075 + * we set the new value. We need to do this only for radix, because hash 1076 + * translation does flush when updating the linux pte. 1077 + */ 1078 + if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) && 1079 + (atomic_read(&mm->context.copros) > 0)) 1080 + radix__flush_tlb_page(vma, addr); 1081 + 1082 + set_pte_at(mm, addr, ptep, pte); 1083 + }
+3 -2
arch/powerpc/platforms/powernv/memtrace.c
··· 20 20 #include <linux/slab.h> 21 21 #include <linux/memory.h> 22 22 #include <linux/memory_hotplug.h> 23 + #include <linux/numa.h> 23 24 #include <asm/machdep.h> 24 25 #include <asm/debugfs.h> 25 26 ··· 224 223 ent = &memtrace_array[i]; 225 224 226 225 /* We have onlined this chunk previously */ 227 - if (ent->nid == -1) 226 + if (ent->nid == NUMA_NO_NODE) 228 227 continue; 229 228 230 229 /* Remove from io mappings */ ··· 258 257 */ 259 258 debugfs_remove_recursive(ent->dir); 260 259 pr_info("Added trace memory back to node %d\n", ent->nid); 261 - ent->size = ent->start = ent->nid = -1; 260 + ent->size = ent->start = ent->nid = NUMA_NO_NODE; 262 261 } 263 262 if (ret) 264 263 return ret;
-1
arch/riscv/kernel/vdso.c
··· 54 54 struct page *pg; 55 55 56 56 pg = virt_to_page(vdso_start + (i << PAGE_SHIFT)); 57 - ClearPageReserved(pg); 58 57 vdso_pagelist[i] = pg; 59 58 } 60 59 vdso_pagelist[i] = virt_to_page(vdso_data);
+3 -2
arch/s390/include/asm/pgtable.h
··· 1069 1069 } 1070 1070 1071 1071 #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION 1072 - pte_t ptep_modify_prot_start(struct mm_struct *, unsigned long, pte_t *); 1073 - void ptep_modify_prot_commit(struct mm_struct *, unsigned long, pte_t *, pte_t); 1072 + pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); 1073 + void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, 1074 + pte_t *, pte_t, pte_t); 1074 1075 1075 1076 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH 1076 1077 static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
-2
arch/s390/kernel/vdso.c
··· 291 291 BUG_ON(vdso32_pagelist == NULL); 292 292 for (i = 0; i < vdso32_pages - 1; i++) { 293 293 struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); 294 - ClearPageReserved(pg); 295 294 get_page(pg); 296 295 vdso32_pagelist[i] = pg; 297 296 } ··· 308 309 BUG_ON(vdso64_pagelist == NULL); 309 310 for (i = 0; i < vdso64_pages - 1; i++) { 310 311 struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); 311 - ClearPageReserved(pg); 312 312 get_page(pg); 313 313 vdso64_pagelist[i] = pg; 314 314 }
+5 -3
arch/s390/mm/pgtable.c
··· 301 301 } 302 302 EXPORT_SYMBOL(ptep_xchg_lazy); 303 303 304 - pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, 304 + pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, 305 305 pte_t *ptep) 306 306 { 307 307 pgste_t pgste; 308 308 pte_t old; 309 309 int nodat; 310 + struct mm_struct *mm = vma->vm_mm; 310 311 311 312 preempt_disable(); 312 313 pgste = ptep_xchg_start(mm, addr, ptep); ··· 320 319 return old; 321 320 } 322 321 323 - void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 324 - pte_t *ptep, pte_t pte) 322 + void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, 323 + pte_t *ptep, pte_t old_pte, pte_t pte) 325 324 { 326 325 pgste_t pgste; 326 + struct mm_struct *mm = vma->vm_mm; 327 327 328 328 if (!MACHINE_HAS_NX) 329 329 pte_val(pte) &= ~_PAGE_NOEXEC;
+2 -2
arch/sh/kernel/syscalls/syscalltbl.sh
··· 13 13 t_entry="$3" 14 14 15 15 while [ $t_nxt -lt $t_nr ]; do 16 - printf "__SYSCALL(%s, sys_ni_syscall, )\n" "${t_nxt}" 16 + printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}" 17 17 t_nxt=$((t_nxt+1)) 18 18 done 19 - printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}" 19 + printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}" 20 20 } 21 21 22 22 grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
+1 -1
arch/sh/kernel/syscalls_32.S
··· 10 10 #include <linux/sys.h> 11 11 #include <linux/linkage.h> 12 12 13 - #define __SYSCALL(nr, entry, nargs) .long entry 13 + #define __SYSCALL(nr, entry) .long entry 14 14 .data 15 15 ENTRY(sys_call_table) 16 16 #include <asm/syscall_table.h>
+2 -1
arch/sparc/kernel/pci_fire.c
··· 11 11 #include <linux/export.h> 12 12 #include <linux/irq.h> 13 13 #include <linux/of_device.h> 14 + #include <linux/numa.h> 14 15 15 16 #include <asm/prom.h> 16 17 #include <asm/irq.h> ··· 417 416 struct device_node *dp = op->dev.of_node; 418 417 int err; 419 418 420 - pbm->numa_node = -1; 419 + pbm->numa_node = NUMA_NO_NODE; 421 420 422 421 pbm->pci_ops = &sun4u_pci_ops; 423 422 pbm->config_space_reg_bits = 12;
+2 -1
arch/sparc/kernel/pci_schizo.c
··· 12 12 #include <linux/export.h> 13 13 #include <linux/interrupt.h> 14 14 #include <linux/of_device.h> 15 + #include <linux/numa.h> 15 16 16 17 #include <asm/iommu.h> 17 18 #include <asm/irq.h> ··· 1348 1347 pbm->next = pci_pbm_root; 1349 1348 pci_pbm_root = pbm; 1350 1349 1351 - pbm->numa_node = -1; 1350 + pbm->numa_node = NUMA_NO_NODE; 1352 1351 1353 1352 pbm->pci_ops = &sun4u_pci_ops; 1354 1353 pbm->config_space_reg_bits = 8;
+2 -1
arch/sparc/kernel/psycho_common.c
··· 5 5 */ 6 6 #include <linux/kernel.h> 7 7 #include <linux/interrupt.h> 8 + #include <linux/numa.h> 8 9 9 10 #include <asm/upa.h> 10 11 ··· 455 454 struct device_node *dp = op->dev.of_node; 456 455 457 456 pbm->name = dp->full_name; 458 - pbm->numa_node = -1; 457 + pbm->numa_node = NUMA_NO_NODE; 459 458 pbm->chip_type = chip_type; 460 459 pbm->chip_version = of_getintprop_default(dp, "version#", 0); 461 460 pbm->chip_revision = of_getintprop_default(dp, "module-revision#", 0);
+2 -1
arch/sparc/kernel/sbus.c
··· 15 15 #include <linux/interrupt.h> 16 16 #include <linux/of.h> 17 17 #include <linux/of_device.h> 18 + #include <linux/numa.h> 18 19 19 20 #include <asm/page.h> 20 21 #include <asm/io.h> ··· 562 561 563 562 op->dev.archdata.iommu = iommu; 564 563 op->dev.archdata.stc = strbuf; 565 - op->dev.archdata.numa_node = -1; 564 + op->dev.archdata.numa_node = NUMA_NO_NODE; 566 565 567 566 reg_base = regs + SYSIO_IOMMUREG_BASE; 568 567 iommu->iommu_control = reg_base + IOMMU_CONTROL;
+3 -3
arch/sparc/mm/init_64.c
··· 976 976 { 977 977 int prev_nid, new_nid; 978 978 979 - prev_nid = -1; 979 + prev_nid = NUMA_NO_NODE; 980 980 for ( ; start < end; start += PAGE_SIZE) { 981 981 for (new_nid = 0; new_nid < num_node_masks; new_nid++) { 982 982 struct node_mem_mask *p = &node_masks[new_nid]; 983 983 984 984 if ((start & p->mask) == p->match) { 985 - if (prev_nid == -1) 985 + if (prev_nid == NUMA_NO_NODE) 986 986 prev_nid = new_nid; 987 987 break; 988 988 } ··· 1208 1208 md = mdesc_grab(); 1209 1209 1210 1210 count = 0; 1211 - nid = -1; 1211 + nid = NUMA_NO_NODE; 1212 1212 mdesc_for_each_node_by_name(md, grp, "group") { 1213 1213 if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) { 1214 1214 nid = count;
+7 -6
arch/x86/include/asm/paravirt.h
··· 422 422 } 423 423 424 424 #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION 425 - static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, 425 + static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, 426 426 pte_t *ptep) 427 427 { 428 428 pteval_t ret; 429 429 430 - ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, mm, addr, ptep); 430 + ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep); 431 431 432 432 return (pte_t) { .pte = ret }; 433 433 } 434 434 435 - static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 436 - pte_t *ptep, pte_t pte) 435 + static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, 436 + pte_t *ptep, pte_t old_pte, pte_t pte) 437 437 { 438 + 438 439 if (sizeof(pteval_t) > sizeof(long)) 439 440 /* 5 arg words */ 440 - pv_ops.mmu.ptep_modify_prot_commit(mm, addr, ptep, pte); 441 + pv_ops.mmu.ptep_modify_prot_commit(vma, addr, ptep, pte); 441 442 else 442 443 PVOP_VCALL4(mmu.ptep_modify_prot_commit, 443 - mm, addr, ptep, pte.pte); 444 + vma, addr, ptep, pte.pte); 444 445 } 445 446 446 447 static inline void set_pte(pte_t *ptep, pte_t pte)
+3 -2
arch/x86/include/asm/paravirt_types.h
··· 55 55 struct cpumask; 56 56 struct flush_tlb_info; 57 57 struct mmu_gather; 58 + struct vm_area_struct; 58 59 59 60 /* 60 61 * Wrapper type for pointers to code which uses the non-standard ··· 255 254 pte_t *ptep, pte_t pteval); 256 255 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); 257 256 258 - pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, 257 + pte_t (*ptep_modify_prot_start)(struct vm_area_struct *vma, unsigned long addr, 259 258 pte_t *ptep); 260 - void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, 259 + void (*ptep_modify_prot_commit)(struct vm_area_struct *vma, unsigned long addr, 261 260 pte_t *ptep, pte_t pte); 262 261 263 262 struct paravirt_callee_save pte_val;
+2 -1
arch/x86/include/asm/pci.h
··· 7 7 #include <linux/slab.h> 8 8 #include <linux/string.h> 9 9 #include <linux/scatterlist.h> 10 + #include <linux/numa.h> 10 11 #include <asm/io.h> 11 12 #include <asm/pat.h> 12 13 #include <asm/x86_init.h> ··· 142 141 int node; 143 142 144 143 node = __pcibus_to_node(bus); 145 - return (node == -1) ? cpu_online_mask : 144 + return (node == NUMA_NO_NODE) ? cpu_online_mask : 146 145 cpumask_of_node(node); 147 146 } 148 147 #endif
+12 -12
arch/x86/include/asm/uaccess.h
··· 75 75 #endif 76 76 77 77 /** 78 - * access_ok: - Checks if a user space pointer is valid 78 + * access_ok - Checks if a user space pointer is valid 79 79 * @addr: User space pointer to start of block to check 80 80 * @size: Size of block to check 81 81 * ··· 84 84 * 85 85 * Checks if a pointer to a block of memory in user space is valid. 86 86 * 87 - * Returns true (nonzero) if the memory block may be valid, false (zero) 88 - * if it is definitely invalid. 89 - * 90 87 * Note that, depending on architecture, this function probably just 91 88 * checks that the pointer is in the user space range - after calling 92 89 * this function, memory access functions may still return -EFAULT. 90 + * 91 + * Return: true (nonzero) if the memory block may be valid, false (zero) 92 + * if it is definitely invalid. 93 93 */ 94 94 #define access_ok(addr, size) \ 95 95 ({ \ ··· 134 134 __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) 135 135 136 136 /** 137 - * get_user: - Get a simple variable from user space. 137 + * get_user - Get a simple variable from user space. 138 138 * @x: Variable to store result. 139 139 * @ptr: Source address, in user space. 140 140 * ··· 148 148 * @ptr must have pointer-to-simple-variable type, and the result of 149 149 * dereferencing @ptr must be assignable to @x without a cast. 150 150 * 151 - * Returns zero on success, or -EFAULT on error. 151 + * Return: zero on success, or -EFAULT on error. 152 152 * On error, the variable @x is set to zero. 153 153 */ 154 154 /* ··· 226 226 extern void __put_user_8(void); 227 227 228 228 /** 229 - * put_user: - Write a simple value into user space. 229 + * put_user - Write a simple value into user space. 230 230 * @x: Value to copy to user space. 231 231 * @ptr: Destination address, in user space. 232 232 * ··· 240 240 * @ptr must have pointer-to-simple-variable type, and @x must be assignable 241 241 * to the result of dereferencing @ptr. 242 242 * 243 - * Returns zero on success, or -EFAULT on error. 243 + * Return: zero on success, or -EFAULT on error. 244 244 */ 245 245 #define put_user(x, ptr) \ 246 246 ({ \ ··· 502 502 } while (0) 503 503 504 504 /** 505 - * __get_user: - Get a simple variable from user space, with less checking. 505 + * __get_user - Get a simple variable from user space, with less checking. 506 506 * @x: Variable to store result. 507 507 * @ptr: Source address, in user space. 508 508 * ··· 519 519 * Caller must check the pointer with access_ok() before calling this 520 520 * function. 521 521 * 522 - * Returns zero on success, or -EFAULT on error. 522 + * Return: zero on success, or -EFAULT on error. 523 523 * On error, the variable @x is set to zero. 524 524 */ 525 525 ··· 527 527 __get_user_nocheck((x), (ptr), sizeof(*(ptr))) 528 528 529 529 /** 530 - * __put_user: - Write a simple value into user space, with less checking. 530 + * __put_user - Write a simple value into user space, with less checking. 531 531 * @x: Value to copy to user space. 532 532 * @ptr: Destination address, in user space. 533 533 * ··· 544 544 * Caller must check the pointer with access_ok() before calling this 545 545 * function. 546 546 * 547 - * Returns zero on success, or -EFAULT on error. 547 + * Return: zero on success, or -EFAULT on error. 548 548 */ 549 549 550 550 #define __put_user(x, ptr) \
+4 -3
arch/x86/kernel/apic/x2apic_uv_x.c
··· 27 27 #include <linux/crash_dump.h> 28 28 #include <linux/reboot.h> 29 29 #include <linux/memory.h> 30 + #include <linux/numa.h> 30 31 31 32 #include <asm/uv/uv_mmrs.h> 32 33 #include <asm/uv/uv_hub.h> ··· 1391 1390 } 1392 1391 1393 1392 /* Set socket -> node values: */ 1394 - lnid = -1; 1393 + lnid = NUMA_NO_NODE; 1395 1394 for_each_present_cpu(cpu) { 1396 1395 int nid = cpu_to_node(cpu); 1397 1396 int apicid, sockid; ··· 1522 1521 new_hub->pnode = 0xffff; 1523 1522 1524 1523 new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); 1525 - new_hub->memory_nid = -1; 1524 + new_hub->memory_nid = NUMA_NO_NODE; 1526 1525 new_hub->nr_possible_cpus = 0; 1527 1526 new_hub->nr_online_cpus = 0; 1528 1527 } ··· 1539 1538 1540 1539 uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); 1541 1540 uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; 1542 - if (uv_cpu_hub_info(cpu)->memory_nid == -1) 1541 + if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE) 1543 1542 uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); 1544 1543 1545 1544 /* Init memoryless node: */
+1 -1
arch/x86/kernel/setup_percpu.c
··· 171 171 unsigned long delta; 172 172 int rc; 173 173 174 - pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n", 174 + pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%u\n", 175 175 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 176 176 177 177 /*
+2 -1
arch/x86/kernel/smpboot.c
··· 56 56 #include <linux/stackprotector.h> 57 57 #include <linux/gfp.h> 58 58 #include <linux/cpuidle.h> 59 + #include <linux/numa.h> 59 60 60 61 #include <asm/acpi.h> 61 62 #include <asm/desc.h> ··· 842 841 /* reduce the number of lines printed when booting a large cpu count system */ 843 842 static void announce_cpu(int cpu, int apicid) 844 843 { 845 - static int current_node = -1; 844 + static int current_node = NUMA_NO_NODE; 846 845 int node = early_cpu_to_node(cpu); 847 846 static int width, node_width; 848 847
+4 -4
arch/x86/lib/usercopy_32.c
··· 54 54 } while (0) 55 55 56 56 /** 57 - * clear_user: - Zero a block of memory in user space. 57 + * clear_user - Zero a block of memory in user space. 58 58 * @to: Destination address, in user space. 59 59 * @n: Number of bytes to zero. 60 60 * 61 61 * Zero a block of memory in user space. 62 62 * 63 - * Returns number of bytes that could not be cleared. 63 + * Return: number of bytes that could not be cleared. 64 64 * On success, this will be zero. 65 65 */ 66 66 unsigned long ··· 74 74 EXPORT_SYMBOL(clear_user); 75 75 76 76 /** 77 - * __clear_user: - Zero a block of memory in user space, with less checking. 77 + * __clear_user - Zero a block of memory in user space, with less checking. 78 78 * @to: Destination address, in user space. 79 79 * @n: Number of bytes to zero. 80 80 * 81 81 * Zero a block of memory in user space. Caller must check 82 82 * the specified block with access_ok() before calling this function. 83 83 * 84 - * Returns number of bytes that could not be cleared. 84 + * Return: number of bytes that could not be cleared. 85 85 * On success, this will be zero. 86 86 */ 87 87 unsigned long
+2 -2
arch/x86/mm/numa.c
··· 123 123 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); 124 124 125 125 /* cpumask_of_node() will now work */ 126 - pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 126 + pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); 127 127 } 128 128 129 129 static int __init numa_add_memblk_to(int nid, u64 start, u64 end, ··· 866 866 { 867 867 if (node >= nr_node_ids) { 868 868 printk(KERN_WARNING 869 - "cpumask_of_node(%d): node > nr_node_ids(%d)\n", 869 + "cpumask_of_node(%d): node > nr_node_ids(%u)\n", 870 870 node, nr_node_ids); 871 871 dump_stack(); 872 872 return cpu_none_mask;
+2 -2
arch/x86/xen/mmu.h
··· 17 17 18 18 void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 19 19 20 - pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 21 - void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 20 + pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); 21 + void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, 22 22 pte_t *ptep, pte_t pte); 23 23 24 24 unsigned long xen_read_cr2_direct(void);
+4 -4
arch/x86/xen/mmu_pv.c
··· 306 306 __xen_set_pte(ptep, pteval); 307 307 } 308 308 309 - pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 309 + pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, 310 310 unsigned long addr, pte_t *ptep) 311 311 { 312 312 /* Just return the pte as-is. We preserve the bits on commit */ 313 - trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); 313 + trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep); 314 314 return *ptep; 315 315 } 316 316 317 - void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 317 + void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, 318 318 pte_t *ptep, pte_t pte) 319 319 { 320 320 struct mmu_update u; 321 321 322 - trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); 322 + trace_xen_mmu_ptep_modify_prot_commit(vma->vm_mm, addr, ptep, pte); 323 323 xen_mc_batch(); 324 324 325 325 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
+3 -2
drivers/block/mtip32xx/mtip32xx.c
··· 40 40 #include <linux/export.h> 41 41 #include <linux/debugfs.h> 42 42 #include <linux/prefetch.h> 43 + #include <linux/numa.h> 43 44 #include "mtip32xx.h" 44 45 45 46 #define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) ··· 4019 4018 /* Helper for selecting a node in round robin mode */ 4020 4019 static inline int mtip_get_next_rr_node(void) 4021 4020 { 4022 - static int next_node = -1; 4021 + static int next_node = NUMA_NO_NODE; 4023 4022 4024 - if (next_node == -1) { 4023 + if (next_node == NUMA_NO_NODE) { 4025 4024 next_node = first_online_node; 4026 4025 return next_node; 4027 4026 }
-2
drivers/char/agp/efficeon-agp.c
··· 163 163 unsigned long page = efficeon_private.l1_table[index]; 164 164 if (page) { 165 165 efficeon_private.l1_table[index] = 0; 166 - ClearPageReserved(virt_to_page((char *)page)); 167 166 free_page(page); 168 167 freed++; 169 168 } ··· 218 219 efficeon_free_gatt_table(agp_bridge); 219 220 return -ENOMEM; 220 221 } 221 - SetPageReserved(virt_to_page((char *)page)); 222 222 223 223 for (offset = 0; offset < PAGE_SIZE; offset += clflush_chunk) 224 224 clflush((char *)page+offset);
+3 -1
drivers/dma/dmaengine.c
··· 63 63 #include <linux/acpi_dma.h> 64 64 #include <linux/of_dma.h> 65 65 #include <linux/mempool.h> 66 + #include <linux/numa.h> 66 67 67 68 static DEFINE_MUTEX(dma_list_mutex); 68 69 static DEFINE_IDA(dma_ida); ··· 387 386 static bool dma_chan_is_local(struct dma_chan *chan, int cpu) 388 387 { 389 388 int node = dev_to_node(chan->device->dev); 390 - return node == -1 || cpumask_test_cpu(cpu, cpumask_of_node(node)); 389 + return node == NUMA_NO_NODE || 390 + cpumask_test_cpu(cpu, cpumask_of_node(node)); 391 391 } 392 392 393 393 /**
-6
drivers/gpu/drm/i915/i915_utils.h
··· 123 123 124 124 #include <linux/list.h> 125 125 126 - static inline int list_is_first(const struct list_head *list, 127 - const struct list_head *head) 128 - { 129 - return head->next == list; 130 - } 131 - 132 126 static inline void __list_del_many(struct list_head *head, 133 127 struct list_head *first) 134 128 {
+16 -5
drivers/hv/hv_balloon.c
··· 681 681 /* Check if the particular page is backed and can be onlined and online it. */ 682 682 static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) 683 683 { 684 - if (!has_pfn_is_backed(has, page_to_pfn(pg))) 684 + if (!has_pfn_is_backed(has, page_to_pfn(pg))) { 685 + if (!PageOffline(pg)) 686 + __SetPageOffline(pg); 685 687 return; 688 + } 689 + if (PageOffline(pg)) 690 + __ClearPageOffline(pg); 686 691 687 692 /* This frame is currently backed; online the page. */ 688 693 __online_page_set_limits(pg); ··· 776 771 } 777 772 } 778 773 779 - static void hv_online_page(struct page *pg) 774 + static void hv_online_page(struct page *pg, unsigned int order) 780 775 { 781 776 struct hv_hotadd_state *has; 782 777 unsigned long flags; ··· 785 780 spin_lock_irqsave(&dm_device.ha_lock, flags); 786 781 list_for_each_entry(has, &dm_device.ha_region_list, list) { 787 782 /* The page belongs to a different HAS. */ 788 - if ((pfn < has->start_pfn) || (pfn >= has->end_pfn)) 783 + if ((pfn < has->start_pfn) || 784 + (pfn + (1UL << order) > has->end_pfn)) 789 785 continue; 790 786 791 - hv_page_online_one(has, pg); 787 + hv_bring_pgs_online(has, pfn, 1UL << order); 792 788 break; 793 789 } 794 790 spin_unlock_irqrestore(&dm_device.ha_lock, flags); ··· 1207 1201 1208 1202 for (i = 0; i < num_pages; i++) { 1209 1203 pg = pfn_to_page(i + start_frame); 1204 + __ClearPageOffline(pg); 1210 1205 __free_page(pg); 1211 1206 dm->num_pages_ballooned--; 1212 1207 } ··· 1220 1213 struct dm_balloon_response *bl_resp, 1221 1214 int alloc_unit) 1222 1215 { 1223 - unsigned int i = 0; 1216 + unsigned int i, j; 1224 1217 struct page *pg; 1225 1218 1226 1219 if (num_pages < alloc_unit) ··· 1251 1244 1252 1245 if (alloc_unit != 1) 1253 1246 split_page(pg, get_order(alloc_unit << PAGE_SHIFT)); 1247 + 1248 + /* mark all pages offline */ 1249 + for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++) 1250 + __SetPageOffline(pg + j); 1254 1251 1255 1252 bl_resp->range_count++; 1256 1253 bl_resp->range_array[i].finfo.start_page =
+2 -1
drivers/infiniband/hw/hfi1/affinity.c
··· 48 48 #include <linux/cpumask.h> 49 49 #include <linux/module.h> 50 50 #include <linux/interrupt.h> 51 + #include <linux/numa.h> 51 52 52 53 #include "hfi.h" 53 54 #include "affinity.h" ··· 778 777 _dev_comp_vect_cpu_mask_clean_up(dd, entry); 779 778 unlock: 780 779 mutex_unlock(&node_affinity.lock); 781 - dd->node = -1; 780 + dd->node = NUMA_NO_NODE; 782 781 } 783 782 784 783 /*
+2 -1
drivers/infiniband/hw/hfi1/init.c
··· 54 54 #include <linux/printk.h> 55 55 #include <linux/hrtimer.h> 56 56 #include <linux/bitmap.h> 57 + #include <linux/numa.h> 57 58 #include <rdma/rdma_vt.h> 58 59 59 60 #include "hfi.h" ··· 1304 1303 dd->unit = ret; 1305 1304 list_add(&dd->list, &hfi1_dev_list); 1306 1305 } 1307 - dd->node = -1; 1306 + dd->node = NUMA_NO_NODE; 1308 1307 1309 1308 spin_unlock_irqrestore(&hfi1_devs_lock, flags); 1310 1309 idr_preload_end();
+3 -2
drivers/iommu/dmar.c
··· 39 39 #include <linux/dmi.h> 40 40 #include <linux/slab.h> 41 41 #include <linux/iommu.h> 42 + #include <linux/numa.h> 42 43 #include <asm/irq_remapping.h> 43 44 #include <asm/iommu_table.h> 44 45 ··· 478 477 int node = acpi_map_pxm_to_node(rhsa->proximity_domain); 479 478 480 479 if (!node_online(node)) 481 - node = -1; 480 + node = NUMA_NO_NODE; 482 481 drhd->iommu->node = node; 483 482 return 0; 484 483 } ··· 1063 1062 iommu->msagaw = msagaw; 1064 1063 iommu->segment = drhd->segment; 1065 1064 1066 - iommu->node = -1; 1065 + iommu->node = NUMA_NO_NODE; 1067 1066 1068 1067 ver = readl(iommu->reg + DMAR_VER_REG); 1069 1068 pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
+2 -1
drivers/iommu/intel-iommu.c
··· 47 47 #include <linux/dma-contiguous.h> 48 48 #include <linux/dma-direct.h> 49 49 #include <linux/crash_dump.h> 50 + #include <linux/numa.h> 50 51 #include <asm/irq_remapping.h> 51 52 #include <asm/cacheflush.h> 52 53 #include <asm/iommu.h> ··· 1717 1716 return NULL; 1718 1717 1719 1718 memset(domain, 0, sizeof(*domain)); 1720 - domain->nid = -1; 1719 + domain->nid = NUMA_NO_NODE; 1721 1720 domain->flags = flags; 1722 1721 domain->has_iotlb_device = false; 1723 1722 INIT_LIST_HEAD(&domain->devices);
+2 -1
drivers/misc/sgi-xp/xpc_uv.c
··· 22 22 #include <linux/module.h> 23 23 #include <linux/err.h> 24 24 #include <linux/slab.h> 25 + #include <linux/numa.h> 25 26 #include <asm/uv/uv_hub.h> 26 27 #if defined CONFIG_X86_64 27 28 #include <asm/uv/bios.h> ··· 62 61 XPC_NOTIFY_MSG_SIZE_UV) 63 62 #define XPC_NOTIFY_IRQ_NAME "xpc_notify" 64 63 65 - static int xpc_mq_node = -1; 64 + static int xpc_mq_node = NUMA_NO_NODE; 66 65 67 66 static struct xpc_gru_mq_uv *xpc_activate_mq_uv; 68 67 static struct xpc_gru_mq_uv *xpc_notify_mq_uv;
+32
drivers/misc/vmw_balloon.c
··· 557 557 } 558 558 559 559 /** 560 + * vmballoon_mark_page_offline() - mark a page as offline 561 + * @page: pointer for the page. 562 + * @page_size: the size of the page. 563 + */ 564 + static void 565 + vmballoon_mark_page_offline(struct page *page, 566 + enum vmballoon_page_size_type page_size) 567 + { 568 + int i; 569 + 570 + for (i = 0; i < vmballoon_page_in_frames(page_size); i++) 571 + __SetPageOffline(page + i); 572 + } 573 + 574 + /** 575 + * vmballoon_mark_page_online() - mark a page as online 576 + * @page: pointer for the page. 577 + * @page_size: the size of the page. 578 + */ 579 + static void 580 + vmballoon_mark_page_online(struct page *page, 581 + enum vmballoon_page_size_type page_size) 582 + { 583 + int i; 584 + 585 + for (i = 0; i < vmballoon_page_in_frames(page_size); i++) 586 + __ClearPageOffline(page + i); 587 + } 588 + 589 + /** 560 590 * vmballoon_send_get_target() - Retrieve desired balloon size from the host. 561 591 * 562 592 * @b: pointer to the balloon. ··· 642 612 ctl->page_size); 643 613 644 614 if (page) { 615 + vmballoon_mark_page_offline(page, ctl->page_size); 645 616 /* Success. Add the page to the list and continue. */ 646 617 list_add(&page->lru, &ctl->pages); 647 618 continue; ··· 881 850 882 851 list_for_each_entry_safe(page, tmp, page_list, lru) { 883 852 list_del(&page->lru); 853 + vmballoon_mark_page_online(page, page_size); 884 854 __free_pages(page, vmballoon_page_order(page_size)); 885 855 } 886 856
+3 -2
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
··· 27 27 #include <linux/bpf.h> 28 28 #include <linux/bpf_trace.h> 29 29 #include <linux/atomic.h> 30 + #include <linux/numa.h> 30 31 #include <scsi/fc/fc_fcoe.h> 31 32 #include <net/udp_tunnel.h> 32 33 #include <net/pkt_cls.h> ··· 6419 6418 { 6420 6419 struct device *dev = tx_ring->dev; 6421 6420 int orig_node = dev_to_node(dev); 6422 - int ring_node = -1; 6421 + int ring_node = NUMA_NO_NODE; 6423 6422 int size; 6424 6423 6425 6424 size = sizeof(struct ixgbe_tx_buffer) * tx_ring->count; ··· 6513 6512 { 6514 6513 struct device *dev = rx_ring->dev; 6515 6514 int orig_node = dev_to_node(dev); 6516 - int ring_node = -1; 6515 + int ring_node = NUMA_NO_NODE; 6517 6516 int size; 6518 6517 6519 6518 size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
+13 -5
drivers/xen/balloon.c
··· 369 369 return BP_ECANCELED; 370 370 } 371 371 372 - static void xen_online_page(struct page *page) 372 + static void xen_online_page(struct page *page, unsigned int order) 373 373 { 374 - __online_page_set_limits(page); 374 + unsigned long i, size = (1 << order); 375 + unsigned long start_pfn = page_to_pfn(page); 376 + struct page *p; 375 377 378 + pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn); 376 379 mutex_lock(&balloon_mutex); 377 - 378 - __balloon_append(page); 379 - 380 + for (i = 0; i < size; i++) { 381 + p = pfn_to_page(start_pfn + i); 382 + __online_page_set_limits(p); 383 + __SetPageOffline(p); 384 + __balloon_append(p); 385 + } 380 386 mutex_unlock(&balloon_mutex); 381 387 } 382 388 ··· 447 441 xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]); 448 442 449 443 /* Relinquish the page back to the allocator. */ 444 + __ClearPageOffline(page); 450 445 free_reserved_page(page); 451 446 } 452 447 ··· 474 467 state = BP_EAGAIN; 475 468 break; 476 469 } 470 + __SetPageOffline(page); 477 471 adjust_managed_page_count(page, -1); 478 472 xenmem_reservation_scrub_page(page); 479 473 list_add(&page->lru, &pages);
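hv_balloon, vmw_balloon and the Xen balloon all pick up the same convention from this series: pages sitting in a balloon (or not backed by the host) carry the new PageOffline flag, which is cleared again before the page goes back to the page allocator. A condensed sketch of the shared pattern, with the driver-specific hypervisor calls left out and illustrative function names:

    /* inflate: the page leaves the guest's usable memory */
    static struct page *example_balloon_inflate_one(void)
    {
            struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NOWARN);

            if (page)
                    __SetPageOffline(page);
            /* hand the frame to the hypervisor, keep the page on a balloon list */
            return page;
    }

    /* deflate: the page is handed back to the page allocator */
    static void example_balloon_deflate_one(struct page *page)
    {
            __ClearPageOffline(page);
            __free_page(page);      /* the Xen driver uses free_reserved_page() */
    }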
+1
fs/file.c
··· 457 457 .full_fds_bits = init_files.full_fds_bits_init, 458 458 }, 459 459 .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), 460 + .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), 460 461 }; 461 462 462 463 static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
+1 -1
fs/hugetlbfs/inode.c
··· 530 530 inode_lock(inode); 531 531 532 532 /* protected by i_mutex */ 533 - if (info->seals & F_SEAL_WRITE) { 533 + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { 534 534 inode_unlock(inode); 535 535 return -EPERM; 536 536 }
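The hugetlbfs check now also refuses size changes once F_SEAL_FUTURE_WRITE is set, the new memfd seal added elsewhere in this series. A minimal userspace sketch of applying the seal (the fallback #define mirrors the uapi value added by the series):

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <fcntl.h>
    #include <unistd.h>

    #ifndef F_SEAL_FUTURE_WRITE
    #define F_SEAL_FUTURE_WRITE 0x0010
    #endif

    static int make_sealed_memfd(size_t size)
    {
            int fd = memfd_create("example", MFD_CLOEXEC | MFD_ALLOW_SEALING);

            if (fd < 0)
                    return -1;
            /* existing writable mappings keep working; new writes are refused */
            if (ftruncate(fd, size) < 0 ||
                fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0) {
                    close(fd);
                    return -1;
            }
            return fd;
    }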
+1 -7
fs/inode.c
··· 2093 2093 void inode_set_flags(struct inode *inode, unsigned int flags, 2094 2094 unsigned int mask) 2095 2095 { 2096 - unsigned int old_flags, new_flags; 2097 - 2098 2096 WARN_ON_ONCE(flags & ~mask); 2099 - do { 2100 - old_flags = READ_ONCE(inode->i_flags); 2101 - new_flags = (old_flags & ~mask) | flags; 2102 - } while (unlikely(cmpxchg(&inode->i_flags, old_flags, 2103 - new_flags) != old_flags)); 2097 + set_mask_bits(&inode->i_flags, mask, flags); 2104 2098 } 2105 2099 EXPORT_SYMBOL(inode_set_flags); 2106 2100
+20 -11
fs/kernfs/file.c
··· 832 832 * to see if it supports poll (Neither 'poll' nor 'select' return 833 833 * an appropriate error code). When in doubt, set a suitable timeout value. 834 834 */ 835 + __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) 836 + { 837 + struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry); 838 + struct kernfs_open_node *on = kn->attr.open; 839 + 840 + poll_wait(of->file, &on->poll, wait); 841 + 842 + if (of->event != atomic_read(&on->event)) 843 + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; 844 + 845 + return DEFAULT_POLLMASK; 846 + } 847 + 835 848 static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) 836 849 { 837 850 struct kernfs_open_file *of = kernfs_of(filp); 838 851 struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); 839 - struct kernfs_open_node *on = kn->attr.open; 852 + __poll_t ret; 840 853 841 854 if (!kernfs_get_active(kn)) 842 - goto trigger; 855 + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; 843 856 844 - poll_wait(filp, &on->poll, wait); 857 + if (kn->attr.ops->poll) 858 + ret = kn->attr.ops->poll(of, wait); 859 + else 860 + ret = kernfs_generic_poll(of, wait); 845 861 846 862 kernfs_put_active(kn); 847 - 848 - if (of->event != atomic_read(&on->event)) 849 - goto trigger; 850 - 851 - return DEFAULT_POLLMASK; 852 - 853 - trigger: 854 - return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; 863 + return ret; 855 864 } 856 865 857 866 static void kernfs_notify_workfn(struct work_struct *work)
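Editor's note: kernfs now exports kernfs_generic_poll() so a kernfs_ops implementation can provide its own ->poll() while still reusing the open-node event counter. A minimal hedged sketch of such a callback follows; my_attr_poll(), my_attr_seq_show() and my_device_has_data() are illustrative names, not part of this series.

        #include <linux/kernfs.h>
        #include <linux/poll.h>

        static __poll_t my_attr_poll(struct kernfs_open_file *of,
                                     struct poll_table_struct *pt)
        {
                /* Reuse the generic wait on the open node and its event check. */
                __poll_t ret = kernfs_generic_poll(of, pt);

                if (my_device_has_data(of->kn->priv))   /* assumed driver helper */
                        ret |= EPOLLIN | EPOLLRDNORM;

                return ret;
        }

        static const struct kernfs_ops my_attr_ops = {
                .seq_show = my_attr_seq_show,           /* assumed elsewhere */
                .poll     = my_attr_poll,
        };

Attributes that do not set ->poll keep the old behaviour, since kernfs_fop_poll() falls back to kernfs_generic_poll().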
+96 -63
fs/ocfs2/alloc.c
··· 7532 7532 return count; 7533 7533 } 7534 7534 7535 - int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) 7535 + static 7536 + int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range) 7536 7537 { 7537 7538 struct ocfs2_super *osb = OCFS2_SB(sb); 7538 - u64 start, len, trimmed, first_group, last_group, group; 7539 + u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0; 7539 7540 int ret, cnt; 7540 7541 u32 first_bit, last_bit, minlen; 7541 7542 struct buffer_head *main_bm_bh = NULL; ··· 7544 7543 struct buffer_head *gd_bh = NULL; 7545 7544 struct ocfs2_dinode *main_bm; 7546 7545 struct ocfs2_group_desc *gd = NULL; 7547 - struct ocfs2_trim_fs_info info, *pinfo = NULL; 7548 7546 7549 7547 start = range->start >> osb->s_clustersize_bits; 7550 7548 len = range->len >> osb->s_clustersize_bits; ··· 7552 7552 if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) 7553 7553 return -EINVAL; 7554 7554 7555 + trace_ocfs2_trim_mainbm(start, len, minlen); 7556 + 7557 + next_group: 7555 7558 main_bm_inode = ocfs2_get_system_file_inode(osb, 7556 7559 GLOBAL_BITMAP_SYSTEM_INODE, 7557 7560 OCFS2_INVALID_SLOT); ··· 7573 7570 } 7574 7571 main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; 7575 7572 7576 - if (start >= le32_to_cpu(main_bm->i_clusters)) { 7577 - ret = -EINVAL; 7578 - goto out_unlock; 7579 - } 7580 - 7581 - len = range->len >> osb->s_clustersize_bits; 7582 - if (start + len > le32_to_cpu(main_bm->i_clusters)) 7583 - len = le32_to_cpu(main_bm->i_clusters) - start; 7584 - 7585 - trace_ocfs2_trim_fs(start, len, minlen); 7586 - 7587 - ocfs2_trim_fs_lock_res_init(osb); 7588 - ret = ocfs2_trim_fs_lock(osb, NULL, 1); 7589 - if (ret < 0) { 7590 - if (ret != -EAGAIN) { 7591 - mlog_errno(ret); 7592 - ocfs2_trim_fs_lock_res_uninit(osb); 7573 + /* 7574 + * Do some check before trim the first group. 
7575 + */ 7576 + if (!group) { 7577 + if (start >= le32_to_cpu(main_bm->i_clusters)) { 7578 + ret = -EINVAL; 7593 7579 goto out_unlock; 7594 7580 } 7595 7581 7596 - mlog(ML_NOTICE, "Wait for trim on device (%s) to " 7597 - "finish, which is running from another node.\n", 7598 - osb->dev_str); 7599 - ret = ocfs2_trim_fs_lock(osb, &info, 0); 7600 - if (ret < 0) { 7601 - mlog_errno(ret); 7602 - ocfs2_trim_fs_lock_res_uninit(osb); 7603 - goto out_unlock; 7604 - } 7582 + if (start + len > le32_to_cpu(main_bm->i_clusters)) 7583 + len = le32_to_cpu(main_bm->i_clusters) - start; 7605 7584 7606 - if (info.tf_valid && info.tf_success && 7607 - info.tf_start == start && info.tf_len == len && 7608 - info.tf_minlen == minlen) { 7609 - /* Avoid sending duplicated trim to a shared device */ 7610 - mlog(ML_NOTICE, "The same trim on device (%s) was " 7611 - "just done from node (%u), return.\n", 7612 - osb->dev_str, info.tf_nodenum); 7613 - range->len = info.tf_trimlen; 7614 - goto out_trimunlock; 7615 - } 7585 + /* 7586 + * Determine first and last group to examine based on 7587 + * start and len 7588 + */ 7589 + first_group = ocfs2_which_cluster_group(main_bm_inode, start); 7590 + if (first_group == osb->first_cluster_group_blkno) 7591 + first_bit = start; 7592 + else 7593 + first_bit = start - ocfs2_blocks_to_clusters(sb, 7594 + first_group); 7595 + last_group = ocfs2_which_cluster_group(main_bm_inode, 7596 + start + len - 1); 7597 + group = first_group; 7616 7598 } 7617 7599 7618 - info.tf_nodenum = osb->node_num; 7619 - info.tf_start = start; 7620 - info.tf_len = len; 7621 - info.tf_minlen = minlen; 7622 - 7623 - /* Determine first and last group to examine based on start and len */ 7624 - first_group = ocfs2_which_cluster_group(main_bm_inode, start); 7625 - if (first_group == osb->first_cluster_group_blkno) 7626 - first_bit = start; 7627 - else 7628 - first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); 7629 - last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); 7630 - last_bit = osb->bitmap_cpg; 7631 - 7632 - trimmed = 0; 7633 - for (group = first_group; group <= last_group;) { 7600 + do { 7634 7601 if (first_bit + len >= osb->bitmap_cpg) 7635 7602 last_bit = osb->bitmap_cpg; 7636 7603 else ··· 7632 7659 group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); 7633 7660 else 7634 7661 group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); 7635 - } 7636 - range->len = trimmed * sb->s_blocksize; 7662 + } while (0); 7637 7663 7638 - info.tf_trimlen = range->len; 7639 - info.tf_success = (ret ? 
0 : 1); 7640 - pinfo = &info; 7641 - out_trimunlock: 7642 - ocfs2_trim_fs_unlock(osb, pinfo); 7643 - ocfs2_trim_fs_lock_res_uninit(osb); 7644 7664 out_unlock: 7645 7665 ocfs2_inode_unlock(main_bm_inode, 0); 7646 7666 brelse(main_bm_bh); 7667 + main_bm_bh = NULL; 7647 7668 out_mutex: 7648 7669 inode_unlock(main_bm_inode); 7649 7670 iput(main_bm_inode); 7671 + 7672 + /* 7673 + * If all the groups trim are not done or failed, but we should release 7674 + * main_bm related locks for avoiding the current IO starve, then go to 7675 + * trim the next group 7676 + */ 7677 + if (ret >= 0 && group <= last_group) 7678 + goto next_group; 7650 7679 out: 7680 + range->len = trimmed * sb->s_blocksize; 7681 + return ret; 7682 + } 7683 + 7684 + int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) 7685 + { 7686 + int ret; 7687 + struct ocfs2_super *osb = OCFS2_SB(sb); 7688 + struct ocfs2_trim_fs_info info, *pinfo = NULL; 7689 + 7690 + ocfs2_trim_fs_lock_res_init(osb); 7691 + 7692 + trace_ocfs2_trim_fs(range->start, range->len, range->minlen); 7693 + 7694 + ret = ocfs2_trim_fs_lock(osb, NULL, 1); 7695 + if (ret < 0) { 7696 + if (ret != -EAGAIN) { 7697 + mlog_errno(ret); 7698 + ocfs2_trim_fs_lock_res_uninit(osb); 7699 + return ret; 7700 + } 7701 + 7702 + mlog(ML_NOTICE, "Wait for trim on device (%s) to " 7703 + "finish, which is running from another node.\n", 7704 + osb->dev_str); 7705 + ret = ocfs2_trim_fs_lock(osb, &info, 0); 7706 + if (ret < 0) { 7707 + mlog_errno(ret); 7708 + ocfs2_trim_fs_lock_res_uninit(osb); 7709 + return ret; 7710 + } 7711 + 7712 + if (info.tf_valid && info.tf_success && 7713 + info.tf_start == range->start && 7714 + info.tf_len == range->len && 7715 + info.tf_minlen == range->minlen) { 7716 + /* Avoid sending duplicated trim to a shared device */ 7717 + mlog(ML_NOTICE, "The same trim on device (%s) was " 7718 + "just done from node (%u), return.\n", 7719 + osb->dev_str, info.tf_nodenum); 7720 + range->len = info.tf_trimlen; 7721 + goto out; 7722 + } 7723 + } 7724 + 7725 + info.tf_nodenum = osb->node_num; 7726 + info.tf_start = range->start; 7727 + info.tf_len = range->len; 7728 + info.tf_minlen = range->minlen; 7729 + 7730 + ret = ocfs2_trim_mainbm(sb, range); 7731 + 7732 + info.tf_trimlen = range->len; 7733 + info.tf_success = (ret < 0 ? 0 : 1); 7734 + pinfo = &info; 7735 + out: 7736 + ocfs2_trim_fs_unlock(osb, pinfo); 7737 + ocfs2_trim_fs_lock_res_uninit(osb); 7651 7738 return ret; 7652 7739 }
+8 -6
fs/ocfs2/cluster/nodemanager.c
··· 621 621 struct o2nm_node *node = to_o2nm_node(item); 622 622 struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); 623 623 624 - o2net_disconnect_node(node); 624 + if (cluster->cl_nodes[node->nd_num] == node) { 625 + o2net_disconnect_node(node); 625 626 626 - if (cluster->cl_has_local && 627 - (cluster->cl_local_node == node->nd_num)) { 628 - cluster->cl_has_local = 0; 629 - cluster->cl_local_node = O2NM_INVALID_NODE_NUM; 630 - o2net_stop_listening(node); 627 + if (cluster->cl_has_local && 628 + (cluster->cl_local_node == node->nd_num)) { 629 + cluster->cl_has_local = 0; 630 + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; 631 + o2net_stop_listening(node); 632 + } 631 633 } 632 634 633 635 /* XXX call into net to stop this node from trading messages */
+5
fs/ocfs2/dlmglue.c
··· 686 686 { 687 687 struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; 688 688 689 + /* Only one trimfs thread are allowed to work at the same time. */ 690 + mutex_lock(&osb->obs_trim_fs_mutex); 691 + 689 692 ocfs2_lock_res_init_once(lockres); 690 693 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); 691 694 ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, ··· 701 698 702 699 ocfs2_simple_drop_lockres(osb, lockres); 703 700 ocfs2_lock_res_free(lockres); 701 + 702 + mutex_unlock(&osb->obs_trim_fs_mutex); 704 703 } 705 704 706 705 static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
+1
fs/ocfs2/ocfs2.h
··· 407 407 struct ocfs2_lock_res osb_rename_lockres; 408 408 struct ocfs2_lock_res osb_nfs_sync_lockres; 409 409 struct ocfs2_lock_res osb_trim_fs_lockres; 410 + struct mutex obs_trim_fs_mutex; 410 411 struct ocfs2_dlm_debug *osb_dlm_debug; 411 412 412 413 struct dentry *osb_debug_root;
+2
fs/ocfs2/ocfs2_trace.h
··· 712 712 713 713 DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); 714 714 715 + DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm); 716 + 715 717 DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); 716 718 717 719 /* End of trace events for fs/ocfs2/alloc.c. */
+2 -6
fs/ocfs2/slot_map.c
··· 55 55 unsigned int si_blocks; 56 56 struct buffer_head **si_bh; 57 57 unsigned int si_num_slots; 58 - struct ocfs2_slot *si_slots; 58 + struct ocfs2_slot si_slots[]; 59 59 }; 60 60 61 61 ··· 420 420 struct inode *inode = NULL; 421 421 struct ocfs2_slot_info *si; 422 422 423 - si = kzalloc(sizeof(struct ocfs2_slot_info) + 424 - (sizeof(struct ocfs2_slot) * osb->max_slots), 425 - GFP_KERNEL); 423 + si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL); 426 424 if (!si) { 427 425 status = -ENOMEM; 428 426 mlog_errno(status); ··· 429 431 430 432 si->si_extended = ocfs2_uses_extended_slot_map(osb); 431 433 si->si_num_slots = osb->max_slots; 432 - si->si_slots = (struct ocfs2_slot *)((char *)si + 433 - sizeof(struct ocfs2_slot_info)); 434 434 435 435 inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, 436 436 OCFS2_INVALID_SLOT);
+2
fs/ocfs2/super.c
··· 1847 1847 if (ocfs2_is_hard_readonly(osb)) 1848 1848 goto leave; 1849 1849 1850 + mutex_init(&osb->obs_trim_fs_mutex); 1851 + 1850 1852 status = ocfs2_dlm_init(osb); 1851 1853 if (status < 0) { 1852 1854 mlog_errno(status);
+1 -2
fs/pipe.c
··· 140 140 struct page *page = buf->page; 141 141 142 142 if (page_count(page) == 1) { 143 - if (memcg_kmem_enabled()) 144 - memcg_kmem_uncharge(page, 0); 143 + memcg_kmem_uncharge(page, 0); 145 144 __SetPageLocked(page); 146 145 return 0; 147 146 }
+8 -8
fs/proc/array.c
··· 343 343 #ifdef CONFIG_SECCOMP 344 344 seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); 345 345 #endif 346 - seq_printf(m, "\nSpeculation_Store_Bypass:\t"); 346 + seq_puts(m, "\nSpeculation_Store_Bypass:\t"); 347 347 switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { 348 348 case -EINVAL: 349 - seq_printf(m, "unknown"); 349 + seq_puts(m, "unknown"); 350 350 break; 351 351 case PR_SPEC_NOT_AFFECTED: 352 - seq_printf(m, "not vulnerable"); 352 + seq_puts(m, "not vulnerable"); 353 353 break; 354 354 case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: 355 - seq_printf(m, "thread force mitigated"); 355 + seq_puts(m, "thread force mitigated"); 356 356 break; 357 357 case PR_SPEC_PRCTL | PR_SPEC_DISABLE: 358 - seq_printf(m, "thread mitigated"); 358 + seq_puts(m, "thread mitigated"); 359 359 break; 360 360 case PR_SPEC_PRCTL | PR_SPEC_ENABLE: 361 - seq_printf(m, "thread vulnerable"); 361 + seq_puts(m, "thread vulnerable"); 362 362 break; 363 363 case PR_SPEC_DISABLE: 364 - seq_printf(m, "globally mitigated"); 364 + seq_puts(m, "globally mitigated"); 365 365 break; 366 366 default: 367 - seq_printf(m, "vulnerable"); 367 + seq_puts(m, "vulnerable"); 368 368 break; 369 369 } 370 370 seq_putc(m, '\n');
+2 -2
fs/proc/base.c
··· 456 456 struct pid *pid, struct task_struct *task) 457 457 { 458 458 if (unlikely(!sched_info_on())) 459 - seq_printf(m, "0 0 0\n"); 459 + seq_puts(m, "0 0 0\n"); 460 460 else 461 461 seq_printf(m, "%llu %llu %lu\n", 462 462 (unsigned long long)task->se.sum_exec_runtime, ··· 3161 3161 return d_splice_alias(inode, dentry); 3162 3162 } 3163 3163 3164 - struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 3164 + struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) 3165 3165 { 3166 3166 struct task_struct *task; 3167 3167 unsigned tgid;
+1 -1
fs/proc/internal.h
··· 162 162 extern void pid_update_inode(struct task_struct *, struct inode *); 163 163 extern int pid_delete_dentry(const struct dentry *); 164 164 extern int proc_pid_readdir(struct file *, struct dir_context *); 165 - extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); 165 + struct dentry *proc_pid_lookup(struct dentry *, unsigned int); 166 166 extern loff_t mem_lseek(struct file *, loff_t, int); 167 167 168 168 /* Lookups */
+2 -2
fs/proc/page.c
··· 152 152 else if (page_count(page) == 0 && is_free_buddy_page(page)) 153 153 u |= 1 << KPF_BUDDY; 154 154 155 - if (PageBalloon(page)) 156 - u |= 1 << KPF_BALLOON; 155 + if (PageOffline(page)) 156 + u |= 1 << KPF_OFFLINE; 157 157 if (PageTable(page)) 158 158 u |= 1 << KPF_PGTABLE; 159 159
+1 -1
fs/proc/root.c
··· 154 154 155 155 static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) 156 156 { 157 - if (!proc_pid_lookup(dir, dentry, flags)) 157 + if (!proc_pid_lookup(dentry, flags)) 158 158 return NULL; 159 159 160 160 return proc_lookup(dir, dentry, flags);
+8 -8
fs/proc/self.c
··· 38 38 struct inode *root_inode = d_inode(s->s_root); 39 39 struct pid_namespace *ns = proc_pid_ns(root_inode); 40 40 struct dentry *self; 41 + int ret = -ENOMEM; 41 42 42 43 inode_lock(root_inode); 43 44 self = d_alloc_name(s->s_root, "self"); ··· 52 51 inode->i_gid = GLOBAL_ROOT_GID; 53 52 inode->i_op = &proc_self_inode_operations; 54 53 d_add(self, inode); 54 + ret = 0; 55 55 } else { 56 56 dput(self); 57 - self = ERR_PTR(-ENOMEM); 58 57 } 59 - } else { 60 - self = ERR_PTR(-ENOMEM); 61 58 } 62 59 inode_unlock(root_inode); 63 - if (IS_ERR(self)) { 60 + 61 + if (ret) 64 62 pr_err("proc_fill_super: can't allocate /proc/self\n"); 65 - return PTR_ERR(self); 66 - } 67 - ns->proc_self = self; 68 - return 0; 63 + else 64 + ns->proc_self = self; 65 + 66 + return ret; 69 67 } 70 68 71 69 void __init proc_self_init(void)
+32 -28
fs/proc/stat.c
··· 23 23 24 24 #ifdef arch_idle_time 25 25 26 - static u64 get_idle_time(int cpu) 26 + static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) 27 27 { 28 28 u64 idle; 29 29 30 - idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; 30 + idle = kcs->cpustat[CPUTIME_IDLE]; 31 31 if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) 32 32 idle += arch_idle_time(cpu); 33 33 return idle; 34 34 } 35 35 36 - static u64 get_iowait_time(int cpu) 36 + static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) 37 37 { 38 38 u64 iowait; 39 39 40 - iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; 40 + iowait = kcs->cpustat[CPUTIME_IOWAIT]; 41 41 if (cpu_online(cpu) && nr_iowait_cpu(cpu)) 42 42 iowait += arch_idle_time(cpu); 43 43 return iowait; ··· 45 45 46 46 #else 47 47 48 - static u64 get_idle_time(int cpu) 48 + static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) 49 49 { 50 50 u64 idle, idle_usecs = -1ULL; 51 51 ··· 54 54 55 55 if (idle_usecs == -1ULL) 56 56 /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ 57 - idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; 57 + idle = kcs->cpustat[CPUTIME_IDLE]; 58 58 else 59 59 idle = idle_usecs * NSEC_PER_USEC; 60 60 61 61 return idle; 62 62 } 63 63 64 - static u64 get_iowait_time(int cpu) 64 + static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) 65 65 { 66 66 u64 iowait, iowait_usecs = -1ULL; 67 67 ··· 70 70 71 71 if (iowait_usecs == -1ULL) 72 72 /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ 73 - iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; 73 + iowait = kcs->cpustat[CPUTIME_IOWAIT]; 74 74 else 75 75 iowait = iowait_usecs * NSEC_PER_USEC; 76 76 ··· 120 120 getboottime64(&boottime); 121 121 122 122 for_each_possible_cpu(i) { 123 - user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; 124 - nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; 125 - system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; 126 - idle += get_idle_time(i); 127 - iowait += get_iowait_time(i); 128 - irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; 129 - softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; 130 - steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; 131 - guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; 132 - guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; 123 + struct kernel_cpustat *kcs = &kcpustat_cpu(i); 124 + 125 + user += kcs->cpustat[CPUTIME_USER]; 126 + nice += kcs->cpustat[CPUTIME_NICE]; 127 + system += kcs->cpustat[CPUTIME_SYSTEM]; 128 + idle += get_idle_time(kcs, i); 129 + iowait += get_iowait_time(kcs, i); 130 + irq += kcs->cpustat[CPUTIME_IRQ]; 131 + softirq += kcs->cpustat[CPUTIME_SOFTIRQ]; 132 + steal += kcs->cpustat[CPUTIME_STEAL]; 133 + guest += kcs->cpustat[CPUTIME_GUEST]; 134 + guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE]; 133 135 sum += kstat_cpu_irqs_sum(i); 134 136 sum += arch_irq_stat_cpu(i); 135 137 ··· 157 155 seq_putc(p, '\n'); 158 156 159 157 for_each_online_cpu(i) { 158 + struct kernel_cpustat *kcs = &kcpustat_cpu(i); 159 + 160 160 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ 161 - user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; 162 - nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; 163 - system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; 164 - idle = get_idle_time(i); 165 - iowait = get_iowait_time(i); 166 - irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; 167 - softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; 168 - steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; 169 - guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; 170 - guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; 161 + user = 
kcs->cpustat[CPUTIME_USER]; 162 + nice = kcs->cpustat[CPUTIME_NICE]; 163 + system = kcs->cpustat[CPUTIME_SYSTEM]; 164 + idle = get_idle_time(kcs, i); 165 + iowait = get_iowait_time(kcs, i); 166 + irq = kcs->cpustat[CPUTIME_IRQ]; 167 + softirq = kcs->cpustat[CPUTIME_SOFTIRQ]; 168 + steal = kcs->cpustat[CPUTIME_STEAL]; 169 + guest = kcs->cpustat[CPUTIME_GUEST]; 170 + guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE]; 171 171 seq_printf(p, "cpu%d", i); 172 172 seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); 173 173 seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
+5 -3
fs/proc/task_mmu.c
··· 948 948 pte_t ptent = *pte; 949 949 950 950 if (pte_present(ptent)) { 951 - ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte); 952 - ptent = pte_wrprotect(ptent); 951 + pte_t old_pte; 952 + 953 + old_pte = ptep_modify_prot_start(vma, addr, pte); 954 + ptent = pte_wrprotect(old_pte); 953 955 ptent = pte_clear_soft_dirty(ptent); 954 - ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent); 956 + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); 955 957 } else if (is_swap_pte(ptent)) { 956 958 ptent = pte_swp_clear_soft_dirty(ptent); 957 959 set_pte_at(vma->vm_mm, addr, pte, ptent);
+1 -1
fs/proc/task_nommu.c
··· 178 178 seq_file_path(m, file, ""); 179 179 } else if (mm && is_stack(vma)) { 180 180 seq_pad(m, ' '); 181 - seq_printf(m, "[stack]"); 181 + seq_puts(m, "[stack]"); 182 182 } 183 183 184 184 seq_putc(m, '\n');
+8 -8
fs/proc/thread_self.c
··· 38 38 struct inode *root_inode = d_inode(s->s_root); 39 39 struct pid_namespace *ns = proc_pid_ns(root_inode); 40 40 struct dentry *thread_self; 41 + int ret = -ENOMEM; 41 42 42 43 inode_lock(root_inode); 43 44 thread_self = d_alloc_name(s->s_root, "thread-self"); ··· 52 51 inode->i_gid = GLOBAL_ROOT_GID; 53 52 inode->i_op = &proc_thread_self_inode_operations; 54 53 d_add(thread_self, inode); 54 + ret = 0; 55 55 } else { 56 56 dput(thread_self); 57 - thread_self = ERR_PTR(-ENOMEM); 58 57 } 59 - } else { 60 - thread_self = ERR_PTR(-ENOMEM); 61 58 } 62 59 inode_unlock(root_inode); 63 - if (IS_ERR(thread_self)) { 60 + 61 + if (ret) 64 62 pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); 65 - return PTR_ERR(thread_self); 66 - } 67 - ns->proc_thread_self = thread_self; 68 - return 0; 63 + else 64 + ns->proc_thread_self = thread_self; 65 + 66 + return ret; 69 67 } 70 68 71 69 void __init proc_thread_self_init(void)
+9 -9
include/asm-generic/pgtable.h
··· 606 606 return 0; 607 607 } 608 608 609 - static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm, 609 + static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, 610 610 unsigned long addr, 611 611 pte_t *ptep) 612 612 { ··· 615 615 * non-present, preventing the hardware from asynchronously 616 616 * updating it. 617 617 */ 618 - return ptep_get_and_clear(mm, addr, ptep); 618 + return ptep_get_and_clear(vma->vm_mm, addr, ptep); 619 619 } 620 620 621 - static inline void __ptep_modify_prot_commit(struct mm_struct *mm, 621 + static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, 622 622 unsigned long addr, 623 623 pte_t *ptep, pte_t pte) 624 624 { ··· 626 626 * The pte is non-present, so there's no hardware state to 627 627 * preserve. 628 628 */ 629 - set_pte_at(mm, addr, ptep, pte); 629 + set_pte_at(vma->vm_mm, addr, ptep, pte); 630 630 } 631 631 632 632 #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION ··· 644 644 * queue the update to be done at some later time. The update must be 645 645 * actually committed before the pte lock is released, however. 646 646 */ 647 - static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, 647 + static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, 648 648 unsigned long addr, 649 649 pte_t *ptep) 650 650 { 651 - return __ptep_modify_prot_start(mm, addr, ptep); 651 + return __ptep_modify_prot_start(vma, addr, ptep); 652 652 } 653 653 654 654 /* 655 655 * Commit an update to a pte, leaving any hardware-controlled bits in 656 656 * the PTE unmodified. 657 657 */ 658 - static inline void ptep_modify_prot_commit(struct mm_struct *mm, 658 + static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, 659 659 unsigned long addr, 660 - pte_t *ptep, pte_t pte) 660 + pte_t *ptep, pte_t old_pte, pte_t pte) 661 661 { 662 - __ptep_modify_prot_commit(mm, addr, ptep, pte); 662 + __ptep_modify_prot_commit(vma, addr, ptep, pte); 663 663 } 664 664 #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ 665 665 #endif /* CONFIG_MMU */
+1 -1
include/linux/backing-dev.h
··· 365 365 rcu_read_lock(); 366 366 367 367 /* 368 - * Paired with store_release in inode_switch_wb_work_fn() and 368 + * Paired with store_release in inode_switch_wbs_work_fn() and 369 369 * ensures that we see the new wb if we see cleared I_WB_SWITCH. 370 370 */ 371 371 cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+13 -21
include/linux/balloon_compaction.h
··· 4 4 * 5 5 * Common interface definitions for making balloon pages movable by compaction. 6 6 * 7 - * Despite being perfectly possible to perform ballooned pages migration, they 8 - * make a special corner case to compaction scans because balloon pages are not 9 - * enlisted at any LRU list like the other pages we do compact / migrate. 7 + * Balloon page migration makes use of the general non-lru movable page 8 + * feature. 9 + * 10 + * page->private is used to reference the responsible balloon device. 11 + * page->mapping is used in context of non-lru page migration to reference 12 + * the address space operations for page isolation/migration/compaction. 10 13 * 11 14 * As the page isolation scanning step a compaction thread does is a lockless 12 15 * procedure (from a page standpoint), it might bring some racy situations while 13 16 * performing balloon page compaction. In order to sort out these racy scenarios 14 17 * and safely perform balloon's page compaction and migration we must, always, 15 - * ensure following these three simple rules: 18 + * ensure following these simple rules: 16 19 * 17 20 * i. when updating a balloon's page ->mapping element, strictly do it under 18 21 * the following lock order, independently of the far superior ··· 24 21 * +--spin_lock_irq(&b_dev_info->pages_lock); 25 22 * ... page->mapping updates here ... 26 23 * 27 - * ii. before isolating or dequeueing a balloon page from the balloon device 28 - * pages list, the page reference counter must be raised by one and the 29 - * extra refcount must be dropped when the page is enqueued back into 30 - * the balloon device page list, thus a balloon page keeps its reference 31 - * counter raised only while it is under our special handling; 32 - * 33 - * iii. after the lockless scan step have selected a potential balloon page for 34 - * isolation, re-test the PageBalloon mark and the PagePrivate flag 35 - * under the proper page lock, to ensure isolating a valid balloon page 36 - * (not yet isolated, nor under release procedure) 37 - * 38 - * iv. isolation or dequeueing procedure must clear PagePrivate flag under 39 - * page lock together with removing page from balloon device page list. 24 + * ii. isolation or dequeueing procedure must remove the page from balloon 25 + * device page list under b_dev_info->pages_lock. 40 26 * 41 27 * The functions provided by this interface are placed to help on coping with 42 28 * the aforementioned balloon page corner case, as well as to ensure the simple ··· 95 103 static inline void balloon_page_insert(struct balloon_dev_info *balloon, 96 104 struct page *page) 97 105 { 98 - __SetPageBalloon(page); 106 + __SetPageOffline(page); 99 107 __SetPageMovable(page, balloon->inode->i_mapping); 100 108 set_page_private(page, (unsigned long)balloon); 101 109 list_add(&page->lru, &balloon->pages); ··· 111 119 */ 112 120 static inline void balloon_page_delete(struct page *page) 113 121 { 114 - __ClearPageBalloon(page); 122 + __ClearPageOffline(page); 115 123 __ClearPageMovable(page); 116 124 set_page_private(page, 0); 117 125 /* ··· 141 149 static inline void balloon_page_insert(struct balloon_dev_info *balloon, 142 150 struct page *page) 143 151 { 144 - __SetPageBalloon(page); 152 + __SetPageOffline(page); 145 153 list_add(&page->lru, &balloon->pages); 146 154 } 147 155 148 156 static inline void balloon_page_delete(struct page *page) 149 157 { 150 - __ClearPageBalloon(page); 158 + __ClearPageOffline(page); 151 159 list_del(&page->lru); 152 160 } 153 161
+4
include/linux/cgroup-defs.h
··· 32 32 struct kernfs_ops; 33 33 struct kernfs_open_file; 34 34 struct seq_file; 35 + struct poll_table_struct; 35 36 36 37 #define MAX_CGROUP_TYPE_NAMELEN 32 37 38 #define MAX_CGROUP_ROOT_NAMELEN 64 ··· 574 573 */ 575 574 ssize_t (*write)(struct kernfs_open_file *of, 576 575 char *buf, size_t nbytes, loff_t off); 576 + 577 + __poll_t (*poll)(struct kernfs_open_file *of, 578 + struct poll_table_struct *pt); 577 579 578 580 #ifdef CONFIG_DEBUG_LOCK_ALLOC 579 581 struct lock_class_key lockdep_key;
+3 -4
include/linux/compaction.h
··· 88 88 extern int sysctl_compaction_handler(struct ctl_table *table, int write, 89 89 void __user *buffer, size_t *length, loff_t *ppos); 90 90 extern int sysctl_extfrag_threshold; 91 - extern int sysctl_extfrag_handler(struct ctl_table *table, int write, 92 - void __user *buffer, size_t *length, loff_t *ppos); 93 91 extern int sysctl_compact_unevictable_allowed; 94 92 95 93 extern int fragmentation_index(struct zone *zone, unsigned int order); 96 94 extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, 97 95 unsigned int order, unsigned int alloc_flags, 98 - const struct alloc_context *ac, enum compact_priority prio); 96 + const struct alloc_context *ac, enum compact_priority prio, 97 + struct page **page); 99 98 extern void reset_isolation_suitable(pg_data_t *pgdat); 100 99 extern enum compact_result compaction_suitable(struct zone *zone, int order, 101 100 unsigned int alloc_flags, int classzone_idx); ··· 226 227 227 228 #endif /* CONFIG_COMPACTION */ 228 229 229 - #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) 230 230 struct node; 231 + #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) 231 232 extern int compaction_register_node(struct node *node); 232 233 extern void compaction_unregister_node(struct node *node); 233 234
+1 -1
include/linux/device.h
··· 1095 1095 #else 1096 1096 static inline int dev_to_node(struct device *dev) 1097 1097 { 1098 - return -1; 1098 + return NUMA_NO_NODE; 1099 1099 } 1100 1100 static inline void set_dev_node(struct device *dev, int node) 1101 1101 {
+7
include/linux/frontswap.h
··· 7 7 #include <linux/bitops.h> 8 8 #include <linux/jump_label.h> 9 9 10 + /* 11 + * Return code to denote that requested number of 12 + * frontswap pages are unused(moved to page cache). 13 + * Used in in shmem_unuse and try_to_unuse. 14 + */ 15 + #define FRONTSWAP_PAGES_UNUSED 2 16 + 10 17 struct frontswap_ops { 11 18 void (*init)(unsigned); /* this swap type was just swapon'ed */ 12 19 int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
+1 -1
include/linux/fs.h
··· 2091 2091 * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to 2092 2092 * synchronize competing switching instances and to tell 2093 2093 * wb stat updates to grab the i_pages lock. See 2094 - * inode_switch_wb_work_fn() for details. 2094 + * inode_switch_wbs_work_fn() for details. 2095 2095 * 2096 2096 * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper 2097 2097 * and work dirs among overlayfs mounts.
+15 -15
include/linux/gfp.h
··· 24 24 #define ___GFP_HIGH 0x20u 25 25 #define ___GFP_IO 0x40u 26 26 #define ___GFP_FS 0x80u 27 - #define ___GFP_WRITE 0x100u 28 - #define ___GFP_NOWARN 0x200u 29 - #define ___GFP_RETRY_MAYFAIL 0x400u 30 - #define ___GFP_NOFAIL 0x800u 31 - #define ___GFP_NORETRY 0x1000u 32 - #define ___GFP_MEMALLOC 0x2000u 33 - #define ___GFP_COMP 0x4000u 34 - #define ___GFP_ZERO 0x8000u 35 - #define ___GFP_NOMEMALLOC 0x10000u 36 - #define ___GFP_HARDWALL 0x20000u 37 - #define ___GFP_THISNODE 0x40000u 38 - #define ___GFP_ATOMIC 0x80000u 39 - #define ___GFP_ACCOUNT 0x100000u 40 - #define ___GFP_DIRECT_RECLAIM 0x200000u 41 - #define ___GFP_KSWAPD_RECLAIM 0x400000u 27 + #define ___GFP_ZERO 0x100u 28 + #define ___GFP_ATOMIC 0x200u 29 + #define ___GFP_DIRECT_RECLAIM 0x400u 30 + #define ___GFP_KSWAPD_RECLAIM 0x800u 31 + #define ___GFP_WRITE 0x1000u 32 + #define ___GFP_NOWARN 0x2000u 33 + #define ___GFP_RETRY_MAYFAIL 0x4000u 34 + #define ___GFP_NOFAIL 0x8000u 35 + #define ___GFP_NORETRY 0x10000u 36 + #define ___GFP_MEMALLOC 0x20000u 37 + #define ___GFP_COMP 0x40000u 38 + #define ___GFP_NOMEMALLOC 0x80000u 39 + #define ___GFP_HARDWALL 0x100000u 40 + #define ___GFP_THISNODE 0x200000u 41 + #define ___GFP_ACCOUNT 0x400000u 42 42 #ifdef CONFIG_LOCKDEP 43 43 #define ___GFP_NOLOCKDEP 0x800000u 44 44 #else
+69 -5
include/linux/hugetlb.h
··· 371 371 nodemask_t *nmask); 372 372 struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, 373 373 unsigned long address); 374 + struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, 375 + int nid, nodemask_t *nmask); 374 376 int huge_add_to_page_cache(struct page *page, struct address_space *mapping, 375 377 pgoff_t idx); 376 378 ··· 495 493 extern int dissolve_free_huge_page(struct page *page); 496 494 extern int dissolve_free_huge_pages(unsigned long start_pfn, 497 495 unsigned long end_pfn); 498 - static inline bool hugepage_migration_supported(struct hstate *h) 499 - { 496 + 500 497 #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION 498 + #ifndef arch_hugetlb_migration_supported 499 + static inline bool arch_hugetlb_migration_supported(struct hstate *h) 500 + { 501 501 if ((huge_page_shift(h) == PMD_SHIFT) || 502 - (huge_page_shift(h) == PGDIR_SHIFT)) 502 + (huge_page_shift(h) == PUD_SHIFT) || 503 + (huge_page_shift(h) == PGDIR_SHIFT)) 503 504 return true; 504 505 else 505 506 return false; 506 - #else 507 - return false; 507 + } 508 508 #endif 509 + #else 510 + static inline bool arch_hugetlb_migration_supported(struct hstate *h) 511 + { 512 + return false; 513 + } 514 + #endif 515 + 516 + static inline bool hugepage_migration_supported(struct hstate *h) 517 + { 518 + return arch_hugetlb_migration_supported(h); 519 + } 520 + 521 + /* 522 + * Movability check is different as compared to migration check. 523 + * It determines whether or not a huge page should be placed on 524 + * movable zone or not. Movability of any huge page should be 525 + * required only if huge page size is supported for migration. 526 + * There wont be any reason for the huge page to be movable if 527 + * it is not migratable to start with. Also the size of the huge 528 + * page should be large enough to be placed under a movable zone 529 + * and still feasible enough to be migratable. Just the presence 530 + * in movable zone does not make the migration feasible. 531 + * 532 + * So even though large huge page sizes like the gigantic ones 533 + * are migratable they should not be movable because its not 534 + * feasible to migrate them from movable zone. 
535 + */ 536 + static inline bool hugepage_movable_supported(struct hstate *h) 537 + { 538 + if (!hugepage_migration_supported(h)) 539 + return false; 540 + 541 + if (hstate_is_gigantic(h)) 542 + return false; 543 + return true; 509 544 } 510 545 511 546 static inline spinlock_t *huge_pte_lockptr(struct hstate *h, ··· 582 543 set_huge_pte_at(mm, addr, ptep, pte); 583 544 } 584 545 #endif 546 + 547 + #ifndef huge_ptep_modify_prot_start 548 + #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start 549 + static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, 550 + unsigned long addr, pte_t *ptep) 551 + { 552 + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); 553 + } 554 + #endif 555 + 556 + #ifndef huge_ptep_modify_prot_commit 557 + #define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit 558 + static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, 559 + unsigned long addr, pte_t *ptep, 560 + pte_t old_pte, pte_t pte) 561 + { 562 + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); 563 + } 564 + #endif 565 + 585 566 #else /* CONFIG_HUGETLB_PAGE */ 586 567 struct hstate {}; 587 568 #define alloc_huge_page(v, a, r) NULL ··· 657 598 } 658 599 659 600 static inline bool hugepage_migration_supported(struct hstate *h) 601 + { 602 + return false; 603 + } 604 + 605 + static inline bool hugepage_movable_supported(struct hstate *h) 660 606 { 661 607 return false; 662 608 }
+1 -1
include/linux/kasan-checks.h
··· 2 2 #ifndef _LINUX_KASAN_CHECKS_H 3 3 #define _LINUX_KASAN_CHECKS_H 4 4 5 - #ifdef CONFIG_KASAN 5 + #if defined(__SANITIZE_ADDRESS__) || defined(__KASAN_INTERNAL) 6 6 void kasan_check_read(const volatile void *p, unsigned int size); 7 7 void kasan_check_write(const volatile void *p, unsigned int size); 8 8 #else
+6
include/linux/kernfs.h
··· 25 25 struct vm_area_struct; 26 26 struct super_block; 27 27 struct file_system_type; 28 + struct poll_table_struct; 28 29 29 30 struct kernfs_open_node; 30 31 struct kernfs_iattrs; ··· 262 261 ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, 263 262 loff_t off); 264 263 264 + __poll_t (*poll)(struct kernfs_open_file *of, 265 + struct poll_table_struct *pt); 266 + 265 267 int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); 266 268 267 269 #ifdef CONFIG_DEBUG_LOCK_ALLOC ··· 354 350 int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, 355 351 const char *new_name, const void *new_ns); 356 352 int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); 353 + __poll_t kernfs_generic_poll(struct kernfs_open_file *of, 354 + struct poll_table_struct *pt); 357 355 void kernfs_notify(struct kernfs_node *kn); 358 356 359 357 const void *kernfs_super_ns(struct super_block *sb);
+7
include/linux/ksm.h
··· 53 53 54 54 void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); 55 55 void ksm_migrate_page(struct page *newpage, struct page *oldpage); 56 + bool reuse_ksm_page(struct page *page, 57 + struct vm_area_struct *vma, unsigned long address); 56 58 57 59 #else /* !CONFIG_KSM */ 58 60 ··· 87 85 88 86 static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) 89 87 { 88 + } 89 + static inline bool reuse_ksm_page(struct page *page, 90 + struct vm_area_struct *vma, unsigned long address) 91 + { 92 + return false; 90 93 } 91 94 #endif /* CONFIG_MMU */ 92 95 #endif /* !CONFIG_KSM */
+11
include/linux/list.h
··· 207 207 } 208 208 209 209 /** 210 + * list_is_first -- tests whether @ list is the first entry in list @head 211 + * @list: the entry to test 212 + * @head: the head of the list 213 + */ 214 + static inline int list_is_first(const struct list_head *list, 215 + const struct list_head *head) 216 + { 217 + return list->prev == head; 218 + } 219 + 220 + /** 210 221 * list_is_last - tests whether @list is the last entry in list @head 211 222 * @list: the entry to test 212 223 * @head: the head of the list
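Editor's note: list_is_first() is the symmetric counterpart of the existing list_is_last() (it simply tests list->prev == head). A short hedged usage sketch; the request_entry structure and request_is_oldest() helper are illustrative only.

        #include <linux/list.h>
        #include <linux/types.h>

        struct request_entry {                  /* illustrative structure */
                struct list_head lru;
                int id;
        };

        /* True when @req is the first entry of @queue, i.e. req->lru.prev == queue. */
        static bool request_is_oldest(struct request_entry *req,
                                      struct list_head *queue)
        {
                return list_is_first(&req->lru, queue);
        }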
+43 -4
include/linux/memcontrol.h
··· 429 429 } 430 430 struct mem_cgroup *mem_cgroup_from_id(unsigned short id); 431 431 432 + static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) 433 + { 434 + return mem_cgroup_from_css(seq_css(m)); 435 + } 436 + 432 437 static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) 433 438 { 434 439 struct mem_cgroup_per_node *mz; ··· 942 937 return NULL; 943 938 } 944 939 940 + static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) 941 + { 942 + return NULL; 943 + } 944 + 945 945 static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) 946 946 { 947 947 return NULL; ··· 1283 1273 1284 1274 struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); 1285 1275 void memcg_kmem_put_cache(struct kmem_cache *cachep); 1286 - int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, 1287 - struct mem_cgroup *memcg); 1288 1276 1289 1277 #ifdef CONFIG_MEMCG_KMEM 1290 - int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); 1291 - void memcg_kmem_uncharge(struct page *page, int order); 1278 + int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); 1279 + void __memcg_kmem_uncharge(struct page *page, int order); 1280 + int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, 1281 + struct mem_cgroup *memcg); 1292 1282 1293 1283 extern struct static_key_false memcg_kmem_enabled_key; 1294 1284 extern struct workqueue_struct *memcg_kmem_cache_wq; ··· 1310 1300 return static_branch_unlikely(&memcg_kmem_enabled_key); 1311 1301 } 1312 1302 1303 + static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) 1304 + { 1305 + if (memcg_kmem_enabled()) 1306 + return __memcg_kmem_charge(page, gfp, order); 1307 + return 0; 1308 + } 1309 + 1310 + static inline void memcg_kmem_uncharge(struct page *page, int order) 1311 + { 1312 + if (memcg_kmem_enabled()) 1313 + __memcg_kmem_uncharge(page, order); 1314 + } 1315 + 1316 + static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, 1317 + int order, struct mem_cgroup *memcg) 1318 + { 1319 + if (memcg_kmem_enabled()) 1320 + return __memcg_kmem_charge_memcg(page, gfp, order, memcg); 1321 + return 0; 1322 + } 1313 1323 /* 1314 1324 * helper for accessing a memcg's index. It will be used as an index in the 1315 1325 * child cache array in kmem_cache, and also to derive its name. This function ··· 1352 1322 } 1353 1323 1354 1324 static inline void memcg_kmem_uncharge(struct page *page, int order) 1325 + { 1326 + } 1327 + 1328 + static inline int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) 1329 + { 1330 + return 0; 1331 + } 1332 + 1333 + static inline void __memcg_kmem_uncharge(struct page *page, int order) 1355 1334 { 1356 1335 } 1357 1336
+1 -1
include/linux/memory_hotplug.h
··· 89 89 unsigned long *valid_start, unsigned long *valid_end); 90 90 extern void __offline_isolated_pages(unsigned long, unsigned long); 91 91 92 - typedef void (*online_page_callback_t)(struct page *page); 92 + typedef void (*online_page_callback_t)(struct page *page, unsigned int order); 93 93 94 94 extern int set_online_page_callback(online_page_callback_t callback); 95 95 extern int restore_online_page_callback(online_page_callback_t callback);
+2 -1
include/linux/mm.h
··· 1536 1536 unsigned int gup_flags, struct page **pages, int *locked); 1537 1537 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, 1538 1538 struct page **pages, unsigned int gup_flags); 1539 - #ifdef CONFIG_FS_DAX 1539 + 1540 + #if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA) 1540 1541 long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, 1541 1542 unsigned int gup_flags, struct page **pages, 1542 1543 struct vm_area_struct **vmas);
+1 -1
include/linux/mm_types.h
··· 80 80 struct { /* Page cache and anonymous pages */ 81 81 /** 82 82 * @lru: Pageout list, eg. active_list protected by 83 - * zone_lru_lock. Sometimes used as a generic list 83 + * pgdat->lru_lock. Sometimes used as a generic list 84 84 * by the page owner. 85 85 */ 86 86 struct list_head lru;
+3 -5
include/linux/mmzone.h
··· 480 480 unsigned long compact_cached_free_pfn; 481 481 /* pfn where async and sync compaction migration scanner should start */ 482 482 unsigned long compact_cached_migrate_pfn[2]; 483 + unsigned long compact_init_migrate_pfn; 484 + unsigned long compact_init_free_pfn; 483 485 #endif 484 486 485 487 #ifdef CONFIG_COMPACTION ··· 730 728 731 729 #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) 732 730 #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) 733 - static inline spinlock_t *zone_lru_lock(struct zone *zone) 734 - { 735 - return &zone->zone_pgdat->lru_lock; 736 - } 737 731 738 732 static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) 739 733 { ··· 1297 1299 1298 1300 /* 1299 1301 * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we 1300 - * need to check pfn validility within that MAX_ORDER_NR_PAGES block. 1302 + * need to check pfn validity within that MAX_ORDER_NR_PAGES block. 1301 1303 * pfn_valid_within() should be used in this case; we optimise this away 1302 1304 * when we have no holes within a MAX_ORDER_NR_PAGES block. 1303 1305 */
+4 -4
include/linux/nodemask.h
··· 444 444 return next_node(nid, node_states[N_MEMORY]); 445 445 } 446 446 447 - extern int nr_node_ids; 448 - extern int nr_online_nodes; 447 + extern unsigned int nr_node_ids; 448 + extern unsigned int nr_online_nodes; 449 449 450 450 static inline void node_set_online(int nid) 451 451 { ··· 485 485 #define first_online_node 0 486 486 #define first_memory_node 0 487 487 #define next_online_node(nid) (MAX_NUMNODES) 488 - #define nr_node_ids 1 489 - #define nr_online_nodes 1 488 + #define nr_node_ids 1U 489 + #define nr_online_nodes 1U 490 490 491 491 #define node_set_online(node) node_set_state((node), N_ONLINE) 492 492 #define node_set_offline(node) node_clear_state((node), N_ONLINE)
+38 -6
include/linux/page-flags.h
··· 17 17 /* 18 18 * Various page->flags bits: 19 19 * 20 - * PG_reserved is set for special pages, which can never be swapped out. Some 21 - * of them might not even exist... 20 + * PG_reserved is set for special pages. The "struct page" of such a page 21 + * should in general not be touched (e.g. set dirty) except by its owner. 22 + * Pages marked as PG_reserved include: 23 + * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS, 24 + * initrd, HW tables) 25 + * - Pages reserved or allocated early during boot (before the page allocator 26 + * was initialized). This includes (depending on the architecture) the 27 + * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much 28 + * much more. Once (if ever) freed, PG_reserved is cleared and they will 29 + * be given to the page allocator. 30 + * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying 31 + * to read/write these pages might end badly. Don't touch! 32 + * - The zero page(s) 33 + * - Pages not added to the page allocator when onlining a section because 34 + * they were excluded via the online_page_callback() or because they are 35 + * PG_hwpoison. 36 + * - Pages allocated in the context of kexec/kdump (loaded kernel image, 37 + * control pages, vmcoreinfo) 38 + * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are 39 + * not marked PG_reserved (as they might be in use by somebody else who does 40 + * not respect the caching strategy). 41 + * - Pages part of an offline section (struct pages of offline sections should 42 + * not be trusted as they will be initialized when first onlined). 43 + * - MCA pages on ia64 44 + * - Pages holding CPU notes for POWER Firmware Assisted Dump 45 + * - Device memory (e.g. PMEM, DAX, HMM) 46 + * Some PG_reserved pages will be excluded from the hibernation image. 47 + * PG_reserved does in general not hinder anybody from dumping or swapping 48 + * and is no longer required for remap_pfn_range(). ioremap might require it. 49 + * Consequently, PG_reserved for a page mapped into user space can indicate 50 + * the zero page, the vDSO, MMIO pages or device memory. 22 51 * 23 52 * The PG_private bitflag is set on pagecache pages if they contain filesystem 24 53 * specific data (which is normally at page->private). It can be used by ··· 700 671 /* Reserve 0x0000007f to catch underflows of page_mapcount */ 701 672 #define PAGE_MAPCOUNT_RESERVE -128 702 673 #define PG_buddy 0x00000080 703 - #define PG_balloon 0x00000100 674 + #define PG_offline 0x00000100 704 675 #define PG_kmemcg 0x00000200 705 676 #define PG_table 0x00000400 706 677 ··· 735 706 PAGE_TYPE_OPS(Buddy, buddy) 736 707 737 708 /* 738 - * PageBalloon() is true for pages that are on the balloon page list 739 - * (see mm/balloon_compaction.c). 709 + * PageOffline() indicates that the page is logically offline although the 710 + * containing section is online. (e.g. inflated in a balloon driver or 711 + * not onlined when onlining the section). 712 + * The content of these pages is effectively stale. Such pages should not 713 + * be touched (read/write/dump/save) except by their owner. 740 714 */ 741 - PAGE_TYPE_OPS(Balloon, balloon) 715 + PAGE_TYPE_OPS(Offline, offline) 742 716 743 717 /* 744 718 * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
+9 -22
include/linux/pagemap.h
··· 164 164 * will find the page or it will not. Likewise, the old find_get_page could run 165 165 * either before the insertion or afterwards, depending on timing. 166 166 */ 167 - static inline int page_cache_get_speculative(struct page *page) 167 + static inline int __page_cache_add_speculative(struct page *page, int count) 168 168 { 169 169 #ifdef CONFIG_TINY_RCU 170 170 # ifdef CONFIG_PREEMPT_COUNT ··· 180 180 * SMP requires. 181 181 */ 182 182 VM_BUG_ON_PAGE(page_count(page) == 0, page); 183 - page_ref_inc(page); 183 + page_ref_add(page, count); 184 184 185 185 #else 186 - if (unlikely(!get_page_unless_zero(page))) { 186 + if (unlikely(!page_ref_add_unless(page, count, 0))) { 187 187 /* 188 188 * Either the page has been freed, or will be freed. 189 189 * In either case, retry here and the caller should ··· 197 197 return 1; 198 198 } 199 199 200 - /* 201 - * Same as above, but add instead of inc (could just be merged) 202 - */ 200 + static inline int page_cache_get_speculative(struct page *page) 201 + { 202 + return __page_cache_add_speculative(page, 1); 203 + } 204 + 203 205 static inline int page_cache_add_speculative(struct page *page, int count) 204 206 { 205 - VM_BUG_ON(in_interrupt()); 206 - 207 - #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) 208 - # ifdef CONFIG_PREEMPT_COUNT 209 - VM_BUG_ON(!in_atomic() && !irqs_disabled()); 210 - # endif 211 - VM_BUG_ON_PAGE(page_count(page) == 0, page); 212 - page_ref_add(page, count); 213 - 214 - #else 215 - if (unlikely(!page_ref_add_unless(page, count, 0))) 216 - return 0; 217 - #endif 218 - VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page); 219 - 220 - return 1; 207 + return __page_cache_add_speculative(page, count); 221 208 } 222 209 223 210 #ifdef CONFIG_NUMA
+1 -1
include/linux/poison.h
··· 30 30 */ 31 31 #define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) 32 32 33 - /********** mm/debug-pagealloc.c **********/ 33 + /********** mm/page_poison.c **********/ 34 34 #ifdef CONFIG_PAGE_POISONING_ZERO 35 35 #define PAGE_POISON 0x00 36 36 #else
+5
include/linux/sched.h
··· 48 48 struct pipe_inode_info; 49 49 struct rcu_node; 50 50 struct reclaim_state; 51 + struct capture_control; 51 52 struct robust_list_head; 52 53 struct sched_attr; 53 54 struct sched_param; ··· 951 950 952 951 struct io_context *io_context; 953 952 953 + #ifdef CONFIG_COMPACTION 954 + struct capture_control *capture_control; 955 + #endif 954 956 /* Ptrace state: */ 955 957 unsigned long ptrace_message; 956 958 kernel_siginfo_t *last_siginfo; ··· 1399 1395 #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ 1400 1396 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ 1401 1397 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1398 + #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ 1402 1399 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ 1403 1400 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ 1404 1401
+40 -8
include/linux/sched/mm.h
··· 148 148 * Applies per-task gfp context to the given allocation flags. 149 149 * PF_MEMALLOC_NOIO implies GFP_NOIO 150 150 * PF_MEMALLOC_NOFS implies GFP_NOFS 151 + * PF_MEMALLOC_NOCMA implies no allocation from CMA region. 151 152 */ 152 153 static inline gfp_t current_gfp_context(gfp_t flags) 153 154 { 154 - /* 155 - * NOIO implies both NOIO and NOFS and it is a weaker context 156 - * so always make sure it makes precedence 157 - */ 158 - if (unlikely(current->flags & PF_MEMALLOC_NOIO)) 159 - flags &= ~(__GFP_IO | __GFP_FS); 160 - else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) 161 - flags &= ~__GFP_FS; 155 + if (unlikely(current->flags & 156 + (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_NOCMA))) { 157 + /* 158 + * NOIO implies both NOIO and NOFS and it is a weaker context 159 + * so always make sure it makes precedence 160 + */ 161 + if (current->flags & PF_MEMALLOC_NOIO) 162 + flags &= ~(__GFP_IO | __GFP_FS); 163 + else if (current->flags & PF_MEMALLOC_NOFS) 164 + flags &= ~__GFP_FS; 165 + #ifdef CONFIG_CMA 166 + if (current->flags & PF_MEMALLOC_NOCMA) 167 + flags &= ~__GFP_MOVABLE; 168 + #endif 169 + } 162 170 return flags; 163 171 } 164 172 ··· 255 247 { 256 248 current->flags = (current->flags & ~PF_MEMALLOC) | flags; 257 249 } 250 + 251 + #ifdef CONFIG_CMA 252 + static inline unsigned int memalloc_nocma_save(void) 253 + { 254 + unsigned int flags = current->flags & PF_MEMALLOC_NOCMA; 255 + 256 + current->flags |= PF_MEMALLOC_NOCMA; 257 + return flags; 258 + } 259 + 260 + static inline void memalloc_nocma_restore(unsigned int flags) 261 + { 262 + current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags; 263 + } 264 + #else 265 + static inline unsigned int memalloc_nocma_save(void) 266 + { 267 + return 0; 268 + } 269 + 270 + static inline void memalloc_nocma_restore(unsigned int flags) 271 + { 272 + } 273 + #endif 258 274 259 275 #ifdef CONFIG_MEMCG 260 276 /**
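Editor's note: memalloc_nocma_save()/memalloc_nocma_restore() follow the same scoped-flag pattern as the existing memalloc_noio/nofs helpers: while PF_MEMALLOC_NOCMA is set, current_gfp_context() strips __GFP_MOVABLE so that allocations in the window are expected to stay out of CMA pageblocks. A hedged sketch of the intended call pattern; pin_page_outside_cma() is an illustrative wrapper, not part of this series.

        #include <linux/gfp.h>
        #include <linux/sched/mm.h>

        static struct page *pin_page_outside_cma(void)
        {
                unsigned int flags = memalloc_nocma_save();
                struct page *page;

                /* __GFP_MOVABLE is expected to be filtered by current_gfp_context(). */
                page = alloc_page(GFP_HIGHUSER_MOVABLE);

                memalloc_nocma_restore(flags);
                return page;
        }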
+2 -1
include/linux/shmem_fs.h
··· 72 72 extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 73 73 pgoff_t index, gfp_t gfp_mask); 74 74 extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); 75 - extern int shmem_unuse(swp_entry_t entry, struct page *page); 75 + extern int shmem_unuse(unsigned int type, bool frontswap, 76 + unsigned long *fs_pages_to_unuse); 76 77 77 78 extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); 78 79 extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
+6 -6
include/linux/slub_def.h
··· 81 81 */ 82 82 struct kmem_cache { 83 83 struct kmem_cache_cpu __percpu *cpu_slab; 84 - /* Used for retriving partial slabs etc */ 84 + /* Used for retrieving partial slabs, etc. */ 85 85 slab_flags_t flags; 86 86 unsigned long min_partial; 87 - unsigned int size; /* The size of an object including meta data */ 88 - unsigned int object_size;/* The size of an object without meta data */ 89 - unsigned int offset; /* Free pointer offset. */ 87 + unsigned int size; /* The size of an object including metadata */ 88 + unsigned int object_size;/* The size of an object without metadata */ 89 + unsigned int offset; /* Free pointer offset */ 90 90 #ifdef CONFIG_SLUB_CPU_PARTIAL 91 91 /* Number of per cpu partial objects to keep around */ 92 92 unsigned int cpu_partial; ··· 110 110 #endif 111 111 #ifdef CONFIG_MEMCG 112 112 struct memcg_cache_params memcg_params; 113 - /* for propagation, maximum size of a stored attr */ 113 + /* For propagation, maximum size of a stored attr */ 114 114 unsigned int max_attr_size; 115 115 #ifdef CONFIG_SYSFS 116 116 struct kset *memcg_kset; ··· 151 151 #else 152 152 #define slub_cpu_partial(s) (0) 153 153 #define slub_set_cpu_partial(s, n) 154 - #endif // CONFIG_SLUB_CPU_PARTIAL 154 + #endif /* CONFIG_SLUB_CPU_PARTIAL */ 155 155 156 156 #ifdef CONFIG_SYSFS 157 157 #define SLAB_SUPPORTS_SYSFS
+2 -2
include/linux/swap.h
··· 307 307 }; 308 308 309 309 /* linux/mm/workingset.c */ 310 - void *workingset_eviction(struct address_space *mapping, struct page *page); 310 + void *workingset_eviction(struct page *page); 311 311 void workingset_refault(struct page *page, void *shadow); 312 312 void workingset_activation(struct page *page); 313 313 ··· 625 625 return vm_swappiness; 626 626 627 627 /* root ? */ 628 - if (mem_cgroup_disabled() || !memcg->css.parent) 628 + if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg)) 629 629 return vm_swappiness; 630 630 631 631 return memcg->swappiness;
+1
include/uapi/linux/fcntl.h
··· 41 41 #define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ 42 42 #define F_SEAL_GROW 0x0004 /* prevent file from growing */ 43 43 #define F_SEAL_WRITE 0x0008 /* prevent writes */ 44 + #define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ 44 45 /* (1U << 31) is reserved for signed error codes */ 45 46 46 47 /*
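Editor's note: F_SEAL_FUTURE_WRITE is applied from userspace through the regular memfd sealing interface. A small hedged userspace sketch; it assumes a glibc with memfd_create(), the fallback #define mirrors the value added above, and the refused write is expected to fail with EPERM.

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/mman.h>
        #include <unistd.h>

        #ifndef F_SEAL_FUTURE_WRITE
        #define F_SEAL_FUTURE_WRITE 0x0010
        #endif

        int main(void)
        {
                int fd = memfd_create("sealed", MFD_ALLOW_SEALING);

                if (fd < 0 || fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0) {
                        perror("memfd/seal");
                        return 1;
                }

                /* Writes through the fd are refused once the seal is in place. */
                if (write(fd, "x", 1) < 0)
                        perror("write after F_SEAL_FUTURE_WRITE");

                close(fd);
                return 0;
        }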
+1 -1
include/uapi/linux/kernel-page-flags.h
··· 32 32 33 33 #define KPF_KSM 21 34 34 #define KPF_THP 22 35 - #define KPF_BALLOON 23 35 + #define KPF_OFFLINE 23 36 36 #define KPF_ZERO_PAGE 24 37 37 #define KPF_IDLE 25 38 38 #define KPF_PGTABLE 26
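Editor's note: with bit 23 repurposed from KPF_BALLOON to KPF_OFFLINE, tools reading /proc/kpageflags now see logically offline pages (ballooned, or never onlined) under the new name. A hedged userspace sketch that tests the bit for one PFN; the file is root-only and the PFN value is purely illustrative.

        #include <fcntl.h>
        #include <inttypes.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <unistd.h>

        #define KPF_OFFLINE 23

        int main(void)
        {
                uint64_t pfn = 0x10000;         /* illustrative PFN */
                uint64_t flags = 0;
                int fd = open("/proc/kpageflags", O_RDONLY);

                /* One 64-bit flags word per PFN, so the offset is pfn * 8. */
                if (fd < 0 ||
                    pread(fd, &flags, sizeof(flags), pfn * 8) != sizeof(flags)) {
                        perror("/proc/kpageflags");
                        return 1;
                }

                printf("pfn %#" PRIx64 ": offline=%d\n",
                       pfn, (int)((flags >> KPF_OFFLINE) & 1));
                close(fd);
                return 0;
        }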
+2 -1
init/init_task.c
··· 10 10 #include <linux/fs.h> 11 11 #include <linux/mm.h> 12 12 #include <linux/audit.h> 13 + #include <linux/numa.h> 13 14 14 15 #include <asm/pgtable.h> 15 16 #include <linux/uaccess.h> ··· 155 154 .vtime.state = VTIME_SYS, 156 155 #endif 157 156 #ifdef CONFIG_NUMA_BALANCING 158 - .numa_preferred_nid = -1, 157 + .numa_preferred_nid = NUMA_NO_NODE, 159 158 .numa_group = NULL, 160 159 .numa_faults = NULL, 161 160 #endif
+12
kernel/cgroup/cgroup.c
··· 3534 3534 return ret ?: nbytes; 3535 3535 } 3536 3536 3537 + static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) 3538 + { 3539 + struct cftype *cft = of->kn->priv; 3540 + 3541 + if (cft->poll) 3542 + return cft->poll(of, pt); 3543 + 3544 + return kernfs_generic_poll(of, pt); 3545 + } 3546 + 3537 3547 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 3538 3548 { 3539 3549 return seq_cft(seq)->seq_start(seq, ppos); ··· 3582 3572 .open = cgroup_file_open, 3583 3573 .release = cgroup_file_release, 3584 3574 .write = cgroup_file_write, 3575 + .poll = cgroup_file_poll, 3585 3576 .seq_show = cgroup_seqfile_show, 3586 3577 }; 3587 3578 ··· 3591 3580 .open = cgroup_file_open, 3592 3581 .release = cgroup_file_release, 3593 3582 .write = cgroup_file_write, 3583 + .poll = cgroup_file_poll, 3594 3584 .seq_start = cgroup_seqfile_start, 3595 3585 .seq_next = cgroup_seqfile_next, 3596 3586 .seq_stop = cgroup_seqfile_stop,
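The new cgroup_file_poll() hook lets an individual cftype provide its own poll() semantics; every other cgroup control file falls back to kernfs_generic_poll(), where kernfs_notify() wakes pollers with POLLPRI | POLLERR. A hedged user-space sketch of consuming such a notification is below; the cgroup v2 mount point, the group name and the choice of memory.events are assumptions for illustration, not something introduced by this patch.

/* Sketch: block until a cgroup v2 control file signals a change, then
 * re-read it. Paths are assumptions; notification arrives as POLLPRI/POLLERR. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct pollfd pfd;
	char buf[4096];
	ssize_t n;

	pfd.fd = open("/sys/fs/cgroup/mygroup/memory.events", O_RDONLY);
	if (pfd.fd < 0) { perror("open"); return 1; }
	pfd.events = POLLPRI;

	/* Read once so the next wakeup corresponds to a new change. */
	n = read(pfd.fd, buf, sizeof(buf));
	(void)n;

	if (poll(&pfd, 1, -1) == 1 && (pfd.revents & (POLLPRI | POLLERR))) {
		lseek(pfd.fd, 0, SEEK_SET);
		n = read(pfd.fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("memory.events changed:\n%s", buf);
		}
	}
	close(pfd.fd);
	return 0;
}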
+2
kernel/crash_core.c
··· 464 464 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); 465 465 #ifdef CONFIG_HUGETLB_PAGE 466 466 VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); 467 + #define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) 468 + VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); 467 469 #endif 468 470 469 471 arch_crash_save_vmcoreinfo();
+2 -1
kernel/kthread.c
··· 20 20 #include <linux/freezer.h> 21 21 #include <linux/ptrace.h> 22 22 #include <linux/uaccess.h> 23 + #include <linux/numa.h> 23 24 #include <trace/events/sched.h> 24 25 25 26 static DEFINE_SPINLOCK(kthread_create_lock); ··· 682 681 { 683 682 struct kthread_worker *worker; 684 683 struct task_struct *task; 685 - int node = -1; 684 + int node = NUMA_NO_NODE; 686 685 687 686 worker = kzalloc(sizeof(*worker), GFP_KERNEL); 688 687 if (!worker)
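Several call sites in this series replace a bare -1 node id with NUMA_NO_NODE from <linux/numa.h>, which is defined as -1. A hypothetical kernel-side sketch of passing the constant through the kthread API is below; demo_threadfn and demo_start_worker are made-up names used only for illustration.

/* Hypothetical sketch: NUMA_NO_NODE is the named way to say "no NUMA
 * placement preference" when creating a kthread, instead of a bare -1. */
#include <linux/kthread.h>
#include <linux/numa.h>
#include <linux/sched.h>

static int demo_threadfn(void *data)
{
	/* Body left trivial; a real worker loops until kthread_should_stop(). */
	return 0;
}

/* Caller would wake_up_process() the returned task to actually start it. */
static struct task_struct *demo_start_worker(void)
{
	return kthread_create_on_node(demo_threadfn, NULL, NUMA_NO_NODE,
				      "demo_worker");
}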
+11 -6
kernel/power/snapshot.c
··· 1215 1215 if (!pfn_valid(pfn)) 1216 1216 return NULL; 1217 1217 1218 - page = pfn_to_page(pfn); 1219 - if (page_zone(page) != zone) 1218 + page = pfn_to_online_page(pfn); 1219 + if (!page || page_zone(page) != zone) 1220 1220 return NULL; 1221 1221 1222 1222 BUG_ON(!PageHighMem(page)); 1223 1223 1224 - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || 1225 - PageReserved(page)) 1224 + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) 1225 + return NULL; 1226 + 1227 + if (PageReserved(page) || PageOffline(page)) 1226 1228 return NULL; 1227 1229 1228 1230 if (page_is_guard(page)) ··· 1279 1277 if (!pfn_valid(pfn)) 1280 1278 return NULL; 1281 1279 1282 - page = pfn_to_page(pfn); 1283 - if (page_zone(page) != zone) 1280 + page = pfn_to_online_page(pfn); 1281 + if (!page || page_zone(page) != zone) 1284 1282 return NULL; 1285 1283 1286 1284 BUG_ON(PageHighMem(page)); 1287 1285 1288 1286 if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) 1287 + return NULL; 1288 + 1289 + if (PageOffline(page)) 1289 1290 return NULL; 1290 1291 1291 1292 if (PageReserved(page)
+3
kernel/sched/core.c
··· 2220 2220 INIT_HLIST_HEAD(&p->preempt_notifiers); 2221 2221 #endif 2222 2222 2223 + #ifdef CONFIG_COMPACTION 2224 + p->capture_control = NULL; 2225 + #endif 2223 2226 init_numa_balancing(clone_flags, p); 2224 2227 } 2225 2228
+8 -7
kernel/sched/fair.c
··· 1173 1173 1174 1174 /* New address space, reset the preferred nid */ 1175 1175 if (!(clone_flags & CLONE_VM)) { 1176 - p->numa_preferred_nid = -1; 1176 + p->numa_preferred_nid = NUMA_NO_NODE; 1177 1177 return; 1178 1178 } 1179 1179 ··· 1193 1193 1194 1194 static void account_numa_enqueue(struct rq *rq, struct task_struct *p) 1195 1195 { 1196 - rq->nr_numa_running += (p->numa_preferred_nid != -1); 1196 + rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); 1197 1197 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); 1198 1198 } 1199 1199 1200 1200 static void account_numa_dequeue(struct rq *rq, struct task_struct *p) 1201 1201 { 1202 - rq->nr_numa_running -= (p->numa_preferred_nid != -1); 1202 + rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE); 1203 1203 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); 1204 1204 } 1205 1205 ··· 1413 1413 * two full passes of the "multi-stage node selection" test that is 1414 1414 * executed below. 1415 1415 */ 1416 - if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && 1416 + if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && 1417 1417 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) 1418 1418 return true; 1419 1419 ··· 1861 1861 unsigned long interval = HZ; 1862 1862 1863 1863 /* This task has no NUMA fault statistics yet */ 1864 - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1864 + if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) 1865 1865 return; 1866 1866 1867 1867 /* Periodically retry migrating the task to the preferred node */ ··· 2108 2108 2109 2109 static void task_numa_placement(struct task_struct *p) 2110 2110 { 2111 - int seq, nid, max_nid = -1; 2111 + int seq, nid, max_nid = NUMA_NO_NODE; 2112 2112 unsigned long max_faults = 0; 2113 2113 unsigned long fault_types[2] = { 0, 0 }; 2114 2114 unsigned long total_faults; ··· 2651 2651 * the preferred node. 2652 2652 */ 2653 2653 if (dst_nid == p->numa_preferred_nid || 2654 - (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) 2654 + (p->numa_preferred_nid != NUMA_NO_NODE && 2655 + src_nid != p->numa_preferred_nid)) 2655 2656 return; 2656 2657 } 2657 2658
+1 -1
kernel/sysctl.c
··· 1471 1471 .data = &sysctl_extfrag_threshold, 1472 1472 .maxlen = sizeof(int), 1473 1473 .mode = 0644, 1474 - .proc_handler = sysctl_extfrag_handler, 1474 + .proc_handler = proc_dointvec_minmax, 1475 1475 .extra1 = &min_extfrag_threshold, 1476 1476 .extra2 = &max_extfrag_threshold, 1477 1477 },
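The bespoke sysctl_extfrag_handler is dropped in favour of the generic proc_dointvec_minmax, which bounds-checks writes against the existing min_extfrag_threshold/max_extfrag_threshold limits. From user space the knob is still /proc/sys/vm/extfrag_threshold (present with CONFIG_COMPACTION); a small sketch of reading and writing it follows, with the written value and the root requirement as assumptions.

/* Sketch: read the current extfrag_threshold and write a value back;
 * out-of-range values are now rejected by proc_dointvec_minmax. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	ssize_t n;
	int fd = open("/proc/sys/vm/extfrag_threshold", O_RDWR);	/* root for writes */

	if (fd < 0) { perror("open"); return 1; }

	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("extfrag_threshold = %s", buf);
	}

	lseek(fd, 0, SEEK_SET);
	if (write(fd, "500\n", 4) != 4)
		perror("write");

	close(fd);
	return 0;
}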
+13 -18
lib/Kconfig.debug
··· 222 222 config FRAME_WARN 223 223 int "Warn for stack frames larger than (needs gcc 4.4)" 224 224 range 0 8192 225 - default 3072 if KASAN_EXTRA 226 225 default 2048 if GCC_PLUGIN_LATENT_ENTROPY 227 226 default 1280 if (!64BIT && PARISC) 228 227 default 1024 if (!64BIT && !PARISC) ··· 264 265 mail to the linux kernel mailing list mentioning the symbol and why 265 266 you really need it, and what the merge plan to the mainline kernel for 266 267 your module is. 267 - 268 - config PAGE_OWNER 269 - bool "Track page owner" 270 - depends on DEBUG_KERNEL && STACKTRACE_SUPPORT 271 - select DEBUG_FS 272 - select STACKTRACE 273 - select STACKDEPOT 274 - select PAGE_EXTENSION 275 - help 276 - This keeps track of what call chain is the owner of a page, may 277 - help to find bare alloc_page(s) leaks. Even if you include this 278 - feature on your build, it is disabled in default. You should pass 279 - "page_owner=on" to boot parameter in order to enable it. Eats 280 - a fair amount of memory if enabled. See tools/vm/page_owner_sort.c 281 - for user-space helper. 282 - 283 - If unsure, say N. 284 268 285 269 config DEBUG_FS 286 270 bool "Debug Filesystem" ··· 1855 1873 validating module verification). It lacks any extra dependencies, 1856 1874 and will not normally be loaded by the system unless explicitly 1857 1875 requested by name. 1876 + 1877 + If unsure, say N. 1878 + 1879 + config TEST_VMALLOC 1880 + tristate "Test module for stress/performance analysis of vmalloc allocator" 1881 + default n 1882 + depends on MMU 1883 + depends on m 1884 + help 1885 + This builds the "test_vmalloc" module that should be used for 1886 + stress and performance analysis. So, any new change for vmalloc 1887 + subsystem can be evaluated from performance and stability point 1888 + of view. 1858 1889 1859 1890 If unsure, say N. 1860 1891
-10
lib/Kconfig.kasan
··· 78 78 79 79 endchoice 80 80 81 - config KASAN_EXTRA 82 - bool "KASAN: extra checks" 83 - depends on KASAN_GENERIC && DEBUG_KERNEL && !COMPILE_TEST 84 - help 85 - This enables further checks in generic KASAN, for now it only 86 - includes the address-use-after-scope check that can lead to 87 - excessive kernel stack usage, frame size warnings and longer 88 - compile time. 89 - See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 90 - 91 81 choice 92 82 prompt "Instrumentation type" 93 83 depends on KASAN
+1
lib/Makefile
··· 60 60 obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o 61 61 obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o 62 62 obj-$(CONFIG_TEST_LKM) += test_module.o 63 + obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o 63 64 obj-$(CONFIG_TEST_OVERFLOW) += test_overflow.o 64 65 obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o 65 66 obj-$(CONFIG_TEST_SORT) += test_sort.o
+2 -1
lib/cpumask.c
··· 5 5 #include <linux/cpumask.h> 6 6 #include <linux/export.h> 7 7 #include <linux/memblock.h> 8 + #include <linux/numa.h> 8 9 9 10 /** 10 11 * cpumask_next - get the next cpu in a cpumask ··· 207 206 /* Wrap: we always want a cpu. */ 208 207 i %= num_online_cpus(); 209 208 210 - if (node == -1) { 209 + if (node == NUMA_NO_NODE) { 211 210 for_each_cpu(cpu, cpu_online_mask) 212 211 if (i-- == 0) 213 212 return cpu;
-24
lib/test_kasan.c
··· 480 480 kfree(kmem); 481 481 } 482 482 483 - static noinline void __init use_after_scope_test(void) 484 - { 485 - volatile char *volatile p; 486 - 487 - pr_info("use-after-scope on int\n"); 488 - { 489 - int local = 0; 490 - 491 - p = (char *)&local; 492 - } 493 - p[0] = 1; 494 - p[3] = 1; 495 - 496 - pr_info("use-after-scope on array\n"); 497 - { 498 - char local[1024] = {0}; 499 - 500 - p = local; 501 - } 502 - p[0] = 1; 503 - p[1023] = 1; 504 - } 505 - 506 483 static noinline void __init kasan_alloca_oob_left(void) 507 484 { 508 485 volatile int i = 10; ··· 659 682 kasan_alloca_oob_right(); 660 683 ksize_unpoisons_memory(); 661 684 copy_user_test(); 662 - use_after_scope_test(); 663 685 kmem_cache_double_free(); 664 686 kmem_cache_invalid_free(); 665 687 kasan_memchr();
+551
lib/test_vmalloc.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + /* 4 + * Test module for stress and analyze performance of vmalloc allocator. 5 + * (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com> 6 + */ 7 + #include <linux/init.h> 8 + #include <linux/kernel.h> 9 + #include <linux/module.h> 10 + #include <linux/vmalloc.h> 11 + #include <linux/random.h> 12 + #include <linux/kthread.h> 13 + #include <linux/moduleparam.h> 14 + #include <linux/completion.h> 15 + #include <linux/delay.h> 16 + #include <linux/rwsem.h> 17 + #include <linux/mm.h> 18 + 19 + #define __param(type, name, init, msg) \ 20 + static type name = init; \ 21 + module_param(name, type, 0444); \ 22 + MODULE_PARM_DESC(name, msg) \ 23 + 24 + __param(bool, single_cpu_test, false, 25 + "Use single first online CPU to run tests"); 26 + 27 + __param(bool, sequential_test_order, false, 28 + "Use sequential stress tests order"); 29 + 30 + __param(int, test_repeat_count, 1, 31 + "Set test repeat counter"); 32 + 33 + __param(int, test_loop_count, 1000000, 34 + "Set test loop counter"); 35 + 36 + __param(int, run_test_mask, INT_MAX, 37 + "Set tests specified in the mask.\n\n" 38 + "\t\tid: 1, name: fix_size_alloc_test\n" 39 + "\t\tid: 2, name: full_fit_alloc_test\n" 40 + "\t\tid: 4, name: long_busy_list_alloc_test\n" 41 + "\t\tid: 8, name: random_size_alloc_test\n" 42 + "\t\tid: 16, name: fix_align_alloc_test\n" 43 + "\t\tid: 32, name: random_size_align_alloc_test\n" 44 + "\t\tid: 64, name: align_shift_alloc_test\n" 45 + "\t\tid: 128, name: pcpu_alloc_test\n" 46 + /* Add a new test case description here. */ 47 + ); 48 + 49 + /* 50 + * Depends on single_cpu_test parameter. If it is true, then 51 + * use first online CPU to trigger a test on, otherwise go with 52 + * all online CPUs. 53 + */ 54 + static cpumask_t cpus_run_test_mask = CPU_MASK_NONE; 55 + 56 + /* 57 + * Read write semaphore for synchronization of setup 58 + * phase that is done in main thread and workers. 59 + */ 60 + static DECLARE_RWSEM(prepare_for_test_rwsem); 61 + 62 + /* 63 + * Completion tracking for worker threads. 64 + */ 65 + static DECLARE_COMPLETION(test_all_done_comp); 66 + static atomic_t test_n_undone = ATOMIC_INIT(0); 67 + 68 + static inline void 69 + test_report_one_done(void) 70 + { 71 + if (atomic_dec_and_test(&test_n_undone)) 72 + complete(&test_all_done_comp); 73 + } 74 + 75 + static int random_size_align_alloc_test(void) 76 + { 77 + unsigned long size, align, rnd; 78 + void *ptr; 79 + int i; 80 + 81 + for (i = 0; i < test_loop_count; i++) { 82 + get_random_bytes(&rnd, sizeof(rnd)); 83 + 84 + /* 85 + * Maximum 1024 pages, if PAGE_SIZE is 4096. 86 + */ 87 + align = 1 << (rnd % 23); 88 + 89 + /* 90 + * Maximum 10 pages. 91 + */ 92 + size = ((rnd % 10) + 1) * PAGE_SIZE; 93 + 94 + ptr = __vmalloc_node_range(size, align, 95 + VMALLOC_START, VMALLOC_END, 96 + GFP_KERNEL | __GFP_ZERO, 97 + PAGE_KERNEL, 98 + 0, 0, __builtin_return_address(0)); 99 + 100 + if (!ptr) 101 + return -1; 102 + 103 + vfree(ptr); 104 + } 105 + 106 + return 0; 107 + } 108 + 109 + /* 110 + * This test case is supposed to be failed. 
111 + */ 112 + static int align_shift_alloc_test(void) 113 + { 114 + unsigned long align; 115 + void *ptr; 116 + int i; 117 + 118 + for (i = 0; i < BITS_PER_LONG; i++) { 119 + align = ((unsigned long) 1) << i; 120 + 121 + ptr = __vmalloc_node_range(PAGE_SIZE, align, 122 + VMALLOC_START, VMALLOC_END, 123 + GFP_KERNEL | __GFP_ZERO, 124 + PAGE_KERNEL, 125 + 0, 0, __builtin_return_address(0)); 126 + 127 + if (!ptr) 128 + return -1; 129 + 130 + vfree(ptr); 131 + } 132 + 133 + return 0; 134 + } 135 + 136 + static int fix_align_alloc_test(void) 137 + { 138 + void *ptr; 139 + int i; 140 + 141 + for (i = 0; i < test_loop_count; i++) { 142 + ptr = __vmalloc_node_range(5 * PAGE_SIZE, 143 + THREAD_ALIGN << 1, 144 + VMALLOC_START, VMALLOC_END, 145 + GFP_KERNEL | __GFP_ZERO, 146 + PAGE_KERNEL, 147 + 0, 0, __builtin_return_address(0)); 148 + 149 + if (!ptr) 150 + return -1; 151 + 152 + vfree(ptr); 153 + } 154 + 155 + return 0; 156 + } 157 + 158 + static int random_size_alloc_test(void) 159 + { 160 + unsigned int n; 161 + void *p; 162 + int i; 163 + 164 + for (i = 0; i < test_loop_count; i++) { 165 + get_random_bytes(&n, sizeof(i)); 166 + n = (n % 100) + 1; 167 + 168 + p = vmalloc(n * PAGE_SIZE); 169 + 170 + if (!p) 171 + return -1; 172 + 173 + *((__u8 *)p) = 1; 174 + vfree(p); 175 + } 176 + 177 + return 0; 178 + } 179 + 180 + static int long_busy_list_alloc_test(void) 181 + { 182 + void *ptr_1, *ptr_2; 183 + void **ptr; 184 + int rv = -1; 185 + int i; 186 + 187 + ptr = vmalloc(sizeof(void *) * 15000); 188 + if (!ptr) 189 + return rv; 190 + 191 + for (i = 0; i < 15000; i++) 192 + ptr[i] = vmalloc(1 * PAGE_SIZE); 193 + 194 + for (i = 0; i < test_loop_count; i++) { 195 + ptr_1 = vmalloc(100 * PAGE_SIZE); 196 + if (!ptr_1) 197 + goto leave; 198 + 199 + ptr_2 = vmalloc(1 * PAGE_SIZE); 200 + if (!ptr_2) { 201 + vfree(ptr_1); 202 + goto leave; 203 + } 204 + 205 + *((__u8 *)ptr_1) = 0; 206 + *((__u8 *)ptr_2) = 1; 207 + 208 + vfree(ptr_1); 209 + vfree(ptr_2); 210 + } 211 + 212 + /* Success */ 213 + rv = 0; 214 + 215 + leave: 216 + for (i = 0; i < 15000; i++) 217 + vfree(ptr[i]); 218 + 219 + vfree(ptr); 220 + return rv; 221 + } 222 + 223 + static int full_fit_alloc_test(void) 224 + { 225 + void **ptr, **junk_ptr, *tmp; 226 + int junk_length; 227 + int rv = -1; 228 + int i; 229 + 230 + junk_length = fls(num_online_cpus()); 231 + junk_length *= (32 * 1024 * 1024 / PAGE_SIZE); 232 + 233 + ptr = vmalloc(sizeof(void *) * junk_length); 234 + if (!ptr) 235 + return rv; 236 + 237 + junk_ptr = vmalloc(sizeof(void *) * junk_length); 238 + if (!junk_ptr) { 239 + vfree(ptr); 240 + return rv; 241 + } 242 + 243 + for (i = 0; i < junk_length; i++) { 244 + ptr[i] = vmalloc(1 * PAGE_SIZE); 245 + junk_ptr[i] = vmalloc(1 * PAGE_SIZE); 246 + } 247 + 248 + for (i = 0; i < junk_length; i++) 249 + vfree(junk_ptr[i]); 250 + 251 + for (i = 0; i < test_loop_count; i++) { 252 + tmp = vmalloc(1 * PAGE_SIZE); 253 + 254 + if (!tmp) 255 + goto error; 256 + 257 + *((__u8 *)tmp) = 1; 258 + vfree(tmp); 259 + } 260 + 261 + /* Success */ 262 + rv = 0; 263 + 264 + error: 265 + for (i = 0; i < junk_length; i++) 266 + vfree(ptr[i]); 267 + 268 + vfree(ptr); 269 + vfree(junk_ptr); 270 + 271 + return rv; 272 + } 273 + 274 + static int fix_size_alloc_test(void) 275 + { 276 + void *ptr; 277 + int i; 278 + 279 + for (i = 0; i < test_loop_count; i++) { 280 + ptr = vmalloc(3 * PAGE_SIZE); 281 + 282 + if (!ptr) 283 + return -1; 284 + 285 + *((__u8 *)ptr) = 0; 286 + 287 + vfree(ptr); 288 + } 289 + 290 + return 0; 291 + } 292 + 293 + static int 294 + 
pcpu_alloc_test(void) 295 + { 296 + int rv = 0; 297 + #ifndef CONFIG_NEED_PER_CPU_KM 298 + void __percpu **pcpu; 299 + size_t size, align; 300 + int i; 301 + 302 + pcpu = vmalloc(sizeof(void __percpu *) * 35000); 303 + if (!pcpu) 304 + return -1; 305 + 306 + for (i = 0; i < 35000; i++) { 307 + unsigned int r; 308 + 309 + get_random_bytes(&r, sizeof(i)); 310 + size = (r % (PAGE_SIZE / 4)) + 1; 311 + 312 + /* 313 + * Maximum PAGE_SIZE 314 + */ 315 + get_random_bytes(&r, sizeof(i)); 316 + align = 1 << ((i % 11) + 1); 317 + 318 + pcpu[i] = __alloc_percpu(size, align); 319 + if (!pcpu[i]) 320 + rv = -1; 321 + } 322 + 323 + for (i = 0; i < 35000; i++) 324 + free_percpu(pcpu[i]); 325 + 326 + vfree(pcpu); 327 + #endif 328 + return rv; 329 + } 330 + 331 + struct test_case_desc { 332 + const char *test_name; 333 + int (*test_func)(void); 334 + }; 335 + 336 + static struct test_case_desc test_case_array[] = { 337 + { "fix_size_alloc_test", fix_size_alloc_test }, 338 + { "full_fit_alloc_test", full_fit_alloc_test }, 339 + { "long_busy_list_alloc_test", long_busy_list_alloc_test }, 340 + { "random_size_alloc_test", random_size_alloc_test }, 341 + { "fix_align_alloc_test", fix_align_alloc_test }, 342 + { "random_size_align_alloc_test", random_size_align_alloc_test }, 343 + { "align_shift_alloc_test", align_shift_alloc_test }, 344 + { "pcpu_alloc_test", pcpu_alloc_test }, 345 + /* Add a new test case here. */ 346 + }; 347 + 348 + struct test_case_data { 349 + int test_failed; 350 + int test_passed; 351 + u64 time; 352 + }; 353 + 354 + /* Split it to get rid of: WARNING: line over 80 characters */ 355 + static struct test_case_data 356 + per_cpu_test_data[NR_CPUS][ARRAY_SIZE(test_case_array)]; 357 + 358 + static struct test_driver { 359 + struct task_struct *task; 360 + unsigned long start; 361 + unsigned long stop; 362 + int cpu; 363 + } per_cpu_test_driver[NR_CPUS]; 364 + 365 + static void shuffle_array(int *arr, int n) 366 + { 367 + unsigned int rnd; 368 + int i, j, x; 369 + 370 + for (i = n - 1; i > 0; i--) { 371 + get_random_bytes(&rnd, sizeof(rnd)); 372 + 373 + /* Cut the range. */ 374 + j = rnd % i; 375 + 376 + /* Swap indexes. */ 377 + x = arr[i]; 378 + arr[i] = arr[j]; 379 + arr[j] = x; 380 + } 381 + } 382 + 383 + static int test_func(void *private) 384 + { 385 + struct test_driver *t = private; 386 + cpumask_t newmask = CPU_MASK_NONE; 387 + int random_array[ARRAY_SIZE(test_case_array)]; 388 + int index, i, j, ret; 389 + ktime_t kt; 390 + u64 delta; 391 + 392 + cpumask_set_cpu(t->cpu, &newmask); 393 + set_cpus_allowed_ptr(current, &newmask); 394 + 395 + for (i = 0; i < ARRAY_SIZE(test_case_array); i++) 396 + random_array[i] = i; 397 + 398 + if (!sequential_test_order) 399 + shuffle_array(random_array, ARRAY_SIZE(test_case_array)); 400 + 401 + /* 402 + * Block until initialization is done. 403 + */ 404 + down_read(&prepare_for_test_rwsem); 405 + 406 + t->start = get_cycles(); 407 + for (i = 0; i < ARRAY_SIZE(test_case_array); i++) { 408 + index = random_array[i]; 409 + 410 + /* 411 + * Skip tests if run_test_mask has been specified. 412 + */ 413 + if (!((run_test_mask & (1 << index)) >> index)) 414 + continue; 415 + 416 + kt = ktime_get(); 417 + for (j = 0; j < test_repeat_count; j++) { 418 + ret = test_case_array[index].test_func(); 419 + if (!ret) 420 + per_cpu_test_data[t->cpu][index].test_passed++; 421 + else 422 + per_cpu_test_data[t->cpu][index].test_failed++; 423 + } 424 + 425 + /* 426 + * Take an average time that test took. 
427 + */ 428 + delta = (u64) ktime_us_delta(ktime_get(), kt); 429 + do_div(delta, (u32) test_repeat_count); 430 + 431 + per_cpu_test_data[t->cpu][index].time = delta; 432 + } 433 + t->stop = get_cycles(); 434 + 435 + up_read(&prepare_for_test_rwsem); 436 + test_report_one_done(); 437 + 438 + /* 439 + * Wait for the kthread_stop() call. 440 + */ 441 + while (!kthread_should_stop()) 442 + msleep(10); 443 + 444 + return 0; 445 + } 446 + 447 + static void 448 + init_test_configurtion(void) 449 + { 450 + /* 451 + * Reset all data of all CPUs. 452 + */ 453 + memset(per_cpu_test_data, 0, sizeof(per_cpu_test_data)); 454 + 455 + if (single_cpu_test) 456 + cpumask_set_cpu(cpumask_first(cpu_online_mask), 457 + &cpus_run_test_mask); 458 + else 459 + cpumask_and(&cpus_run_test_mask, cpu_online_mask, 460 + cpu_online_mask); 461 + 462 + if (test_repeat_count <= 0) 463 + test_repeat_count = 1; 464 + 465 + if (test_loop_count <= 0) 466 + test_loop_count = 1; 467 + } 468 + 469 + static void do_concurrent_test(void) 470 + { 471 + int cpu, ret; 472 + 473 + /* 474 + * Set some basic configurations plus sanity check. 475 + */ 476 + init_test_configurtion(); 477 + 478 + /* 479 + * Put on hold all workers. 480 + */ 481 + down_write(&prepare_for_test_rwsem); 482 + 483 + for_each_cpu(cpu, &cpus_run_test_mask) { 484 + struct test_driver *t = &per_cpu_test_driver[cpu]; 485 + 486 + t->cpu = cpu; 487 + t->task = kthread_run(test_func, t, "vmalloc_test/%d", cpu); 488 + 489 + if (!IS_ERR(t->task)) 490 + /* Success. */ 491 + atomic_inc(&test_n_undone); 492 + else 493 + pr_err("Failed to start kthread for %d CPU\n", cpu); 494 + } 495 + 496 + /* 497 + * Now let the workers do their job. 498 + */ 499 + up_write(&prepare_for_test_rwsem); 500 + 501 + /* 502 + * Sleep quiet until all workers are done with 1 second 503 + * interval. Since the test can take a lot of time we 504 + * can run into a stack trace of the hung task. That is 505 + * why we go with completion_timeout and HZ value. 506 + */ 507 + do { 508 + ret = wait_for_completion_timeout(&test_all_done_comp, HZ); 509 + } while (!ret); 510 + 511 + for_each_cpu(cpu, &cpus_run_test_mask) { 512 + struct test_driver *t = &per_cpu_test_driver[cpu]; 513 + int i; 514 + 515 + if (!IS_ERR(t->task)) 516 + kthread_stop(t->task); 517 + 518 + for (i = 0; i < ARRAY_SIZE(test_case_array); i++) { 519 + if (!((run_test_mask & (1 << i)) >> i)) 520 + continue; 521 + 522 + pr_info( 523 + "Summary: %s passed: %d failed: %d repeat: %d loops: %d avg: %llu usec\n", 524 + test_case_array[i].test_name, 525 + per_cpu_test_data[cpu][i].test_passed, 526 + per_cpu_test_data[cpu][i].test_failed, 527 + test_repeat_count, test_loop_count, 528 + per_cpu_test_data[cpu][i].time); 529 + } 530 + 531 + pr_info("All test took CPU%d=%lu cycles\n", 532 + cpu, t->stop - t->start); 533 + } 534 + } 535 + 536 + static int vmalloc_test_init(void) 537 + { 538 + do_concurrent_test(); 539 + return -EAGAIN; /* Fail will directly unload the module */ 540 + } 541 + 542 + static void vmalloc_test_exit(void) 543 + { 544 + } 545 + 546 + module_init(vmalloc_test_init) 547 + module_exit(vmalloc_test_exit) 548 + 549 + MODULE_LICENSE("GPL"); 550 + MODULE_AUTHOR("Uladzislau Rezki"); 551 + MODULE_DESCRIPTION("vmalloc test module");
+17
mm/Kconfig.debug
··· 39 39 Enable debug page memory allocations by default? This value 40 40 can be overridden by debug_pagealloc=off|on. 41 41 42 + config PAGE_OWNER 43 + bool "Track page owner" 44 + depends on DEBUG_KERNEL && STACKTRACE_SUPPORT 45 + select DEBUG_FS 46 + select STACKTRACE 47 + select STACKDEPOT 48 + select PAGE_EXTENSION 49 + help 50 + This keeps track of what call chain is the owner of a page, may 51 + help to find bare alloc_page(s) leaks. Even if you include this 52 + feature on your build, it is disabled in default. You should pass 53 + "page_owner=on" to boot parameter in order to enable it. Eats 54 + a fair amount of memory if enabled. See tools/vm/page_owner_sort.c 55 + for user-space helper. 56 + 57 + If unsure, say N. 58 + 42 59 config PAGE_POISONING 43 60 bool "Poison pages after freeing" 44 61 select PAGE_POISONING_NO_SANITY if HIBERNATION
+3 -1
mm/cma.c
··· 353 353 354 354 ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); 355 355 if (ret) 356 - goto err; 356 + goto free_mem; 357 357 358 358 pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, 359 359 &base); 360 360 return 0; 361 361 362 + free_mem: 363 + memblock_free(base, size); 362 364 err: 363 365 pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); 364 366 return ret;
+4 -7
mm/cma_debug.c
··· 21 21 unsigned long n; 22 22 }; 23 23 24 - static struct dentry *cma_debugfs_root; 25 - 26 24 static int cma_debugfs_get(void *data, u64 *val) 27 25 { 28 26 unsigned long *p = data; ··· 160 162 } 161 163 DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); 162 164 163 - static void cma_debugfs_add_one(struct cma *cma, int idx) 165 + static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) 164 166 { 165 167 struct dentry *tmp; 166 168 char name[16]; ··· 168 170 169 171 scnprintf(name, sizeof(name), "cma-%s", cma->name); 170 172 171 - tmp = debugfs_create_dir(name, cma_debugfs_root); 173 + tmp = debugfs_create_dir(name, root_dentry); 172 174 173 175 debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); 174 176 debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); ··· 186 188 187 189 static int __init cma_debugfs_init(void) 188 190 { 191 + struct dentry *cma_debugfs_root; 189 192 int i; 190 193 191 194 cma_debugfs_root = debugfs_create_dir("cma", NULL); 192 - if (!cma_debugfs_root) 193 - return -ENOMEM; 194 195 195 196 for (i = 0; i < cma_area_count; i++) 196 - cma_debugfs_add_one(&cma_areas[i], i); 197 + cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root); 197 198 198 199 return 0; 199 200 }
+804 -235
mm/compaction.c
··· 66 66 return high_pfn; 67 67 } 68 68 69 - static void map_pages(struct list_head *list) 69 + static void split_map_pages(struct list_head *list) 70 70 { 71 71 unsigned int i, order, nr_pages; 72 72 struct page *page, *next; ··· 237 237 return false; 238 238 } 239 239 240 + static bool 241 + __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, 242 + bool check_target) 243 + { 244 + struct page *page = pfn_to_online_page(pfn); 245 + struct page *end_page; 246 + unsigned long block_pfn; 247 + 248 + if (!page) 249 + return false; 250 + if (zone != page_zone(page)) 251 + return false; 252 + if (pageblock_skip_persistent(page)) 253 + return false; 254 + 255 + /* 256 + * If skip is already cleared do no further checking once the 257 + * restart points have been set. 258 + */ 259 + if (check_source && check_target && !get_pageblock_skip(page)) 260 + return true; 261 + 262 + /* 263 + * If clearing skip for the target scanner, do not select a 264 + * non-movable pageblock as the starting point. 265 + */ 266 + if (!check_source && check_target && 267 + get_pageblock_migratetype(page) != MIGRATE_MOVABLE) 268 + return false; 269 + 270 + /* 271 + * Only clear the hint if a sample indicates there is either a 272 + * free page or an LRU page in the block. One or other condition 273 + * is necessary for the block to be a migration source/target. 274 + */ 275 + block_pfn = pageblock_start_pfn(pfn); 276 + pfn = max(block_pfn, zone->zone_start_pfn); 277 + page = pfn_to_page(pfn); 278 + if (zone != page_zone(page)) 279 + return false; 280 + pfn = block_pfn + pageblock_nr_pages; 281 + pfn = min(pfn, zone_end_pfn(zone)); 282 + end_page = pfn_to_page(pfn); 283 + 284 + do { 285 + if (pfn_valid_within(pfn)) { 286 + if (check_source && PageLRU(page)) { 287 + clear_pageblock_skip(page); 288 + return true; 289 + } 290 + 291 + if (check_target && PageBuddy(page)) { 292 + clear_pageblock_skip(page); 293 + return true; 294 + } 295 + } 296 + 297 + page += (1 << PAGE_ALLOC_COSTLY_ORDER); 298 + pfn += (1 << PAGE_ALLOC_COSTLY_ORDER); 299 + } while (page < end_page); 300 + 301 + return false; 302 + } 303 + 240 304 /* 241 305 * This function is called to clear all cached information on pageblocks that 242 306 * should be skipped for page isolation when the migrate and free page scanner ··· 308 244 */ 309 245 static void __reset_isolation_suitable(struct zone *zone) 310 246 { 311 - unsigned long start_pfn = zone->zone_start_pfn; 312 - unsigned long end_pfn = zone_end_pfn(zone); 313 - unsigned long pfn; 247 + unsigned long migrate_pfn = zone->zone_start_pfn; 248 + unsigned long free_pfn = zone_end_pfn(zone); 249 + unsigned long reset_migrate = free_pfn; 250 + unsigned long reset_free = migrate_pfn; 251 + bool source_set = false; 252 + bool free_set = false; 253 + 254 + if (!zone->compact_blockskip_flush) 255 + return; 314 256 315 257 zone->compact_blockskip_flush = false; 316 258 317 - /* Walk the zone and mark every pageblock as suitable for isolation */ 318 - for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 319 - struct page *page; 320 - 259 + /* 260 + * Walk the zone and update pageblock skip information. Source looks 261 + * for PageLRU while target looks for PageBuddy. When the scanner 262 + * is found, both PageBuddy and PageLRU are checked as the pageblock 263 + * is suitable as both source and target. 
264 + */ 265 + for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages, 266 + free_pfn -= pageblock_nr_pages) { 321 267 cond_resched(); 322 268 323 - page = pfn_to_online_page(pfn); 324 - if (!page) 325 - continue; 326 - if (zone != page_zone(page)) 327 - continue; 328 - if (pageblock_skip_persistent(page)) 329 - continue; 269 + /* Update the migrate PFN */ 270 + if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) && 271 + migrate_pfn < reset_migrate) { 272 + source_set = true; 273 + reset_migrate = migrate_pfn; 274 + zone->compact_init_migrate_pfn = reset_migrate; 275 + zone->compact_cached_migrate_pfn[0] = reset_migrate; 276 + zone->compact_cached_migrate_pfn[1] = reset_migrate; 277 + } 330 278 331 - clear_pageblock_skip(page); 279 + /* Update the free PFN */ 280 + if (__reset_isolation_pfn(zone, free_pfn, free_set, true) && 281 + free_pfn > reset_free) { 282 + free_set = true; 283 + reset_free = free_pfn; 284 + zone->compact_init_free_pfn = reset_free; 285 + zone->compact_cached_free_pfn = reset_free; 286 + } 332 287 } 333 288 334 - reset_cached_positions(zone); 289 + /* Leave no distance if no suitable block was reset */ 290 + if (reset_migrate >= reset_free) { 291 + zone->compact_cached_migrate_pfn[0] = migrate_pfn; 292 + zone->compact_cached_migrate_pfn[1] = migrate_pfn; 293 + zone->compact_cached_free_pfn = free_pfn; 294 + } 335 295 } 336 296 337 297 void reset_isolation_suitable(pg_data_t *pgdat) ··· 374 286 } 375 287 376 288 /* 289 + * Sets the pageblock skip bit if it was clear. Note that this is a hint as 290 + * locks are not required for read/writers. Returns true if it was already set. 291 + */ 292 + static bool test_and_set_skip(struct compact_control *cc, struct page *page, 293 + unsigned long pfn) 294 + { 295 + bool skip; 296 + 297 + /* Do no update if skip hint is being ignored */ 298 + if (cc->ignore_skip_hint) 299 + return false; 300 + 301 + if (!IS_ALIGNED(pfn, pageblock_nr_pages)) 302 + return false; 303 + 304 + skip = get_pageblock_skip(page); 305 + if (!skip && !cc->no_set_skip_hint) 306 + set_pageblock_skip(page); 307 + 308 + return skip; 309 + } 310 + 311 + static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) 312 + { 313 + struct zone *zone = cc->zone; 314 + 315 + pfn = pageblock_end_pfn(pfn); 316 + 317 + /* Set for isolation rather than compaction */ 318 + if (cc->no_set_skip_hint) 319 + return; 320 + 321 + if (pfn > zone->compact_cached_migrate_pfn[0]) 322 + zone->compact_cached_migrate_pfn[0] = pfn; 323 + if (cc->mode != MIGRATE_ASYNC && 324 + pfn > zone->compact_cached_migrate_pfn[1]) 325 + zone->compact_cached_migrate_pfn[1] = pfn; 326 + } 327 + 328 + /* 377 329 * If no pages were isolated then mark this pageblock to be skipped in the 378 330 * future. The information is later cleared by __reset_isolation_suitable(). 
379 331 */ 380 332 static void update_pageblock_skip(struct compact_control *cc, 381 - struct page *page, unsigned long nr_isolated, 382 - bool migrate_scanner) 333 + struct page *page, unsigned long pfn) 383 334 { 384 335 struct zone *zone = cc->zone; 385 - unsigned long pfn; 386 336 387 337 if (cc->no_set_skip_hint) 388 338 return; ··· 428 302 if (!page) 429 303 return; 430 304 431 - if (nr_isolated) 432 - return; 433 - 434 305 set_pageblock_skip(page); 435 306 436 - pfn = page_to_pfn(page); 437 - 438 307 /* Update where async and sync compaction should restart */ 439 - if (migrate_scanner) { 440 - if (pfn > zone->compact_cached_migrate_pfn[0]) 441 - zone->compact_cached_migrate_pfn[0] = pfn; 442 - if (cc->mode != MIGRATE_ASYNC && 443 - pfn > zone->compact_cached_migrate_pfn[1]) 444 - zone->compact_cached_migrate_pfn[1] = pfn; 445 - } else { 446 - if (pfn < zone->compact_cached_free_pfn) 447 - zone->compact_cached_free_pfn = pfn; 448 - } 308 + if (pfn < zone->compact_cached_free_pfn) 309 + zone->compact_cached_free_pfn = pfn; 449 310 } 450 311 #else 451 312 static inline bool isolation_suitable(struct compact_control *cc, ··· 447 334 } 448 335 449 336 static inline void update_pageblock_skip(struct compact_control *cc, 450 - struct page *page, unsigned long nr_isolated, 451 - bool migrate_scanner) 337 + struct page *page, unsigned long pfn) 452 338 { 339 + } 340 + 341 + static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) 342 + { 343 + } 344 + 345 + static bool test_and_set_skip(struct compact_control *cc, struct page *page, 346 + unsigned long pfn) 347 + { 348 + return false; 453 349 } 454 350 #endif /* CONFIG_COMPACTION */ 455 351 456 352 /* 457 353 * Compaction requires the taking of some coarse locks that are potentially 458 - * very heavily contended. For async compaction, back out if the lock cannot 459 - * be taken immediately. For sync compaction, spin on the lock if needed. 354 + * very heavily contended. For async compaction, trylock and record if the 355 + * lock is contended. The lock will still be acquired but compaction will 356 + * abort when the current block is finished regardless of success rate. 357 + * Sync compaction acquires the lock. 460 358 * 461 - * Returns true if the lock is held 462 - * Returns false if the lock is not held and compaction should abort 359 + * Always returns true which makes it easier to track lock state in callers. 
463 360 */ 464 - static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, 361 + static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, 465 362 struct compact_control *cc) 466 363 { 467 - if (cc->mode == MIGRATE_ASYNC) { 468 - if (!spin_trylock_irqsave(lock, *flags)) { 469 - cc->contended = true; 470 - return false; 471 - } 472 - } else { 473 - spin_lock_irqsave(lock, *flags); 364 + /* Track if the lock is contended in async mode */ 365 + if (cc->mode == MIGRATE_ASYNC && !cc->contended) { 366 + if (spin_trylock_irqsave(lock, *flags)) 367 + return true; 368 + 369 + cc->contended = true; 474 370 } 475 371 372 + spin_lock_irqsave(lock, *flags); 476 373 return true; 477 374 } 478 375 ··· 514 391 return true; 515 392 } 516 393 517 - if (need_resched()) { 518 - if (cc->mode == MIGRATE_ASYNC) { 519 - cc->contended = true; 520 - return true; 521 - } 522 - cond_resched(); 523 - } 524 - 525 - return false; 526 - } 527 - 528 - /* 529 - * Aside from avoiding lock contention, compaction also periodically checks 530 - * need_resched() and either schedules in sync compaction or aborts async 531 - * compaction. This is similar to what compact_unlock_should_abort() does, but 532 - * is used where no lock is concerned. 533 - * 534 - * Returns false when no scheduling was needed, or sync compaction scheduled. 535 - * Returns true when async compaction should abort. 536 - */ 537 - static inline bool compact_should_abort(struct compact_control *cc) 538 - { 539 - /* async compaction aborts if contended */ 540 - if (need_resched()) { 541 - if (cc->mode == MIGRATE_ASYNC) { 542 - cc->contended = true; 543 - return true; 544 - } 545 - 546 - cond_resched(); 547 - } 394 + cond_resched(); 548 395 549 396 return false; 550 397 } ··· 528 435 unsigned long *start_pfn, 529 436 unsigned long end_pfn, 530 437 struct list_head *freelist, 438 + unsigned int stride, 531 439 bool strict) 532 440 { 533 441 int nr_scanned = 0, total_isolated = 0; 534 - struct page *cursor, *valid_page = NULL; 442 + struct page *cursor; 535 443 unsigned long flags = 0; 536 444 bool locked = false; 537 445 unsigned long blockpfn = *start_pfn; 538 446 unsigned int order; 539 447 448 + /* Strict mode is for isolation, speed is secondary */ 449 + if (strict) 450 + stride = 1; 451 + 540 452 cursor = pfn_to_page(blockpfn); 541 453 542 454 /* Isolate free pages. */ 543 - for (; blockpfn < end_pfn; blockpfn++, cursor++) { 455 + for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) { 544 456 int isolated; 545 457 struct page *page = cursor; 546 458 ··· 562 464 nr_scanned++; 563 465 if (!pfn_valid_within(blockpfn)) 564 466 goto isolate_fail; 565 - 566 - if (!valid_page) 567 - valid_page = page; 568 467 569 468 /* 570 469 * For compound pages such as THP and hugetlbfs, we can save ··· 590 495 * recheck as well. 591 496 */ 592 497 if (!locked) { 593 - /* 594 - * The zone lock must be held to isolate freepages. 595 - * Unfortunately this is a very coarse lock and can be 596 - * heavily contended if there are parallel allocations 597 - * or parallel compactions. For async compaction do not 598 - * spin on the lock and we acquire the lock as late as 599 - * possible. 
600 - */ 601 - locked = compact_trylock_irqsave(&cc->zone->lock, 498 + locked = compact_lock_irqsave(&cc->zone->lock, 602 499 &flags, cc); 603 - if (!locked) 604 - break; 605 500 606 501 /* Recheck this is a buddy page under lock */ 607 502 if (!PageBuddy(page)) ··· 650 565 if (strict && blockpfn < end_pfn) 651 566 total_isolated = 0; 652 567 653 - /* Update the pageblock-skip if the whole pageblock was scanned */ 654 - if (blockpfn == end_pfn) 655 - update_pageblock_skip(cc, valid_page, total_isolated, false); 656 - 657 568 cc->total_free_scanned += nr_scanned; 658 569 if (total_isolated) 659 570 count_compact_events(COMPACTISOLATED, total_isolated); ··· 707 626 break; 708 627 709 628 isolated = isolate_freepages_block(cc, &isolate_start_pfn, 710 - block_end_pfn, &freelist, true); 629 + block_end_pfn, &freelist, 0, true); 711 630 712 631 /* 713 632 * In strict mode, isolate_freepages_block() returns 0 if ··· 725 644 } 726 645 727 646 /* __isolate_free_page() does not map the pages */ 728 - map_pages(&freelist); 647 + split_map_pages(&freelist); 729 648 730 649 if (pfn < end_pfn) { 731 650 /* Loop terminated early, cleanup. */ ··· 738 657 } 739 658 740 659 /* Similar to reclaim, but different enough that they don't share logic */ 741 - static bool too_many_isolated(struct zone *zone) 660 + static bool too_many_isolated(pg_data_t *pgdat) 742 661 { 743 662 unsigned long active, inactive, isolated; 744 663 745 - inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) + 746 - node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON); 747 - active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) + 748 - node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON); 749 - isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) + 750 - node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON); 664 + inactive = node_page_state(pgdat, NR_INACTIVE_FILE) + 665 + node_page_state(pgdat, NR_INACTIVE_ANON); 666 + active = node_page_state(pgdat, NR_ACTIVE_FILE) + 667 + node_page_state(pgdat, NR_ACTIVE_ANON); 668 + isolated = node_page_state(pgdat, NR_ISOLATED_FILE) + 669 + node_page_state(pgdat, NR_ISOLATED_ANON); 751 670 752 671 return isolated > (inactive + active) / 2; 753 672 } ··· 774 693 isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, 775 694 unsigned long end_pfn, isolate_mode_t isolate_mode) 776 695 { 777 - struct zone *zone = cc->zone; 696 + pg_data_t *pgdat = cc->zone->zone_pgdat; 778 697 unsigned long nr_scanned = 0, nr_isolated = 0; 779 698 struct lruvec *lruvec; 780 699 unsigned long flags = 0; ··· 783 702 unsigned long start_pfn = low_pfn; 784 703 bool skip_on_failure = false; 785 704 unsigned long next_skip_pfn = 0; 705 + bool skip_updated = false; 786 706 787 707 /* 788 708 * Ensure that there are not too many pages isolated from the LRU 789 709 * list by either parallel reclaimers or compaction. If there are, 790 710 * delay for some time until fewer pages are isolated 791 711 */ 792 - while (unlikely(too_many_isolated(zone))) { 712 + while (unlikely(too_many_isolated(pgdat))) { 793 713 /* async migration should just abort */ 794 714 if (cc->mode == MIGRATE_ASYNC) 795 715 return 0; ··· 801 719 return 0; 802 720 } 803 721 804 - if (compact_should_abort(cc)) 805 - return 0; 722 + cond_resched(); 806 723 807 724 if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { 808 725 skip_on_failure = true; ··· 839 758 * if contended. 
840 759 */ 841 760 if (!(low_pfn % SWAP_CLUSTER_MAX) 842 - && compact_unlock_should_abort(zone_lru_lock(zone), flags, 843 - &locked, cc)) 761 + && compact_unlock_should_abort(&pgdat->lru_lock, 762 + flags, &locked, cc)) 844 763 break; 845 764 846 765 if (!pfn_valid_within(low_pfn)) ··· 849 768 850 769 page = pfn_to_page(low_pfn); 851 770 852 - if (!valid_page) 771 + /* 772 + * Check if the pageblock has already been marked skipped. 773 + * Only the aligned PFN is checked as the caller isolates 774 + * COMPACT_CLUSTER_MAX at a time so the second call must 775 + * not falsely conclude that the block should be skipped. 776 + */ 777 + if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) { 778 + if (!cc->ignore_skip_hint && get_pageblock_skip(page)) { 779 + low_pfn = end_pfn; 780 + goto isolate_abort; 781 + } 853 782 valid_page = page; 783 + } 854 784 855 785 /* 856 786 * Skip if free. We read page order here without zone lock ··· 910 818 if (unlikely(__PageMovable(page)) && 911 819 !PageIsolated(page)) { 912 820 if (locked) { 913 - spin_unlock_irqrestore(zone_lru_lock(zone), 821 + spin_unlock_irqrestore(&pgdat->lru_lock, 914 822 flags); 915 823 locked = false; 916 824 } ··· 940 848 941 849 /* If we already hold the lock, we can skip some rechecking */ 942 850 if (!locked) { 943 - locked = compact_trylock_irqsave(zone_lru_lock(zone), 851 + locked = compact_lock_irqsave(&pgdat->lru_lock, 944 852 &flags, cc); 945 - if (!locked) 946 - break; 853 + 854 + /* Try get exclusive access under lock */ 855 + if (!skip_updated) { 856 + skip_updated = true; 857 + if (test_and_set_skip(cc, page, low_pfn)) 858 + goto isolate_abort; 859 + } 947 860 948 861 /* Recheck PageLRU and PageCompound under lock */ 949 862 if (!PageLRU(page)) ··· 965 868 } 966 869 } 967 870 968 - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 871 + lruvec = mem_cgroup_page_lruvec(page, pgdat); 969 872 970 873 /* Try isolate the page */ 971 874 if (__isolate_lru_page(page, isolate_mode) != 0) ··· 984 887 nr_isolated++; 985 888 986 889 /* 987 - * Record where we could have freed pages by migration and not 988 - * yet flushed them to buddy allocator. 989 - * - this is the lowest page that was isolated and likely be 990 - * then freed by migration. 890 + * Avoid isolating too much unless this block is being 891 + * rescanned (e.g. dirty/writeback pages, parallel allocation) 892 + * or a lock is contended. For contention, isolate quickly to 893 + * potentially remove one source of contention. 
991 894 */ 992 - if (!cc->last_migrated_pfn) 993 - cc->last_migrated_pfn = low_pfn; 994 - 995 - /* Avoid isolating too much */ 996 - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { 895 + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX && 896 + !cc->rescan && !cc->contended) { 997 897 ++low_pfn; 998 898 break; 999 899 } ··· 1007 913 */ 1008 914 if (nr_isolated) { 1009 915 if (locked) { 1010 - spin_unlock_irqrestore(zone_lru_lock(zone), flags); 916 + spin_unlock_irqrestore(&pgdat->lru_lock, flags); 1011 917 locked = false; 1012 918 } 1013 919 putback_movable_pages(&cc->migratepages); 1014 920 cc->nr_migratepages = 0; 1015 - cc->last_migrated_pfn = 0; 1016 921 nr_isolated = 0; 1017 922 } 1018 923 ··· 1032 939 if (unlikely(low_pfn > end_pfn)) 1033 940 low_pfn = end_pfn; 1034 941 942 + isolate_abort: 1035 943 if (locked) 1036 - spin_unlock_irqrestore(zone_lru_lock(zone), flags); 944 + spin_unlock_irqrestore(&pgdat->lru_lock, flags); 1037 945 1038 946 /* 1039 - * Update the pageblock-skip information and cached scanner pfn, 1040 - * if the whole pageblock was scanned without isolating any page. 947 + * Updated the cached scanner pfn once the pageblock has been scanned 948 + * Pages will either be migrated in which case there is no point 949 + * scanning in the near future or migration failed in which case the 950 + * failure reason may persist. The block is marked for skipping if 951 + * there were no pages isolated in the block or if the block is 952 + * rescanned twice in a row. 1041 953 */ 1042 - if (low_pfn == end_pfn) 1043 - update_pageblock_skip(cc, valid_page, nr_isolated, true); 954 + if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { 955 + if (valid_page && !skip_updated) 956 + set_pageblock_skip(valid_page); 957 + update_cached_migrate(cc, low_pfn); 958 + } 1044 959 1045 960 trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, 1046 961 nr_scanned, nr_isolated); ··· 1114 1013 { 1115 1014 int block_mt; 1116 1015 1016 + if (pageblock_skip_persistent(page)) 1017 + return false; 1018 + 1117 1019 if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) 1118 1020 return true; 1119 1021 ··· 1154 1050 return false; 1155 1051 } 1156 1052 1053 + static inline unsigned int 1054 + freelist_scan_limit(struct compact_control *cc) 1055 + { 1056 + return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1; 1057 + } 1058 + 1157 1059 /* 1158 1060 * Test whether the free scanner has reached the same or lower pageblock than 1159 1061 * the migration scanner, and compaction should thus terminate. ··· 1168 1058 { 1169 1059 return (cc->free_pfn >> pageblock_order) 1170 1060 <= (cc->migrate_pfn >> pageblock_order); 1061 + } 1062 + 1063 + /* 1064 + * Used when scanning for a suitable migration target which scans freelists 1065 + * in reverse. Reorders the list such as the unscanned pages are scanned 1066 + * first on the next iteration of the free scanner 1067 + */ 1068 + static void 1069 + move_freelist_head(struct list_head *freelist, struct page *freepage) 1070 + { 1071 + LIST_HEAD(sublist); 1072 + 1073 + if (!list_is_last(freelist, &freepage->lru)) { 1074 + list_cut_before(&sublist, freelist, &freepage->lru); 1075 + if (!list_empty(&sublist)) 1076 + list_splice_tail(&sublist, freelist); 1077 + } 1078 + } 1079 + 1080 + /* 1081 + * Similar to move_freelist_head except used by the migration scanner 1082 + * when scanning forward. It's possible for these list operations to 1083 + * move against each other if they search the free list exactly in 1084 + * lockstep. 
1085 + */ 1086 + static void 1087 + move_freelist_tail(struct list_head *freelist, struct page *freepage) 1088 + { 1089 + LIST_HEAD(sublist); 1090 + 1091 + if (!list_is_first(freelist, &freepage->lru)) { 1092 + list_cut_position(&sublist, freelist, &freepage->lru); 1093 + if (!list_empty(&sublist)) 1094 + list_splice_tail(&sublist, freelist); 1095 + } 1096 + } 1097 + 1098 + static void 1099 + fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated) 1100 + { 1101 + unsigned long start_pfn, end_pfn; 1102 + struct page *page = pfn_to_page(pfn); 1103 + 1104 + /* Do not search around if there are enough pages already */ 1105 + if (cc->nr_freepages >= cc->nr_migratepages) 1106 + return; 1107 + 1108 + /* Minimise scanning during async compaction */ 1109 + if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC) 1110 + return; 1111 + 1112 + /* Pageblock boundaries */ 1113 + start_pfn = pageblock_start_pfn(pfn); 1114 + end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone)); 1115 + 1116 + /* Scan before */ 1117 + if (start_pfn != pfn) { 1118 + isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false); 1119 + if (cc->nr_freepages >= cc->nr_migratepages) 1120 + return; 1121 + } 1122 + 1123 + /* Scan after */ 1124 + start_pfn = pfn + nr_isolated; 1125 + if (start_pfn != end_pfn) 1126 + isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); 1127 + 1128 + /* Skip this pageblock in the future as it's full or nearly full */ 1129 + if (cc->nr_freepages < cc->nr_migratepages) 1130 + set_pageblock_skip(page); 1131 + } 1132 + 1133 + /* Search orders in round-robin fashion */ 1134 + static int next_search_order(struct compact_control *cc, int order) 1135 + { 1136 + order--; 1137 + if (order < 0) 1138 + order = cc->order - 1; 1139 + 1140 + /* Search wrapped around? */ 1141 + if (order == cc->search_order) { 1142 + cc->search_order--; 1143 + if (cc->search_order < 0) 1144 + cc->search_order = cc->order - 1; 1145 + return -1; 1146 + } 1147 + 1148 + return order; 1149 + } 1150 + 1151 + static unsigned long 1152 + fast_isolate_freepages(struct compact_control *cc) 1153 + { 1154 + unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1); 1155 + unsigned int nr_scanned = 0; 1156 + unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0; 1157 + unsigned long nr_isolated = 0; 1158 + unsigned long distance; 1159 + struct page *page = NULL; 1160 + bool scan_start = false; 1161 + int order; 1162 + 1163 + /* Full compaction passes in a negative order */ 1164 + if (cc->order <= 0) 1165 + return cc->free_pfn; 1166 + 1167 + /* 1168 + * If starting the scan, use a deeper search and use the highest 1169 + * PFN found if a suitable one is not found. 1170 + */ 1171 + if (cc->free_pfn >= cc->zone->compact_init_free_pfn) { 1172 + limit = pageblock_nr_pages >> 1; 1173 + scan_start = true; 1174 + } 1175 + 1176 + /* 1177 + * Preferred point is in the top quarter of the scan space but take 1178 + * a pfn from the top half if the search is problematic. 
1179 + */ 1180 + distance = (cc->free_pfn - cc->migrate_pfn); 1181 + low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2)); 1182 + min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1)); 1183 + 1184 + if (WARN_ON_ONCE(min_pfn > low_pfn)) 1185 + low_pfn = min_pfn; 1186 + 1187 + /* 1188 + * Search starts from the last successful isolation order or the next 1189 + * order to search after a previous failure 1190 + */ 1191 + cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order); 1192 + 1193 + for (order = cc->search_order; 1194 + !page && order >= 0; 1195 + order = next_search_order(cc, order)) { 1196 + struct free_area *area = &cc->zone->free_area[order]; 1197 + struct list_head *freelist; 1198 + struct page *freepage; 1199 + unsigned long flags; 1200 + unsigned int order_scanned = 0; 1201 + 1202 + if (!area->nr_free) 1203 + continue; 1204 + 1205 + spin_lock_irqsave(&cc->zone->lock, flags); 1206 + freelist = &area->free_list[MIGRATE_MOVABLE]; 1207 + list_for_each_entry_reverse(freepage, freelist, lru) { 1208 + unsigned long pfn; 1209 + 1210 + order_scanned++; 1211 + nr_scanned++; 1212 + pfn = page_to_pfn(freepage); 1213 + 1214 + if (pfn >= highest) 1215 + highest = pageblock_start_pfn(pfn); 1216 + 1217 + if (pfn >= low_pfn) { 1218 + cc->fast_search_fail = 0; 1219 + cc->search_order = order; 1220 + page = freepage; 1221 + break; 1222 + } 1223 + 1224 + if (pfn >= min_pfn && pfn > high_pfn) { 1225 + high_pfn = pfn; 1226 + 1227 + /* Shorten the scan if a candidate is found */ 1228 + limit >>= 1; 1229 + } 1230 + 1231 + if (order_scanned >= limit) 1232 + break; 1233 + } 1234 + 1235 + /* Use a minimum pfn if a preferred one was not found */ 1236 + if (!page && high_pfn) { 1237 + page = pfn_to_page(high_pfn); 1238 + 1239 + /* Update freepage for the list reorder below */ 1240 + freepage = page; 1241 + } 1242 + 1243 + /* Reorder to so a future search skips recent pages */ 1244 + move_freelist_head(freelist, freepage); 1245 + 1246 + /* Isolate the page if available */ 1247 + if (page) { 1248 + if (__isolate_free_page(page, order)) { 1249 + set_page_private(page, order); 1250 + nr_isolated = 1 << order; 1251 + cc->nr_freepages += nr_isolated; 1252 + list_add_tail(&page->lru, &cc->freepages); 1253 + count_compact_events(COMPACTISOLATED, nr_isolated); 1254 + } else { 1255 + /* If isolation fails, abort the search */ 1256 + order = -1; 1257 + page = NULL; 1258 + } 1259 + } 1260 + 1261 + spin_unlock_irqrestore(&cc->zone->lock, flags); 1262 + 1263 + /* 1264 + * Smaller scan on next order so the total scan ig related 1265 + * to freelist_scan_limit. 1266 + */ 1267 + if (order_scanned >= limit) 1268 + limit = min(1U, limit >> 1); 1269 + } 1270 + 1271 + if (!page) { 1272 + cc->fast_search_fail++; 1273 + if (scan_start) { 1274 + /* 1275 + * Use the highest PFN found above min. If one was 1276 + * not found, be pessemistic for direct compaction 1277 + * and use the min mark. 
1278 + */ 1279 + if (highest) { 1280 + page = pfn_to_page(highest); 1281 + cc->free_pfn = highest; 1282 + } else { 1283 + if (cc->direct_compaction) { 1284 + page = pfn_to_page(min_pfn); 1285 + cc->free_pfn = min_pfn; 1286 + } 1287 + } 1288 + } 1289 + } 1290 + 1291 + if (highest && highest >= cc->zone->compact_cached_free_pfn) { 1292 + highest -= pageblock_nr_pages; 1293 + cc->zone->compact_cached_free_pfn = highest; 1294 + } 1295 + 1296 + cc->total_free_scanned += nr_scanned; 1297 + if (!page) 1298 + return cc->free_pfn; 1299 + 1300 + low_pfn = page_to_pfn(page); 1301 + fast_isolate_around(cc, low_pfn, nr_isolated); 1302 + return low_pfn; 1171 1303 } 1172 1304 1173 1305 /* ··· 1425 1073 unsigned long block_end_pfn; /* end of current pageblock */ 1426 1074 unsigned long low_pfn; /* lowest pfn scanner is able to scan */ 1427 1075 struct list_head *freelist = &cc->freepages; 1076 + unsigned int stride; 1077 + 1078 + /* Try a small search of the free lists for a candidate */ 1079 + isolate_start_pfn = fast_isolate_freepages(cc); 1080 + if (cc->nr_freepages) 1081 + goto splitmap; 1428 1082 1429 1083 /* 1430 1084 * Initialise the free scanner. The starting point is where we last ··· 1444 1086 * is using. 1445 1087 */ 1446 1088 isolate_start_pfn = cc->free_pfn; 1447 - block_start_pfn = pageblock_start_pfn(cc->free_pfn); 1089 + block_start_pfn = pageblock_start_pfn(isolate_start_pfn); 1448 1090 block_end_pfn = min(block_start_pfn + pageblock_nr_pages, 1449 1091 zone_end_pfn(zone)); 1450 1092 low_pfn = pageblock_end_pfn(cc->migrate_pfn); 1093 + stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1; 1451 1094 1452 1095 /* 1453 1096 * Isolate free pages until enough are available to migrate the ··· 1459 1100 block_end_pfn = block_start_pfn, 1460 1101 block_start_pfn -= pageblock_nr_pages, 1461 1102 isolate_start_pfn = block_start_pfn) { 1103 + unsigned long nr_isolated; 1104 + 1462 1105 /* 1463 1106 * This can iterate a massively long zone without finding any 1464 - * suitable migration targets, so periodically check if we need 1465 - * to schedule, or even abort async compaction. 1107 + * suitable migration targets, so periodically check resched. 1466 1108 */ 1467 - if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) 1468 - && compact_should_abort(cc)) 1469 - break; 1109 + if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) 1110 + cond_resched(); 1470 1111 1471 1112 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, 1472 1113 zone); ··· 1482 1123 continue; 1483 1124 1484 1125 /* Found a block suitable for isolating free pages from. */ 1485 - isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, 1486 - freelist, false); 1126 + nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn, 1127 + block_end_pfn, freelist, stride, false); 1487 1128 1488 - /* 1489 - * If we isolated enough freepages, or aborted due to lock 1490 - * contention, terminate. 1491 - */ 1492 - if ((cc->nr_freepages >= cc->nr_migratepages) 1493 - || cc->contended) { 1129 + /* Update the skip hint if the full pageblock was scanned */ 1130 + if (isolate_start_pfn == block_end_pfn) 1131 + update_pageblock_skip(cc, page, block_start_pfn); 1132 + 1133 + /* Are enough freepages isolated? 
*/ 1134 + if (cc->nr_freepages >= cc->nr_migratepages) { 1494 1135 if (isolate_start_pfn >= block_end_pfn) { 1495 1136 /* 1496 1137 * Restart at previous pageblock if more ··· 1507 1148 */ 1508 1149 break; 1509 1150 } 1510 - } 1511 1151 1512 - /* __isolate_free_page() does not map the pages */ 1513 - map_pages(freelist); 1152 + /* Adjust stride depending on isolation */ 1153 + if (nr_isolated) { 1154 + stride = 1; 1155 + continue; 1156 + } 1157 + stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1); 1158 + } 1514 1159 1515 1160 /* 1516 1161 * Record where the free scanner will restart next time. Either we ··· 1523 1160 * and the loop terminated due to isolate_start_pfn < low_pfn 1524 1161 */ 1525 1162 cc->free_pfn = isolate_start_pfn; 1163 + 1164 + splitmap: 1165 + /* __isolate_free_page() does not map the pages */ 1166 + split_map_pages(freelist); 1526 1167 } 1527 1168 1528 1169 /* ··· 1539 1172 struct compact_control *cc = (struct compact_control *)data; 1540 1173 struct page *freepage; 1541 1174 1542 - /* 1543 - * Isolate free pages if necessary, and if we are not aborting due to 1544 - * contention. 1545 - */ 1546 1175 if (list_empty(&cc->freepages)) { 1547 - if (!cc->contended) 1548 - isolate_freepages(cc); 1176 + isolate_freepages(cc); 1549 1177 1550 1178 if (list_empty(&cc->freepages)) 1551 1179 return NULL; ··· 1579 1217 */ 1580 1218 int sysctl_compact_unevictable_allowed __read_mostly = 1; 1581 1219 1220 + static inline void 1221 + update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) 1222 + { 1223 + if (cc->fast_start_pfn == ULONG_MAX) 1224 + return; 1225 + 1226 + if (!cc->fast_start_pfn) 1227 + cc->fast_start_pfn = pfn; 1228 + 1229 + cc->fast_start_pfn = min(cc->fast_start_pfn, pfn); 1230 + } 1231 + 1232 + static inline unsigned long 1233 + reinit_migrate_pfn(struct compact_control *cc) 1234 + { 1235 + if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX) 1236 + return cc->migrate_pfn; 1237 + 1238 + cc->migrate_pfn = cc->fast_start_pfn; 1239 + cc->fast_start_pfn = ULONG_MAX; 1240 + 1241 + return cc->migrate_pfn; 1242 + } 1243 + 1244 + /* 1245 + * Briefly search the free lists for a migration source that already has 1246 + * some free pages to reduce the number of pages that need migration 1247 + * before a pageblock is free. 1248 + */ 1249 + static unsigned long fast_find_migrateblock(struct compact_control *cc) 1250 + { 1251 + unsigned int limit = freelist_scan_limit(cc); 1252 + unsigned int nr_scanned = 0; 1253 + unsigned long distance; 1254 + unsigned long pfn = cc->migrate_pfn; 1255 + unsigned long high_pfn; 1256 + int order; 1257 + 1258 + /* Skip hints are relied on to avoid repeats on the fast search */ 1259 + if (cc->ignore_skip_hint) 1260 + return pfn; 1261 + 1262 + /* 1263 + * If the migrate_pfn is not at the start of a zone or the start 1264 + * of a pageblock then assume this is a continuation of a previous 1265 + * scan restarted due to COMPACT_CLUSTER_MAX. 1266 + */ 1267 + if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn)) 1268 + return pfn; 1269 + 1270 + /* 1271 + * For smaller orders, just linearly scan as the number of pages 1272 + * to migrate should be relatively small and does not necessarily 1273 + * justify freeing up a large block for a small allocation. 1274 + */ 1275 + if (cc->order <= PAGE_ALLOC_COSTLY_ORDER) 1276 + return pfn; 1277 + 1278 + /* 1279 + * Only allow kcompactd and direct requests for movable pages to 1280 + * quickly clear out a MOVABLE pageblock for allocation. 
This 1281 + * reduces the risk that a large movable pageblock is freed for 1282 + * an unmovable/reclaimable small allocation. 1283 + */ 1284 + if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE) 1285 + return pfn; 1286 + 1287 + /* 1288 + * When starting the migration scanner, pick any pageblock within the 1289 + * first half of the search space. Otherwise try and pick a pageblock 1290 + * within the first eighth to reduce the chances that a migration 1291 + * target later becomes a source. 1292 + */ 1293 + distance = (cc->free_pfn - cc->migrate_pfn) >> 1; 1294 + if (cc->migrate_pfn != cc->zone->zone_start_pfn) 1295 + distance >>= 2; 1296 + high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance); 1297 + 1298 + for (order = cc->order - 1; 1299 + order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit; 1300 + order--) { 1301 + struct free_area *area = &cc->zone->free_area[order]; 1302 + struct list_head *freelist; 1303 + unsigned long flags; 1304 + struct page *freepage; 1305 + 1306 + if (!area->nr_free) 1307 + continue; 1308 + 1309 + spin_lock_irqsave(&cc->zone->lock, flags); 1310 + freelist = &area->free_list[MIGRATE_MOVABLE]; 1311 + list_for_each_entry(freepage, freelist, lru) { 1312 + unsigned long free_pfn; 1313 + 1314 + nr_scanned++; 1315 + free_pfn = page_to_pfn(freepage); 1316 + if (free_pfn < high_pfn) { 1317 + /* 1318 + * Avoid if skipped recently. Ideally it would 1319 + * move to the tail but even safe iteration of 1320 + * the list assumes an entry is deleted, not 1321 + * reordered. 1322 + */ 1323 + if (get_pageblock_skip(freepage)) { 1324 + if (list_is_last(freelist, &freepage->lru)) 1325 + break; 1326 + 1327 + continue; 1328 + } 1329 + 1330 + /* Reorder to so a future search skips recent pages */ 1331 + move_freelist_tail(freelist, freepage); 1332 + 1333 + update_fast_start_pfn(cc, free_pfn); 1334 + pfn = pageblock_start_pfn(free_pfn); 1335 + cc->fast_search_fail = 0; 1336 + set_pageblock_skip(freepage); 1337 + break; 1338 + } 1339 + 1340 + if (nr_scanned >= limit) { 1341 + cc->fast_search_fail++; 1342 + move_freelist_tail(freelist, freepage); 1343 + break; 1344 + } 1345 + } 1346 + spin_unlock_irqrestore(&cc->zone->lock, flags); 1347 + } 1348 + 1349 + cc->total_migrate_scanned += nr_scanned; 1350 + 1351 + /* 1352 + * If fast scanning failed then use a cached entry for a page block 1353 + * that had free pages as the basis for starting a linear scan. 1354 + */ 1355 + if (pfn == cc->migrate_pfn) 1356 + pfn = reinit_migrate_pfn(cc); 1357 + 1358 + return pfn; 1359 + } 1360 + 1582 1361 /* 1583 1362 * Isolate all pages that can be migrated from the first suitable block, 1584 1363 * starting at the block pointed to by the migrate scanner pfn within ··· 1735 1232 const isolate_mode_t isolate_mode = 1736 1233 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | 1737 1234 (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0); 1235 + bool fast_find_block; 1738 1236 1739 1237 /* 1740 1238 * Start at where we last stopped, or beginning of the zone as 1741 - * initialized by compact_zone() 1239 + * initialized by compact_zone(). The first failure will use 1240 + * the lowest PFN as the starting point for linear scanning. 
1742 1241 */ 1743 - low_pfn = cc->migrate_pfn; 1242 + low_pfn = fast_find_migrateblock(cc); 1744 1243 block_start_pfn = pageblock_start_pfn(low_pfn); 1745 1244 if (block_start_pfn < zone->zone_start_pfn) 1746 1245 block_start_pfn = zone->zone_start_pfn; 1246 + 1247 + /* 1248 + * fast_find_migrateblock marks a pageblock skipped so to avoid 1249 + * the isolation_suitable check below, check whether the fast 1250 + * search was successful. 1251 + */ 1252 + fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail; 1747 1253 1748 1254 /* Only scan within a pageblock boundary */ 1749 1255 block_end_pfn = pageblock_end_pfn(low_pfn); ··· 1762 1250 * Do not cross the free scanner. 1763 1251 */ 1764 1252 for (; block_end_pfn <= cc->free_pfn; 1253 + fast_find_block = false, 1765 1254 low_pfn = block_end_pfn, 1766 1255 block_start_pfn = block_end_pfn, 1767 1256 block_end_pfn += pageblock_nr_pages) { ··· 1770 1257 /* 1771 1258 * This can potentially iterate a massively long zone with 1772 1259 * many pageblocks unsuitable, so periodically check if we 1773 - * need to schedule, or even abort async compaction. 1260 + * need to schedule. 1774 1261 */ 1775 - if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) 1776 - && compact_should_abort(cc)) 1777 - break; 1262 + if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) 1263 + cond_resched(); 1778 1264 1779 1265 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, 1780 1266 zone); 1781 1267 if (!page) 1782 1268 continue; 1783 1269 1784 - /* If isolation recently failed, do not retry */ 1785 - if (!isolation_suitable(cc, page)) 1270 + /* 1271 + * If isolation recently failed, do not retry. Only check the 1272 + * pageblock once. COMPACT_CLUSTER_MAX causes a pageblock 1273 + * to be visited multiple times. Assume skip was checked 1274 + * before making it "skip" so other compaction instances do 1275 + * not scan the same block. 1276 + */ 1277 + if (IS_ALIGNED(low_pfn, pageblock_nr_pages) && 1278 + !fast_find_block && !isolation_suitable(cc, page)) 1786 1279 continue; 1787 1280 1788 1281 /* 1789 - * For async compaction, also only scan in MOVABLE blocks. 1790 - * Async compaction is optimistic to see if the minimum amount 1791 - * of work satisfies the allocation. 1282 + * For async compaction, also only scan in MOVABLE blocks 1283 + * without huge pages. Async compaction is optimistic to see 1284 + * if the minimum amount of work satisfies the allocation. 1285 + * The cached PFN is updated as it's possible that all 1286 + * remaining blocks between source and target are unsuitable 1287 + * and the compaction scanners fail to meet. 
1792 1288 */ 1793 - if (!suitable_migration_source(cc, page)) 1289 + if (!suitable_migration_source(cc, page)) { 1290 + update_cached_migrate(cc, block_end_pfn); 1794 1291 continue; 1292 + } 1795 1293 1796 1294 /* Perform the isolation */ 1797 1295 low_pfn = isolate_migratepages_block(cc, low_pfn, 1798 1296 block_end_pfn, isolate_mode); 1799 1297 1800 - if (!low_pfn || cc->contended) 1298 + if (!low_pfn) 1801 1299 return ISOLATE_ABORT; 1802 1300 1803 1301 /* ··· 1834 1310 return order == -1; 1835 1311 } 1836 1312 1837 - static enum compact_result __compact_finished(struct zone *zone, 1838 - struct compact_control *cc) 1313 + static enum compact_result __compact_finished(struct compact_control *cc) 1839 1314 { 1840 1315 unsigned int order; 1841 1316 const int migratetype = cc->migratetype; 1842 - 1843 - if (cc->contended || fatal_signal_pending(current)) 1844 - return COMPACT_CONTENDED; 1317 + int ret; 1845 1318 1846 1319 /* Compaction run completes if the migrate and free scanner meet */ 1847 1320 if (compact_scanners_met(cc)) { 1848 1321 /* Let the next compaction start anew. */ 1849 - reset_cached_positions(zone); 1322 + reset_cached_positions(cc->zone); 1850 1323 1851 1324 /* 1852 1325 * Mark that the PG_migrate_skip information should be cleared ··· 1852 1331 * based on an allocation request. 1853 1332 */ 1854 1333 if (cc->direct_compaction) 1855 - zone->compact_blockskip_flush = true; 1334 + cc->zone->compact_blockskip_flush = true; 1856 1335 1857 1336 if (cc->whole_zone) 1858 1337 return COMPACT_COMPLETE; ··· 1863 1342 if (is_via_compact_memory(cc->order)) 1864 1343 return COMPACT_CONTINUE; 1865 1344 1866 - if (cc->finishing_block) { 1867 - /* 1868 - * We have finished the pageblock, but better check again that 1869 - * we really succeeded. 1870 - */ 1871 - if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) 1872 - cc->finishing_block = false; 1873 - else 1874 - return COMPACT_CONTINUE; 1875 - } 1345 + /* 1346 + * Always finish scanning a pageblock to reduce the possibility of 1347 + * fallbacks in the future. This is particularly important when 1348 + * migration source is unmovable/reclaimable but it's not worth 1349 + * special casing. 1350 + */ 1351 + if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) 1352 + return COMPACT_CONTINUE; 1876 1353 1877 1354 /* Direct compactor: Is a suitable page free? 
*/ 1355 + ret = COMPACT_NO_SUITABLE_PAGE; 1878 1356 for (order = cc->order; order < MAX_ORDER; order++) { 1879 - struct free_area *area = &zone->free_area[order]; 1357 + struct free_area *area = &cc->zone->free_area[order]; 1880 1358 bool can_steal; 1881 1359 1882 1360 /* Job done if page is free of the right migratetype */ ··· 1913 1393 return COMPACT_SUCCESS; 1914 1394 } 1915 1395 1916 - cc->finishing_block = true; 1917 - return COMPACT_CONTINUE; 1396 + ret = COMPACT_CONTINUE; 1397 + break; 1918 1398 } 1919 1399 } 1920 1400 1921 - return COMPACT_NO_SUITABLE_PAGE; 1401 + if (cc->contended || fatal_signal_pending(current)) 1402 + ret = COMPACT_CONTENDED; 1403 + 1404 + return ret; 1922 1405 } 1923 1406 1924 - static enum compact_result compact_finished(struct zone *zone, 1925 - struct compact_control *cc) 1407 + static enum compact_result compact_finished(struct compact_control *cc) 1926 1408 { 1927 1409 int ret; 1928 1410 1929 - ret = __compact_finished(zone, cc); 1930 - trace_mm_compaction_finished(zone, cc->order, ret); 1411 + ret = __compact_finished(cc); 1412 + trace_mm_compaction_finished(cc->zone, cc->order, ret); 1931 1413 if (ret == COMPACT_NO_SUITABLE_PAGE) 1932 1414 ret = COMPACT_CONTINUE; 1933 1415 ··· 2056 1534 return false; 2057 1535 } 2058 1536 2059 - static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) 1537 + static enum compact_result 1538 + compact_zone(struct compact_control *cc, struct capture_control *capc) 2060 1539 { 2061 1540 enum compact_result ret; 2062 - unsigned long start_pfn = zone->zone_start_pfn; 2063 - unsigned long end_pfn = zone_end_pfn(zone); 1541 + unsigned long start_pfn = cc->zone->zone_start_pfn; 1542 + unsigned long end_pfn = zone_end_pfn(cc->zone); 1543 + unsigned long last_migrated_pfn; 2064 1544 const bool sync = cc->mode != MIGRATE_ASYNC; 1545 + bool update_cached; 2065 1546 2066 1547 cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); 2067 - ret = compaction_suitable(zone, cc->order, cc->alloc_flags, 1548 + ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, 2068 1549 cc->classzone_idx); 2069 1550 /* Compaction is likely to fail */ 2070 1551 if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) ··· 2080 1555 * Clear pageblock skip if there were failures recently and compaction 2081 1556 * is about to be retried after being deferred. 2082 1557 */ 2083 - if (compaction_restarting(zone, cc->order)) 2084 - __reset_isolation_suitable(zone); 1558 + if (compaction_restarting(cc->zone, cc->order)) 1559 + __reset_isolation_suitable(cc->zone); 2085 1560 2086 1561 /* 2087 1562 * Setup to move all movable pages to the end of the zone. Used cached ··· 2089 1564 * want to compact the whole zone), but check that it is initialised 2090 1565 * by ensuring the values are within zone boundaries. 
2091 1566 */ 1567 + cc->fast_start_pfn = 0; 2092 1568 if (cc->whole_zone) { 2093 1569 cc->migrate_pfn = start_pfn; 2094 1570 cc->free_pfn = pageblock_start_pfn(end_pfn - 1); 2095 1571 } else { 2096 - cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; 2097 - cc->free_pfn = zone->compact_cached_free_pfn; 1572 + cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync]; 1573 + cc->free_pfn = cc->zone->compact_cached_free_pfn; 2098 1574 if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { 2099 1575 cc->free_pfn = pageblock_start_pfn(end_pfn - 1); 2100 - zone->compact_cached_free_pfn = cc->free_pfn; 1576 + cc->zone->compact_cached_free_pfn = cc->free_pfn; 2101 1577 } 2102 1578 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { 2103 1579 cc->migrate_pfn = start_pfn; 2104 - zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; 2105 - zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; 1580 + cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; 1581 + cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; 2106 1582 } 2107 1583 2108 - if (cc->migrate_pfn == start_pfn) 1584 + if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn) 2109 1585 cc->whole_zone = true; 2110 1586 } 2111 1587 2112 - cc->last_migrated_pfn = 0; 1588 + last_migrated_pfn = 0; 1589 + 1590 + /* 1591 + * Migrate has separate cached PFNs for ASYNC and SYNC* migration on 1592 + * the basis that some migrations will fail in ASYNC mode. However, 1593 + * if the cached PFNs match and pageblocks are skipped due to having 1594 + * no isolation candidates, then the sync state does not matter. 1595 + * Until a pageblock with isolation candidates is found, keep the 1596 + * cached PFNs in sync to avoid revisiting the same blocks. 1597 + */ 1598 + update_cached = !sync && 1599 + cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1]; 2113 1600 2114 1601 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, 2115 1602 cc->free_pfn, end_pfn, sync); 2116 1603 2117 1604 migrate_prep_local(); 2118 1605 2119 - while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 1606 + while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) { 2120 1607 int err; 1608 + unsigned long start_pfn = cc->migrate_pfn; 2121 1609 2122 - switch (isolate_migratepages(zone, cc)) { 1610 + /* 1611 + * Avoid multiple rescans which can happen if a page cannot be 1612 + * isolated (dirty/writeback in async mode) or if the migrated 1613 + * pages are being allocated before the pageblock is cleared. 1614 + * The first rescan will capture the entire pageblock for 1615 + * migration. If it fails, it'll be marked skip and scanning 1616 + * will proceed as normal. 
1617 + */ 1618 + cc->rescan = false; 1619 + if (pageblock_start_pfn(last_migrated_pfn) == 1620 + pageblock_start_pfn(start_pfn)) { 1621 + cc->rescan = true; 1622 + } 1623 + 1624 + switch (isolate_migratepages(cc->zone, cc)) { 2123 1625 case ISOLATE_ABORT: 2124 1626 ret = COMPACT_CONTENDED; 2125 1627 putback_movable_pages(&cc->migratepages); 2126 1628 cc->nr_migratepages = 0; 1629 + last_migrated_pfn = 0; 2127 1630 goto out; 2128 1631 case ISOLATE_NONE: 1632 + if (update_cached) { 1633 + cc->zone->compact_cached_migrate_pfn[1] = 1634 + cc->zone->compact_cached_migrate_pfn[0]; 1635 + } 1636 + 2129 1637 /* 2130 1638 * We haven't isolated and migrated anything, but 2131 1639 * there might still be unflushed migrations from ··· 2166 1608 */ 2167 1609 goto check_drain; 2168 1610 case ISOLATE_SUCCESS: 1611 + update_cached = false; 1612 + last_migrated_pfn = start_pfn; 2169 1613 ; 2170 1614 } 2171 1615 ··· 2199 1639 cc->migrate_pfn = block_end_pfn( 2200 1640 cc->migrate_pfn - 1, cc->order); 2201 1641 /* Draining pcplists is useless in this case */ 2202 - cc->last_migrated_pfn = 0; 2203 - 1642 + last_migrated_pfn = 0; 2204 1643 } 2205 1644 } 2206 1645 ··· 2211 1652 * compact_finished() can detect immediately if allocation 2212 1653 * would succeed. 2213 1654 */ 2214 - if (cc->order > 0 && cc->last_migrated_pfn) { 1655 + if (cc->order > 0 && last_migrated_pfn) { 2215 1656 int cpu; 2216 1657 unsigned long current_block_start = 2217 1658 block_start_pfn(cc->migrate_pfn, cc->order); 2218 1659 2219 - if (cc->last_migrated_pfn < current_block_start) { 1660 + if (last_migrated_pfn < current_block_start) { 2220 1661 cpu = get_cpu(); 2221 1662 lru_add_drain_cpu(cpu); 2222 - drain_local_pages(zone); 1663 + drain_local_pages(cc->zone); 2223 1664 put_cpu(); 2224 1665 /* No more flushing until we migrate again */ 2225 - cc->last_migrated_pfn = 0; 1666 + last_migrated_pfn = 0; 2226 1667 } 2227 1668 } 2228 1669 1670 + /* Stop if a page has been captured */ 1671 + if (capc && capc->page) { 1672 + ret = COMPACT_SUCCESS; 1673 + break; 1674 + } 2229 1675 } 2230 1676 2231 1677 out: ··· 2249 1685 * Only go back, not forward. The cached pfn might have been 2250 1686 * already reset to zone end in compact_finished() 2251 1687 */ 2252 - if (free_pfn > zone->compact_cached_free_pfn) 2253 - zone->compact_cached_free_pfn = free_pfn; 1688 + if (free_pfn > cc->zone->compact_cached_free_pfn) 1689 + cc->zone->compact_cached_free_pfn = free_pfn; 2254 1690 } 2255 1691 2256 1692 count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned); ··· 2264 1700 2265 1701 static enum compact_result compact_zone_order(struct zone *zone, int order, 2266 1702 gfp_t gfp_mask, enum compact_priority prio, 2267 - unsigned int alloc_flags, int classzone_idx) 1703 + unsigned int alloc_flags, int classzone_idx, 1704 + struct page **capture) 2268 1705 { 2269 1706 enum compact_result ret; 2270 1707 struct compact_control cc = { ··· 2274 1709 .total_migrate_scanned = 0, 2275 1710 .total_free_scanned = 0, 2276 1711 .order = order, 1712 + .search_order = order, 2277 1713 .gfp_mask = gfp_mask, 2278 1714 .zone = zone, 2279 1715 .mode = (prio == COMPACT_PRIO_ASYNC) ? 
··· 2286 1720 .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), 2287 1721 .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY) 2288 1722 }; 1723 + struct capture_control capc = { 1724 + .cc = &cc, 1725 + .page = NULL, 1726 + }; 1727 + 1728 + if (capture) 1729 + current->capture_control = &capc; 2289 1730 INIT_LIST_HEAD(&cc.freepages); 2290 1731 INIT_LIST_HEAD(&cc.migratepages); 2291 1732 2292 - ret = compact_zone(zone, &cc); 1733 + ret = compact_zone(&cc, &capc); 2293 1734 2294 1735 VM_BUG_ON(!list_empty(&cc.freepages)); 2295 1736 VM_BUG_ON(!list_empty(&cc.migratepages)); 1737 + 1738 + *capture = capc.page; 1739 + current->capture_control = NULL; 2296 1740 2297 1741 return ret; 2298 1742 } ··· 2321 1745 */ 2322 1746 enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, 2323 1747 unsigned int alloc_flags, const struct alloc_context *ac, 2324 - enum compact_priority prio) 1748 + enum compact_priority prio, struct page **capture) 2325 1749 { 2326 1750 int may_perform_io = gfp_mask & __GFP_IO; 2327 1751 struct zoneref *z; ··· 2349 1773 } 2350 1774 2351 1775 status = compact_zone_order(zone, order, gfp_mask, prio, 2352 - alloc_flags, ac_classzone_idx(ac)); 1776 + alloc_flags, ac_classzone_idx(ac), capture); 2353 1777 rc = max(status, rc); 2354 1778 2355 1779 /* The allocation should succeed, stop compacting */ ··· 2417 1841 INIT_LIST_HEAD(&cc.freepages); 2418 1842 INIT_LIST_HEAD(&cc.migratepages); 2419 1843 2420 - compact_zone(zone, &cc); 1844 + compact_zone(&cc, NULL); 2421 1845 2422 1846 VM_BUG_ON(!list_empty(&cc.freepages)); 2423 1847 VM_BUG_ON(!list_empty(&cc.migratepages)); ··· 2448 1872 { 2449 1873 if (write) 2450 1874 compact_nodes(); 2451 - 2452 - return 0; 2453 - } 2454 - 2455 - int sysctl_extfrag_handler(struct ctl_table *table, int write, 2456 - void __user *buffer, size_t *length, loff_t *ppos) 2457 - { 2458 - proc_dointvec_minmax(table, write, buffer, length, ppos); 2459 1875 2460 1876 return 0; 2461 1877 } ··· 2516 1948 struct zone *zone; 2517 1949 struct compact_control cc = { 2518 1950 .order = pgdat->kcompactd_max_order, 1951 + .search_order = pgdat->kcompactd_max_order, 2519 1952 .total_migrate_scanned = 0, 2520 1953 .total_free_scanned = 0, 2521 1954 .classzone_idx = pgdat->kcompactd_classzone_idx, ··· 2552 1983 2553 1984 if (kthread_should_stop()) 2554 1985 return; 2555 - status = compact_zone(zone, &cc); 1986 + status = compact_zone(&cc, NULL); 2556 1987 2557 1988 if (status == COMPACT_SUCCESS) { 2558 1989 compaction_defer_reset(zone, cc.order, false);
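Note on the capture_control plumbing added above: try_to_compact_pages() now takes a struct page **capture, and compact_zone() stops with COMPACT_SUCCESS as soon as the current task's capture_control has a page. A rough caller-side sketch, assuming it lives under mm/ so struct alloc_context (mm/internal.h) is visible; the wrapper name and fallback handling are illustrative, not part of this series:

	static struct page *compact_and_capture(gfp_t gfp_mask, unsigned int order,
						unsigned int alloc_flags,
						const struct alloc_context *ac,
						enum compact_priority prio)
	{
		struct page *captured = NULL;

		if (try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio,
					 &captured) == COMPACT_SUCCESS && !captured) {
			/*
			 * Compaction succeeded but nothing was captured: a
			 * suitable page should now be on the freelists, so the
			 * caller would retry the normal allocation path here.
			 */
		}

		/* A captured page skips another round trip through the freelists. */
		return captured;
	}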
+9 -4
mm/dmapool.c
··· 114 114 * @size: size of the blocks in this pool. 115 115 * @align: alignment requirement for blocks; must be a power of two 116 116 * @boundary: returned blocks won't cross this power of two boundary 117 - * Context: !in_interrupt() 117 + * Context: not in_interrupt() 118 118 * 119 - * Returns a dma allocation pool with the requested characteristics, or 120 - * null if one can't be created. Given one of these pools, dma_pool_alloc() 119 + * Given one of these pools, dma_pool_alloc() 121 120 * may be used to allocate memory. Such memory will all have "consistent" 122 121 * DMA mappings, accessible by the device and its driver without using 123 122 * cache flushing primitives. The actual size of blocks allocated may be ··· 126 127 * cross that size boundary. This is useful for devices which have 127 128 * addressing restrictions on individual DMA transfers, such as not crossing 128 129 * boundaries of 4KBytes. 130 + * 131 + * Return: a dma allocation pool with the requested characteristics, or 132 + * %NULL if one can't be created. 129 133 */ 130 134 struct dma_pool *dma_pool_create(const char *name, struct device *dev, 131 135 size_t size, size_t align, size_t boundary) ··· 315 313 * @mem_flags: GFP_* bitmask 316 314 * @handle: pointer to dma address of block 317 315 * 318 - * This returns the kernel virtual address of a currently unused block, 316 + * Return: the kernel virtual address of a currently unused block, 319 317 * and reports its dma address through the handle. 320 318 * If such a memory block can't be allocated, %NULL is returned. 321 319 */ ··· 500 498 * 501 499 * Managed dma_pool_create(). DMA pool created with this function is 502 500 * automatically destroyed on driver detach. 501 + * 502 + * Return: a managed dma allocation pool with the requested 503 + * characteristics, or %NULL if one can't be created. 503 504 */ 504 505 struct dma_pool *dmam_pool_create(const char *name, struct device *dev, 505 506 size_t size, size_t align, size_t allocation)
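The dmapool hunks above only move the return-value text into standard kerneldoc "Return:" sections; the API itself is unchanged. A minimal driver-side sketch consistent with the documented return values (dev, and the 64/8/4096 size, align and boundary values, are placeholder assumptions):

	struct dma_pool *pool;
	dma_addr_t dma;
	void *vaddr;

	pool = dma_pool_create("example-pool", dev, 64, 8, 4096);
	if (!pool)			/* %NULL when the pool cannot be created */
		return -ENOMEM;

	vaddr = dma_pool_alloc(pool, GFP_KERNEL, &dma);
	if (!vaddr) {			/* %NULL when no block can be allocated */
		dma_pool_destroy(pool);
		return -ENOMEM;
	}

	/* ... hand 'dma' to the device, use 'vaddr' from the CPU ... */

	dma_pool_free(pool, vaddr, dma);
	dma_pool_destroy(pool);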
+4 -10
mm/failslab.c
··· 48 48 if (IS_ERR(dir)) 49 49 return PTR_ERR(dir); 50 50 51 - if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 52 - &failslab.ignore_gfp_reclaim)) 53 - goto fail; 54 - if (!debugfs_create_bool("cache-filter", mode, dir, 55 - &failslab.cache_filter)) 56 - goto fail; 51 + debugfs_create_bool("ignore-gfp-wait", mode, dir, 52 + &failslab.ignore_gfp_reclaim); 53 + debugfs_create_bool("cache-filter", mode, dir, 54 + &failslab.cache_filter); 57 55 58 56 return 0; 59 - fail: 60 - debugfs_remove_recursive(dir); 61 - 62 - return -ENOMEM; 63 57 } 64 58 65 59 late_initcall(failslab_debugfs_init);
+67 -26
mm/filemap.c
··· 98 98 * ->swap_lock (try_to_unmap_one) 99 99 * ->private_lock (try_to_unmap_one) 100 100 * ->i_pages lock (try_to_unmap_one) 101 - * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) 102 - * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) 101 + * ->pgdat->lru_lock (follow_page->mark_page_accessed) 102 + * ->pgdat->lru_lock (check_pte_range->isolate_lru_page) 103 103 * ->private_lock (page_remove_rmap->set_page_dirty) 104 104 * ->i_pages lock (page_remove_rmap->set_page_dirty) 105 105 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) ··· 392 392 * opposed to a regular memory cleansing writeback. The difference between 393 393 * these two operations is that if a dirty page/buffer is encountered, it must 394 394 * be waited upon, and not just skipped over. 395 + * 396 + * Return: %0 on success, negative error code otherwise. 395 397 */ 396 398 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, 397 399 loff_t end, int sync_mode) ··· 440 438 * 441 439 * This is a mostly non-blocking flush. Not suitable for data-integrity 442 440 * purposes - I/O may not be started against all dirty pages. 441 + * 442 + * Return: %0 on success, negative error code otherwise. 443 443 */ 444 444 int filemap_flush(struct address_space *mapping) 445 445 { ··· 457 453 * 458 454 * Find at least one page in the range supplied, usually used to check if 459 455 * direct writing in this range will trigger a writeback. 456 + * 457 + * Return: %true if at least one page exists in the specified range, 458 + * %false otherwise. 460 459 */ 461 460 bool filemap_range_has_page(struct address_space *mapping, 462 461 loff_t start_byte, loff_t end_byte) ··· 536 529 * Since the error status of the address space is cleared by this function, 537 530 * callers are responsible for checking the return value and handling and/or 538 531 * reporting the error. 532 + * 533 + * Return: error status of the address space. 539 534 */ 540 535 int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, 541 536 loff_t end_byte) ··· 560 551 * Since the error status of the file is advanced by this function, 561 552 * callers are responsible for checking the return value and handling and/or 562 553 * reporting the error. 554 + * 555 + * Return: error status of the address space vs. the file->f_wb_err cursor. 563 556 */ 564 557 int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) 565 558 { ··· 583 572 * Use this function if callers don't handle errors themselves. Expected 584 573 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), 585 574 * fsfreeze(8) 575 + * 576 + * Return: error status of the address space. 586 577 */ 587 578 int filemap_fdatawait_keep_errors(struct address_space *mapping) 588 579 { ··· 636 623 * 637 624 * Note that @lend is inclusive (describes the last byte to be written) so 638 625 * that this function can be used to write to the very end-of-file (end = -1). 626 + * 627 + * Return: error status of the address space. 639 628 */ 640 629 int filemap_write_and_wait_range(struct address_space *mapping, 641 630 loff_t lstart, loff_t lend) ··· 693 678 * While we handle mapping->wb_err with atomic operations, the f_wb_err 694 679 * value is protected by the f_lock since we must ensure that it reflects 695 680 * the latest value swapped in for this file descriptor. 681 + * 682 + * Return: %0 on success, negative error code otherwise. 
696 683 */ 697 684 int file_check_and_advance_wb_err(struct file *file) 698 685 { ··· 737 720 * 738 721 * After writing out and waiting on the data, we check and advance the 739 722 * f_wb_err cursor to the latest value, and return any errors detected there. 723 + * 724 + * Return: %0 on success, negative error code otherwise. 740 725 */ 741 726 int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) 742 727 { ··· 772 753 * caller must do that. 773 754 * 774 755 * The remove + add is atomic. This function cannot fail. 756 + * 757 + * Return: %0 775 758 */ 776 759 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 777 760 { ··· 888 867 * 889 868 * This function is used to add a page to the pagecache. It must be locked. 890 869 * This function does not add the page to the LRU. The caller must do that. 870 + * 871 + * Return: %0 on success, negative error code otherwise. 891 872 */ 892 873 int add_to_page_cache_locked(struct page *page, struct address_space *mapping, 893 874 pgoff_t offset, gfp_t gfp_mask) ··· 1486 1463 * If the slot holds a shadow entry of a previously evicted page, or a 1487 1464 * swap entry from shmem/tmpfs, it is returned. 1488 1465 * 1489 - * Otherwise, %NULL is returned. 1466 + * Return: the found page or shadow entry, %NULL if nothing is found. 1490 1467 */ 1491 1468 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) 1492 1469 { ··· 1544 1521 * If the slot holds a shadow entry of a previously evicted page, or a 1545 1522 * swap entry from shmem/tmpfs, it is returned. 1546 1523 * 1547 - * Otherwise, %NULL is returned. 1548 - * 1549 1524 * find_lock_entry() may sleep. 1525 + * 1526 + * Return: the found page or shadow entry, %NULL if nothing is found. 1550 1527 */ 1551 1528 struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) 1552 1529 { ··· 1586 1563 * - FGP_CREAT: If page is not present then a new page is allocated using 1587 1564 * @gfp_mask and added to the page cache and the VM's LRU 1588 1565 * list. The page is returned locked and with an increased 1589 - * refcount. Otherwise, NULL is returned. 1566 + * refcount. 1590 1567 * 1591 1568 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even 1592 1569 * if the GFP flags specified for FGP_CREAT are atomic. 1593 1570 * 1594 1571 * If there is a page cache page, it is returned with an increased refcount. 1572 + * 1573 + * Return: the found page or %NULL otherwise. 1595 1574 */ 1596 1575 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, 1597 1576 int fgp_flags, gfp_t gfp_mask) ··· 1681 1656 * Any shadow entries of evicted pages, or swap entries from 1682 1657 * shmem/tmpfs, are included in the returned array. 1683 1658 * 1684 - * find_get_entries() returns the number of pages and shadow entries 1685 - * which were found. 1659 + * Return: the number of pages and shadow entries which were found. 1686 1660 */ 1687 1661 unsigned find_get_entries(struct address_space *mapping, 1688 1662 pgoff_t start, unsigned int nr_entries, ··· 1751 1727 * indexes. There may be holes in the indices due to not-present pages. 1752 1728 * We also update @start to index the next page for the traversal. 1753 1729 * 1754 - * find_get_pages_range() returns the number of pages which were found. If this 1755 - * number is smaller than @nr_pages, the end of specified range has been 1730 + * Return: the number of pages which were found. 
If this number is 1731 + * smaller than @nr_pages, the end of specified range has been 1756 1732 * reached. 1757 1733 */ 1758 1734 unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, ··· 1789 1765 1790 1766 pages[ret] = page; 1791 1767 if (++ret == nr_pages) { 1792 - *start = page->index + 1; 1768 + *start = xas.xa_index + 1; 1793 1769 goto out; 1794 1770 } 1795 1771 continue; ··· 1825 1801 * find_get_pages_contig() works exactly like find_get_pages(), except 1826 1802 * that the returned number of pages are guaranteed to be contiguous. 1827 1803 * 1828 - * find_get_pages_contig() returns the number of pages which were found. 1804 + * Return: the number of pages which were found. 1829 1805 */ 1830 1806 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, 1831 1807 unsigned int nr_pages, struct page **pages) ··· 1861 1837 if (unlikely(page != xas_reload(&xas))) 1862 1838 goto put_page; 1863 1839 1864 - /* 1865 - * must check mapping and index after taking the ref. 1866 - * otherwise we can get both false positives and false 1867 - * negatives, which is just confusing to the caller. 1868 - */ 1869 - if (!page->mapping || page_to_pgoff(page) != xas.xa_index) { 1870 - put_page(page); 1871 - break; 1872 - } 1873 - 1874 1840 pages[ret] = page; 1875 1841 if (++ret == nr_pages) 1876 1842 break; ··· 1886 1872 * 1887 1873 * Like find_get_pages, except we only return pages which are tagged with 1888 1874 * @tag. We update @index to index the next page for the traversal. 1875 + * 1876 + * Return: the number of pages which were found. 1889 1877 */ 1890 1878 unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, 1891 1879 pgoff_t end, xa_mark_t tag, unsigned int nr_pages, ··· 1927 1911 1928 1912 pages[ret] = page; 1929 1913 if (++ret == nr_pages) { 1930 - *index = page->index + 1; 1914 + *index = xas.xa_index + 1; 1931 1915 goto out; 1932 1916 } 1933 1917 continue; ··· 1965 1949 * 1966 1950 * Like find_get_entries, except we only return entries which are tagged with 1967 1951 * @tag. 1952 + * 1953 + * Return: the number of entries which were found. 1968 1954 */ 1969 1955 unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 1970 1956 xa_mark_t tag, unsigned int nr_entries, ··· 2052 2034 * 2053 2035 * This is really ugly. But the goto's actually try to clarify some 2054 2036 * of the logic when it comes to error handling etc. 2037 + * 2038 + * Return: 2039 + * * total number of bytes copied, including those the were already @written 2040 + * * negative error code if nothing was copied 2055 2041 */ 2056 2042 static ssize_t generic_file_buffered_read(struct kiocb *iocb, 2057 2043 struct iov_iter *iter, ssize_t written) ··· 2317 2295 * 2318 2296 * This is the "read_iter()" routine for all filesystems 2319 2297 * that can use the page cache directly. 2298 + * Return: 2299 + * * number of bytes copied, even for partial reads 2300 + * * negative error code if nothing was read 2320 2301 */ 2321 2302 ssize_t 2322 2303 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) ··· 2387 2362 * 2388 2363 * This adds the requested page to the page cache if it isn't already there, 2389 2364 * and schedules an I/O to read in its contents from disk. 2365 + * 2366 + * Return: %0 on success, negative error code otherwise. 2390 2367 */ 2391 2368 static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) 2392 2369 { ··· 2503 2476 * has not been released. 
2504 2477 * 2505 2478 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. 2479 + * 2480 + * Return: bitwise-OR of %VM_FAULT_ codes. 2506 2481 */ 2507 2482 vm_fault_t filemap_fault(struct vm_fault *vmf) 2508 2483 { ··· 2890 2861 * not set, try to fill the page and wait for it to become unlocked. 2891 2862 * 2892 2863 * If the page does not get brought uptodate, return -EIO. 2864 + * 2865 + * Return: up to date page on success, ERR_PTR() on failure. 2893 2866 */ 2894 2867 struct page *read_cache_page(struct address_space *mapping, 2895 2868 pgoff_t index, ··· 2912 2881 * any new page allocations done using the specified allocation flags. 2913 2882 * 2914 2883 * If the page does not get brought uptodate, return -EIO. 2884 + * 2885 + * Return: up to date page on success, ERR_PTR() on failure. 2915 2886 */ 2916 2887 struct page *read_cache_page_gfp(struct address_space *mapping, 2917 2888 pgoff_t index, ··· 3114 3081 if (iocb->ki_flags & IOCB_NOWAIT) { 3115 3082 /* If there are pages to writeback, return */ 3116 3083 if (filemap_range_has_page(inode->i_mapping, pos, 3117 - pos + write_len)) 3084 + pos + write_len - 1)) 3118 3085 return -EAGAIN; 3119 3086 } else { 3120 3087 written = filemap_write_and_wait_range(mapping, pos, ··· 3297 3264 * This function does *not* take care of syncing data in case of O_SYNC write. 3298 3265 * A caller has to handle it. This is mainly due to the fact that we want to 3299 3266 * avoid syncing under i_mutex. 3267 + * 3268 + * Return: 3269 + * * number of bytes written, even for truncated writes 3270 + * * negative error code if no data has been written at all 3300 3271 */ 3301 3272 ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 3302 3273 { ··· 3385 3348 * This is a wrapper around __generic_file_write_iter() to be used by most 3386 3349 * filesystems. It takes care of syncing the file in case of O_SYNC file 3387 3350 * and acquires i_mutex as needed. 3351 + * Return: 3352 + * * negative error code if no data has been written at all of 3353 + * vfs_fsync_range() failed for a synchronous write 3354 + * * number of bytes written, even for truncated writes 3388 3355 */ 3389 3356 ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 3390 3357 { ··· 3415 3374 * @gfp_mask: memory allocation flags (and I/O mode) 3416 3375 * 3417 3376 * The address_space is to try to release any data against the page 3418 - * (presumably at page->private). If the release was successful, return '1'. 3419 - * Otherwise return zero. 3377 + * (presumably at page->private). 3420 3378 * 3421 3379 * This may also be called if PG_fscache is set on a page, indicating that the 3422 3380 * page is known to the local caching routines. ··· 3423 3383 * The @gfp_mask argument specifies whether I/O may be performed to release 3424 3384 * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). 3425 3385 * 3386 + * Return: %1 if the release was successful, otherwise return zero. 3426 3387 */ 3427 3388 int try_to_release_page(struct page *page, gfp_t gfp_mask) 3428 3389 {
+177 -25
mm/gup.c
··· 13 13 #include <linux/sched/signal.h> 14 14 #include <linux/rwsem.h> 15 15 #include <linux/hugetlb.h> 16 + #include <linux/migrate.h> 17 + #include <linux/mm_inline.h> 18 + #include <linux/sched/mm.h> 16 19 17 20 #include <asm/mmu_context.h> 18 21 #include <asm/pgtable.h> ··· 1129 1126 } 1130 1127 EXPORT_SYMBOL(get_user_pages); 1131 1128 1129 + #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) 1130 + 1132 1131 #ifdef CONFIG_FS_DAX 1132 + static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) 1133 + { 1134 + long i; 1135 + struct vm_area_struct *vma_prev = NULL; 1136 + 1137 + for (i = 0; i < nr_pages; i++) { 1138 + struct vm_area_struct *vma = vmas[i]; 1139 + 1140 + if (vma == vma_prev) 1141 + continue; 1142 + 1143 + vma_prev = vma; 1144 + 1145 + if (vma_is_fsdax(vma)) 1146 + return true; 1147 + } 1148 + return false; 1149 + } 1150 + #else 1151 + static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) 1152 + { 1153 + return false; 1154 + } 1155 + #endif 1156 + 1157 + #ifdef CONFIG_CMA 1158 + static struct page *new_non_cma_page(struct page *page, unsigned long private) 1159 + { 1160 + /* 1161 + * We want to make sure we allocate the new page from the same node 1162 + * as the source page. 1163 + */ 1164 + int nid = page_to_nid(page); 1165 + /* 1166 + * Trying to allocate a page for migration. Ignore allocation 1167 + * failure warnings. We don't force __GFP_THISNODE here because 1168 + * this node here is the node where we have CMA reservation and 1169 + * in some case these nodes will have really less non movable 1170 + * allocation memory. 1171 + */ 1172 + gfp_t gfp_mask = GFP_USER | __GFP_NOWARN; 1173 + 1174 + if (PageHighMem(page)) 1175 + gfp_mask |= __GFP_HIGHMEM; 1176 + 1177 + #ifdef CONFIG_HUGETLB_PAGE 1178 + if (PageHuge(page)) { 1179 + struct hstate *h = page_hstate(page); 1180 + /* 1181 + * We don't want to dequeue from the pool because pool pages will 1182 + * mostly be from the CMA region. 1183 + */ 1184 + return alloc_migrate_huge_page(h, gfp_mask, nid, NULL); 1185 + } 1186 + #endif 1187 + if (PageTransHuge(page)) { 1188 + struct page *thp; 1189 + /* 1190 + * ignore allocation failure warnings 1191 + */ 1192 + gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN; 1193 + 1194 + /* 1195 + * Remove the movable mask so that we don't allocate from 1196 + * CMA area again. 1197 + */ 1198 + thp_gfpmask &= ~__GFP_MOVABLE; 1199 + thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER); 1200 + if (!thp) 1201 + return NULL; 1202 + prep_transhuge_page(thp); 1203 + return thp; 1204 + } 1205 + 1206 + return __alloc_pages_node(nid, gfp_mask, 0); 1207 + } 1208 + 1209 + static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, 1210 + unsigned int gup_flags, 1211 + struct page **pages, 1212 + struct vm_area_struct **vmas) 1213 + { 1214 + long i; 1215 + bool drain_allow = true; 1216 + bool migrate_allow = true; 1217 + LIST_HEAD(cma_page_list); 1218 + 1219 + check_again: 1220 + for (i = 0; i < nr_pages; i++) { 1221 + /* 1222 + * If we get a page from the CMA zone, since we are going to 1223 + * be pinning these entries, we might as well move them out 1224 + * of the CMA zone if possible. 
1225 + */ 1226 + if (is_migrate_cma_page(pages[i])) { 1227 + 1228 + struct page *head = compound_head(pages[i]); 1229 + 1230 + if (PageHuge(head)) { 1231 + isolate_huge_page(head, &cma_page_list); 1232 + } else { 1233 + if (!PageLRU(head) && drain_allow) { 1234 + lru_add_drain_all(); 1235 + drain_allow = false; 1236 + } 1237 + 1238 + if (!isolate_lru_page(head)) { 1239 + list_add_tail(&head->lru, &cma_page_list); 1240 + mod_node_page_state(page_pgdat(head), 1241 + NR_ISOLATED_ANON + 1242 + page_is_file_cache(head), 1243 + hpage_nr_pages(head)); 1244 + } 1245 + } 1246 + } 1247 + } 1248 + 1249 + if (!list_empty(&cma_page_list)) { 1250 + /* 1251 + * drop the above get_user_pages reference. 1252 + */ 1253 + for (i = 0; i < nr_pages; i++) 1254 + put_page(pages[i]); 1255 + 1256 + if (migrate_pages(&cma_page_list, new_non_cma_page, 1257 + NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) { 1258 + /* 1259 + * some of the pages failed migration. Do get_user_pages 1260 + * without migration. 1261 + */ 1262 + migrate_allow = false; 1263 + 1264 + if (!list_empty(&cma_page_list)) 1265 + putback_movable_pages(&cma_page_list); 1266 + } 1267 + /* 1268 + * We did migrate all the pages, Try to get the page references again 1269 + * migrating any new CMA pages which we failed to isolate earlier. 1270 + */ 1271 + nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); 1272 + if ((nr_pages > 0) && migrate_allow) { 1273 + drain_allow = true; 1274 + goto check_again; 1275 + } 1276 + } 1277 + 1278 + return nr_pages; 1279 + } 1280 + #else 1281 + static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, 1282 + unsigned int gup_flags, 1283 + struct page **pages, 1284 + struct vm_area_struct **vmas) 1285 + { 1286 + return nr_pages; 1287 + } 1288 + #endif 1289 + 1133 1290 /* 1134 1291 * This is the same as get_user_pages() in that it assumes we are 1135 1292 * operating on the current task's mm, but it goes further to validate ··· 1303 1140 * Contrast this to iov_iter_get_pages() usages which are transient. 1304 1141 */ 1305 1142 long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, 1306 - unsigned int gup_flags, struct page **pages, 1307 - struct vm_area_struct **vmas_arg) 1143 + unsigned int gup_flags, struct page **pages, 1144 + struct vm_area_struct **vmas_arg) 1308 1145 { 1309 1146 struct vm_area_struct **vmas = vmas_arg; 1310 - struct vm_area_struct *vma_prev = NULL; 1147 + unsigned long flags; 1311 1148 long rc, i; 1312 1149 1313 1150 if (!pages) ··· 1320 1157 return -ENOMEM; 1321 1158 } 1322 1159 1160 + flags = memalloc_nocma_save(); 1323 1161 rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); 1324 - 1325 - for (i = 0; i < rc; i++) { 1326 - struct vm_area_struct *vma = vmas[i]; 1327 - 1328 - if (vma == vma_prev) 1329 - continue; 1330 - 1331 - vma_prev = vma; 1332 - 1333 - if (vma_is_fsdax(vma)) 1334 - break; 1335 - } 1336 - 1337 - /* 1338 - * Either get_user_pages() failed, or the vma validation 1339 - * succeeded, in either case we don't need to put_page() before 1340 - * returning. 
1341 - */ 1342 - if (i >= rc) 1162 + memalloc_nocma_restore(flags); 1163 + if (rc < 0) 1343 1164 goto out; 1344 1165 1345 - for (i = 0; i < rc; i++) 1346 - put_page(pages[i]); 1347 - rc = -EOPNOTSUPP; 1166 + if (check_dax_vmas(vmas, rc)) { 1167 + for (i = 0; i < rc; i++) 1168 + put_page(pages[i]); 1169 + rc = -EOPNOTSUPP; 1170 + goto out; 1171 + } 1172 + 1173 + rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); 1348 1174 out: 1349 1175 if (vmas != vmas_arg) 1350 1176 kfree(vmas);
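For context, a sketch of a long-term pinning caller after this rework: CMA-resident pages are migrated out before the pin is returned, and FS DAX mappings still fail with -EOPNOTSUPP. The variables, array sizing and locking below are illustrative assumptions, not taken from the patch:

	struct page **pages;
	long pinned;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages_longterm(start, nr_pages, FOLL_WRITE,
					 pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (pinned < 0) {
		kfree(pages);
		return pinned;	/* e.g. -EOPNOTSUPP for a DAX-backed VMA */
	}

	/* pinned may be less than nr_pages; set up DMA to the pinned pages here */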
+2 -6
mm/gup_benchmark.c
··· 122 122 
123 123 static int gup_benchmark_init(void)
124 124 {
125 - void *ret;
126 - 
127 - ret = debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
128 - &gup_benchmark_fops);
129 - if (!ret)
130 - pr_warn("Failed to create gup_benchmark in debugfs");
125 + debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
126 + &gup_benchmark_fops);
131 127 
132 128 return 0;
133 129 }
+17 -20
mm/huge_memory.c
··· 33 33 #include <linux/page_idle.h> 34 34 #include <linux/shmem_fs.h> 35 35 #include <linux/oom.h> 36 + #include <linux/numa.h> 36 37 37 38 #include <asm/tlb.h> 38 39 #include <asm/pgalloc.h> ··· 617 616 mm_inc_nr_ptes(vma->vm_mm); 618 617 spin_unlock(vmf->ptl); 619 618 count_vm_event(THP_FAULT_ALLOC); 619 + count_memcg_events(memcg, THP_FAULT_ALLOC, 1); 620 620 } 621 621 622 622 return 0; ··· 1339 1337 } 1340 1338 1341 1339 count_vm_event(THP_FAULT_ALLOC); 1340 + count_memcg_events(memcg, THP_FAULT_ALLOC, 1); 1342 1341 1343 1342 if (!page) 1344 1343 clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); ··· 1478 1475 struct anon_vma *anon_vma = NULL; 1479 1476 struct page *page; 1480 1477 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1481 - int page_nid = -1, this_nid = numa_node_id(); 1478 + int page_nid = NUMA_NO_NODE, this_nid = numa_node_id(); 1482 1479 int target_nid, last_cpupid = -1; 1483 1480 bool page_locked; 1484 1481 bool migrated = false; ··· 1523 1520 */ 1524 1521 page_locked = trylock_page(page); 1525 1522 target_nid = mpol_misplaced(page, vma, haddr); 1526 - if (target_nid == -1) { 1523 + if (target_nid == NUMA_NO_NODE) { 1527 1524 /* If the page was locked, there are no parallel migrations */ 1528 1525 if (page_locked) 1529 1526 goto clear_pmdnuma; ··· 1531 1528 1532 1529 /* Migration could have started since the pmd_trans_migrating check */ 1533 1530 if (!page_locked) { 1534 - page_nid = -1; 1531 + page_nid = NUMA_NO_NODE; 1535 1532 if (!get_page_unless_zero(page)) 1536 1533 goto out_unlock; 1537 1534 spin_unlock(vmf->ptl); ··· 1552 1549 if (unlikely(!pmd_same(pmd, *vmf->pmd))) { 1553 1550 unlock_page(page); 1554 1551 put_page(page); 1555 - page_nid = -1; 1552 + page_nid = NUMA_NO_NODE; 1556 1553 goto out_unlock; 1557 1554 } 1558 1555 1559 1556 /* Bail if we fail to protect against THP splits for any reason */ 1560 1557 if (unlikely(!anon_vma)) { 1561 1558 put_page(page); 1562 - page_nid = -1; 1559 + page_nid = NUMA_NO_NODE; 1563 1560 goto clear_pmdnuma; 1564 1561 } 1565 1562 ··· 1621 1618 if (anon_vma) 1622 1619 page_unlock_anon_vma_read(anon_vma); 1623 1620 1624 - if (page_nid != -1) 1621 + if (page_nid != NUMA_NO_NODE) 1625 1622 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, 1626 1623 flags); 1627 1624 ··· 1982 1979 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, 1983 1980 pud_t *pud, unsigned long addr) 1984 1981 { 1985 - pud_t orig_pud; 1986 1982 spinlock_t *ptl; 1987 1983 1988 1984 ptl = __pud_trans_huge_lock(pud, vma); ··· 1993 1991 * pgtable_trans_huge_withdraw after finishing pudp related 1994 1992 * operations. 
1995 1993 */ 1996 - orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, 1997 - tlb->fullmm); 1994 + pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); 1998 1995 tlb_remove_pud_tlb_entry(tlb, pud, addr); 1999 1996 if (vma_is_dax(vma)) { 2000 1997 spin_unlock(ptl); ··· 2438 2437 pgoff_t end, unsigned long flags) 2439 2438 { 2440 2439 struct page *head = compound_head(page); 2441 - struct zone *zone = page_zone(head); 2440 + pg_data_t *pgdat = page_pgdat(head); 2442 2441 struct lruvec *lruvec; 2443 2442 int i; 2444 2443 2445 - lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); 2444 + lruvec = mem_cgroup_page_lruvec(head, pgdat); 2446 2445 2447 2446 /* complete memcg works before add pages to LRU */ 2448 2447 mem_cgroup_split_huge_fixup(head); ··· 2473 2472 xa_unlock(&head->mapping->i_pages); 2474 2473 } 2475 2474 2476 - spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); 2475 + spin_unlock_irqrestore(&pgdat->lru_lock, flags); 2477 2476 2478 2477 remap_page(head); 2479 2478 ··· 2684 2683 lru_add_drain(); 2685 2684 2686 2685 /* prevent PageLRU to go away from under us, and freeze lru stats */ 2687 - spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); 2686 + spin_lock_irqsave(&pgdata->lru_lock, flags); 2688 2687 2689 2688 if (mapping) { 2690 2689 XA_STATE(xas, &mapping->i_pages, page_index(head)); ··· 2729 2728 spin_unlock(&pgdata->split_queue_lock); 2730 2729 fail: if (mapping) 2731 2730 xa_unlock(&mapping->i_pages); 2732 - spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); 2731 + spin_unlock_irqrestore(&pgdata->lru_lock, flags); 2733 2732 remap_page(head); 2734 2733 ret = -EBUSY; 2735 2734 } ··· 2887 2886 2888 2887 static int __init split_huge_pages_debugfs(void) 2889 2888 { 2890 - void *ret; 2891 - 2892 - ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL, 2893 - &split_huge_pages_fops); 2894 - if (!ret) 2895 - pr_warn("Failed to create split_huge_pages in debugfs"); 2889 + debugfs_create_file("split_huge_pages", 0200, NULL, NULL, 2890 + &split_huge_pages_fops); 2896 2891 return 0; 2897 2892 } 2898 2893 late_initcall(split_huge_pages_debugfs);
+10 -7
mm/hugetlb.c
··· 25 25 #include <linux/swap.h> 26 26 #include <linux/swapops.h> 27 27 #include <linux/jhash.h> 28 + #include <linux/numa.h> 28 29 29 30 #include <asm/page.h> 30 31 #include <asm/pgtable.h> ··· 888 887 struct zonelist *zonelist; 889 888 struct zone *zone; 890 889 struct zoneref *z; 891 - int node = -1; 890 + int node = NUMA_NO_NODE; 892 891 893 892 zonelist = node_zonelist(nid, gfp_mask); 894 893 ··· 920 919 /* Movability of hugepages depends on migration support. */ 921 920 static inline gfp_t htlb_alloc_mask(struct hstate *h) 922 921 { 923 - if (hugepage_migration_supported(h)) 922 + if (hugepage_movable_supported(h)) 924 923 return GFP_HIGHUSER_MOVABLE; 925 924 else 926 925 return GFP_HIGHUSER; ··· 1587 1586 return page; 1588 1587 } 1589 1588 1590 - static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, 1591 - int nid, nodemask_t *nmask) 1589 + struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, 1590 + int nid, nodemask_t *nmask) 1592 1591 { 1593 1592 struct page *page; 1594 1593 ··· 4399 4398 continue; 4400 4399 } 4401 4400 if (!huge_pte_none(pte)) { 4402 - pte = huge_ptep_get_and_clear(mm, address, ptep); 4403 - pte = pte_mkhuge(huge_pte_modify(pte, newprot)); 4401 + pte_t old_pte; 4402 + 4403 + old_pte = huge_ptep_modify_prot_start(vma, address, ptep); 4404 + pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); 4404 4405 pte = arch_make_huge_pte(pte, vma, NULL, 0); 4405 - set_huge_pte_at(mm, address, ptep, pte); 4406 + huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); 4406 4407 pages++; 4407 4408 } 4408 4409 spin_unlock(ptl);
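The hugetlb_change_protection() hunk above switches to the huge_ptep_modify_prot_start()/huge_ptep_modify_prot_commit() pair so that architectures can optimise the clear-modify-rewrite sequence. On architectures without a specialised implementation the pair is expected to reduce to roughly the old open-coded sequence; a sketch only, not taken from this hunk:

	/* "start": fetch the old PTE and clear the entry */
	old_pte = huge_ptep_get_and_clear(vma->vm_mm, address, ptep);
	/* ... build the new PTE from old_pte, as the loop above does ... */
	/* "commit": install the new PTE */
	set_huge_pte_at(vma->vm_mm, address, ptep, pte);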
+19 -7
mm/internal.h
··· 163 163 extern int __isolate_free_page(struct page *page, unsigned int order); 164 164 extern void memblock_free_pages(struct page *page, unsigned long pfn, 165 165 unsigned int order); 166 + extern void __free_pages_core(struct page *page, unsigned int order); 166 167 extern void prep_compound_page(struct page *page, unsigned int order); 167 168 extern void post_alloc_hook(struct page *page, unsigned int order, 168 169 gfp_t gfp_flags); ··· 184 183 struct compact_control { 185 184 struct list_head freepages; /* List of free pages to migrate to */ 186 185 struct list_head migratepages; /* List of pages being migrated */ 187 - struct zone *zone; 188 - unsigned long nr_freepages; /* Number of isolated free pages */ 189 - unsigned long nr_migratepages; /* Number of pages to migrate */ 190 - unsigned long total_migrate_scanned; 191 - unsigned long total_free_scanned; 186 + unsigned int nr_freepages; /* Number of isolated free pages */ 187 + unsigned int nr_migratepages; /* Number of pages to migrate */ 192 188 unsigned long free_pfn; /* isolate_freepages search base */ 193 189 unsigned long migrate_pfn; /* isolate_migratepages search base */ 194 - unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ 190 + unsigned long fast_start_pfn; /* a pfn to start linear scan from */ 191 + struct zone *zone; 192 + unsigned long total_migrate_scanned; 193 + unsigned long total_free_scanned; 194 + unsigned short fast_search_fail;/* failures to use free list searches */ 195 + short search_order; /* order to start a fast search at */ 195 196 const gfp_t gfp_mask; /* gfp mask of a direct compactor */ 196 197 int order; /* order a direct compactor needs */ 197 198 int migratetype; /* migratetype of direct compactor */ ··· 206 203 bool direct_compaction; /* False from kcompactd or /proc/... */ 207 204 bool whole_zone; /* Whole zone should/has been scanned */ 208 205 bool contended; /* Signal lock or sched contention */ 209 - bool finishing_block; /* Finishing current pageblock */ 206 + bool rescan; /* Rescanning the same pageblock */ 207 + }; 208 + 209 + /* 210 + * Used in direct compaction when a page should be taken from the freelists 211 + * immediately when one is created during the free path. 212 + */ 213 + struct capture_control { 214 + struct compact_control *cc; 215 + struct page *page; 210 216 }; 211 217 212 218 unsigned long
+2
mm/kasan/common.c
··· 14 14 *
15 15 *
16 16 */
17 + #define __KASAN_INTERNAL
18 + 
17 19 #include <linux/export.h>
18 20 #include <linux/interrupt.h>
19 21 #include <linux/init.h>
-19
mm/kasan/generic.c
··· 275 275 void __asan_handle_no_return(void) {} 276 276 EXPORT_SYMBOL(__asan_handle_no_return); 277 277 278 - /* Emitted by compiler to poison large objects when they go out of scope. */ 279 - void __asan_poison_stack_memory(const void *addr, size_t size) 280 - { 281 - /* 282 - * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded 283 - * by redzones, so we simply round up size to simplify logic. 284 - */ 285 - kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE), 286 - KASAN_USE_AFTER_SCOPE); 287 - } 288 - EXPORT_SYMBOL(__asan_poison_stack_memory); 289 - 290 - /* Emitted by compiler to unpoison large objects when they go into scope. */ 291 - void __asan_unpoison_stack_memory(const void *addr, size_t size) 292 - { 293 - kasan_unpoison_shadow(addr, size); 294 - } 295 - EXPORT_SYMBOL(__asan_unpoison_stack_memory); 296 - 297 278 /* Emitted by compiler to poison alloca()ed objects. */ 298 279 void __asan_alloca_poison(unsigned long addr, size_t size) 299 280 {
-3
mm/kasan/generic_report.c
··· 82 82 case KASAN_KMALLOC_FREE:
83 83 bug_type = "use-after-free";
84 84 break;
85 - case KASAN_USE_AFTER_SCOPE:
86 - bug_type = "use-after-scope";
87 - break;
88 85 case KASAN_ALLOCA_LEFT:
89 86 case KASAN_ALLOCA_RIGHT:
90 87 bug_type = "alloca-out-of-bounds";
+3 -3
mm/kasan/init.c
··· 42 42 #else 43 43 static inline bool kasan_p4d_table(pgd_t pgd) 44 44 { 45 - return 0; 45 + return false; 46 46 } 47 47 #endif 48 48 #if CONFIG_PGTABLE_LEVELS > 3 ··· 54 54 #else 55 55 static inline bool kasan_pud_table(p4d_t p4d) 56 56 { 57 - return 0; 57 + return false; 58 58 } 59 59 #endif 60 60 #if CONFIG_PGTABLE_LEVELS > 2 ··· 66 66 #else 67 67 static inline bool kasan_pmd_table(pud_t pud) 68 68 { 69 - return 0; 69 + return false; 70 70 } 71 71 #endif 72 72 pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss;
-3
mm/kasan/kasan.h
··· 34 34 #define KASAN_STACK_MID 0xF2 35 35 #define KASAN_STACK_RIGHT 0xF3 36 36 #define KASAN_STACK_PARTIAL 0xF4 37 - #define KASAN_USE_AFTER_SCOPE 0xF8 38 37 39 38 /* 40 39 * alloca redzone shadow values ··· 186 187 void __asan_loadN(unsigned long addr, size_t size); 187 188 void __asan_storeN(unsigned long addr, size_t size); 188 189 void __asan_handle_no_return(void); 189 - void __asan_poison_stack_memory(const void *addr, size_t size); 190 - void __asan_unpoison_stack_memory(const void *addr, size_t size); 191 190 void __asan_alloca_poison(unsigned long addr, size_t size); 192 191 void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); 193 192
+2
mm/khugepaged.c
··· 1074 1074 BUG_ON(!pmd_none(*pmd)); 1075 1075 page_add_new_anon_rmap(new_page, vma, address, true); 1076 1076 mem_cgroup_commit_charge(new_page, memcg, false, true); 1077 + count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); 1077 1078 lru_cache_add_active_or_unevictable(new_page, vma); 1078 1079 pgtable_trans_huge_deposit(mm, pmd, pgtable); 1079 1080 set_pmd_at(mm, address, pmd, _pmd); ··· 1503 1502 page_ref_add(new_page, HPAGE_PMD_NR - 1); 1504 1503 set_page_dirty(new_page); 1505 1504 mem_cgroup_commit_charge(new_page, memcg, false, true); 1505 + count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); 1506 1506 lru_cache_add_anon(new_page); 1507 1507 1508 1508 /*
+64 -13
mm/ksm.c
··· 598 598 chain->chain_prune_time = jiffies; 599 599 chain->rmap_hlist_len = STABLE_NODE_CHAIN; 600 600 #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) 601 - chain->nid = -1; /* debug */ 601 + chain->nid = NUMA_NO_NODE; /* debug */ 602 602 #endif 603 603 ksm_stable_node_chains++; 604 604 ··· 667 667 free_stable_node(stable_node); 668 668 } 669 669 670 + enum get_ksm_page_flags { 671 + GET_KSM_PAGE_NOLOCK, 672 + GET_KSM_PAGE_LOCK, 673 + GET_KSM_PAGE_TRYLOCK 674 + }; 675 + 670 676 /* 671 677 * get_ksm_page: checks if the page indicated by the stable node 672 678 * is still its ksm page, despite having held no reference to it. ··· 692 686 * a page to put something that might look like our key in page->mapping. 693 687 * is on its way to being freed; but it is an anomaly to bear in mind. 694 688 */ 695 - static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) 689 + static struct page *get_ksm_page(struct stable_node *stable_node, 690 + enum get_ksm_page_flags flags) 696 691 { 697 692 struct page *page; 698 693 void *expected_mapping; ··· 713 706 * case this node is no longer referenced, and should be freed; 714 707 * however, it might mean that the page is under page_ref_freeze(). 715 708 * The __remove_mapping() case is easy, again the node is now stale; 716 - * but if page is swapcache in migrate_page_move_mapping(), it might 717 - * still be our page, in which case it's essential to keep the node. 709 + * the same is in reuse_ksm_page() case; but if page is swapcache 710 + * in migrate_page_move_mapping(), it might still be our page, 711 + * in which case it's essential to keep the node. 718 712 */ 719 713 while (!get_page_unless_zero(page)) { 720 714 /* ··· 736 728 goto stale; 737 729 } 738 730 739 - if (lock_it) { 731 + if (flags == GET_KSM_PAGE_TRYLOCK) { 732 + if (!trylock_page(page)) { 733 + put_page(page); 734 + return ERR_PTR(-EBUSY); 735 + } 736 + } else if (flags == GET_KSM_PAGE_LOCK) 740 737 lock_page(page); 738 + 739 + if (flags != GET_KSM_PAGE_NOLOCK) { 741 740 if (READ_ONCE(page->mapping) != expected_mapping) { 742 741 unlock_page(page); 743 742 put_page(page); ··· 778 763 struct page *page; 779 764 780 765 stable_node = rmap_item->head; 781 - page = get_ksm_page(stable_node, true); 766 + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); 782 767 if (!page) 783 768 goto out; 784 769 ··· 878 863 struct page *page; 879 864 int err; 880 865 881 - page = get_ksm_page(stable_node, true); 866 + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); 882 867 if (!page) { 883 868 /* 884 869 * get_ksm_page did remove_node_from_stable_tree itself. ··· 1400 1385 * stable_node parameter itself will be freed from 1401 1386 * under us if it returns NULL. 1402 1387 */ 1403 - _tree_page = get_ksm_page(dup, false); 1388 + _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK); 1404 1389 if (!_tree_page) 1405 1390 continue; 1406 1391 nr += 1; ··· 1523 1508 if (!is_stable_node_chain(stable_node)) { 1524 1509 if (is_page_sharing_candidate(stable_node)) { 1525 1510 *_stable_node_dup = stable_node; 1526 - return get_ksm_page(stable_node, false); 1511 + return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); 1527 1512 } 1528 1513 /* 1529 1514 * _stable_node_dup set to NULL means the stable_node ··· 1628 1613 * wrprotected at all times. Any will work 1629 1614 * fine to continue the walk. 
1630 1615 */ 1631 - tree_page = get_ksm_page(stable_node_any, false); 1616 + tree_page = get_ksm_page(stable_node_any, 1617 + GET_KSM_PAGE_NOLOCK); 1632 1618 } 1633 1619 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); 1634 1620 if (!tree_page) { ··· 1689 1673 * It would be more elegant to return stable_node 1690 1674 * than kpage, but that involves more changes. 1691 1675 */ 1692 - tree_page = get_ksm_page(stable_node_dup, true); 1676 + tree_page = get_ksm_page(stable_node_dup, 1677 + GET_KSM_PAGE_TRYLOCK); 1678 + 1679 + if (PTR_ERR(tree_page) == -EBUSY) 1680 + return ERR_PTR(-EBUSY); 1681 + 1693 1682 if (unlikely(!tree_page)) 1694 1683 /* 1695 1684 * The tree may have been rebalanced, ··· 1863 1842 * wrprotected at all times. Any will work 1864 1843 * fine to continue the walk. 1865 1844 */ 1866 - tree_page = get_ksm_page(stable_node_any, false); 1845 + tree_page = get_ksm_page(stable_node_any, 1846 + GET_KSM_PAGE_NOLOCK); 1867 1847 } 1868 1848 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); 1869 1849 if (!tree_page) { ··· 2090 2068 remove_rmap_item_from_tree(rmap_item); 2091 2069 2092 2070 if (kpage) { 2071 + if (PTR_ERR(kpage) == -EBUSY) 2072 + return; 2073 + 2093 2074 err = try_to_merge_with_ksm_page(rmap_item, page, kpage); 2094 2075 if (!err) { 2095 2076 /* ··· 2267 2242 2268 2243 list_for_each_entry_safe(stable_node, next, 2269 2244 &migrate_nodes, list) { 2270 - page = get_ksm_page(stable_node, false); 2245 + page = get_ksm_page(stable_node, 2246 + GET_KSM_PAGE_NOLOCK); 2271 2247 if (page) 2272 2248 put_page(page); 2273 2249 cond_resched(); ··· 2668 2642 goto again; 2669 2643 } 2670 2644 2645 + bool reuse_ksm_page(struct page *page, 2646 + struct vm_area_struct *vma, 2647 + unsigned long address) 2648 + { 2649 + #ifdef CONFIG_DEBUG_VM 2650 + if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || 2651 + WARN_ON(!page_mapped(page)) || 2652 + WARN_ON(!PageLocked(page))) { 2653 + dump_page(page, "reuse_ksm_page"); 2654 + return false; 2655 + } 2656 + #endif 2657 + 2658 + if (PageSwapCache(page) || !page_stable_node(page)) 2659 + return false; 2660 + /* Prohibit parallel get_ksm_page() */ 2661 + if (!page_ref_freeze(page, 1)) 2662 + return false; 2663 + 2664 + page_move_anon_rmap(page, vma); 2665 + page->index = linear_page_index(vma, address); 2666 + page_ref_unfreeze(page, 1); 2667 + 2668 + return true; 2669 + } 2671 2670 #ifdef CONFIG_MIGRATION 2672 2671 void ksm_migrate_page(struct page *newpage, struct page *oldpage) 2673 2672 {
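The ksm.c hunk turns get_ksm_page()'s bool lock argument into a three-way flag, and the new GET_KSM_PAGE_TRYLOCK variant can return ERR_PTR(-EBUSY) when the page lock is contended. Callers therefore have to tell a plain NULL (stale stable node, keep walking) apart from an encoded error (back off entirely). The sketch below is only an illustration of that convention, using a userspace stand-in for the real ERR_PTR/IS_ERR helpers from include/linux/err.h.

/* Illustrative only: userspace stand-ins for the kernel ERR_PTR convention,
 * showing how a caller distinguishes "stale, returned NULL" from
 * "contended, returned ERR_PTR(-EBUSY)". */
#include <stdio.h>
#include <errno.h>
#include <stdint.h>

#define MAX_ERRNO	4095
static inline void *ERR_PTR(long error)	{ return (void *)error; }
static inline long PTR_ERR(const void *ptr)	{ return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

/* Pretend lookup: 0 = stale node, 1 = lock contended, 2 = success. */
static void *lookup(int scenario, void *page)
{
	if (scenario == 0)
		return NULL;			/* caller may retry the tree walk */
	if (scenario == 1)
		return ERR_PTR(-EBUSY);		/* trylock failed, caller bails out */
	return page;
}

int main(void)
{
	int object = 42;

	for (int s = 0; s < 3; s++) {
		void *p = lookup(s, &object);

		if (!p)
			printf("scenario %d: stale (NULL)\n", s);
		else if (IS_ERR(p))
			printf("scenario %d: error %ld\n", s, PTR_ERR(p));
		else
			printf("scenario %d: got page %p\n", s, p);
	}
	return 0;
}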
+1 -2
mm/list_lru.c
··· 601 601 struct lock_class_key *key, struct shrinker *shrinker) 602 602 { 603 603 int i; 604 - size_t size = sizeof(*lru->node) * nr_node_ids; 605 604 int err = -ENOMEM; 606 605 607 606 #ifdef CONFIG_MEMCG_KMEM ··· 611 612 #endif 612 613 memcg_get_cache_ids(); 613 614 614 - lru->node = kzalloc(size, GFP_KERNEL); 615 + lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); 615 616 if (!lru->node) 616 617 goto out; 617 618
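The list_lru.c change replaces an open-coded `sizeof(*lru->node) * nr_node_ids` plus kzalloc() with kcalloc(), which zeroes the array and rejects a count/size product that would overflow. The userspace analogue is calloc() versus malloc(n * size); the demo below picks an absurd count purely to force the overflow on a 64-bit size_t.

/* Demo: calloc() refuses an overflowing count * size, while the open-coded
 * multiplication silently wraps to a tiny allocation. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

int main(void)
{
	size_t count = (SIZE_MAX / 8) + 2;	/* count * 8 wraps around */
	size_t size = 8;

	void *a = calloc(count, size);		/* overflow detected: NULL */
	void *b = malloc(count * size);		/* product wrapped to 8 bytes */

	printf("calloc: %p (expected NULL)\n", a);
	printf("malloc of wrapped product: %p (tiny allocation, not an error)\n", b);

	free(a);
	free(b);
	return 0;
}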
+1 -2
mm/memblock.c
··· 2005 2005 static int __init memblock_init_debugfs(void) 2006 2006 { 2007 2007 struct dentry *root = debugfs_create_dir("memblock", NULL); 2008 - if (!root) 2009 - return -ENXIO; 2008 + 2010 2009 debugfs_create_file("memory", 0444, root, 2011 2010 &memblock.memory, &memblock_debug_fops); 2012 2011 debugfs_create_file("reserved", 0444, root,
+74 -76
mm/memcontrol.c
··· 39 39 #include <linux/shmem_fs.h> 40 40 #include <linux/hugetlb.h> 41 41 #include <linux/pagemap.h> 42 + #include <linux/vm_event_item.h> 42 43 #include <linux/smp.h> 43 44 #include <linux/page-flags.h> 44 45 #include <linux/backing-dev.h> ··· 248 247 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 249 248 iter != NULL; \ 250 249 iter = mem_cgroup_iter(NULL, iter, NULL)) 250 + 251 + static inline bool should_force_charge(void) 252 + { 253 + return tsk_is_oom_victim(current) || fatal_signal_pending(current) || 254 + (current->flags & PF_EXITING); 255 + } 251 256 252 257 /* Some nice accessors for the vmpressure. */ 253 258 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) ··· 1396 1389 }; 1397 1390 bool ret; 1398 1391 1399 - mutex_lock(&oom_lock); 1400 - ret = out_of_memory(&oc); 1392 + if (mutex_lock_killable(&oom_lock)) 1393 + return true; 1394 + /* 1395 + * A few threads which were not waiting at mutex_lock_killable() can 1396 + * fail to bail out. Therefore, check again after holding oom_lock. 1397 + */ 1398 + ret = should_force_charge() || out_of_memory(&oc); 1401 1399 mutex_unlock(&oom_lock); 1402 1400 return ret; 1403 1401 } ··· 2221 2209 * bypass the last charges so that they can exit quickly and 2222 2210 * free their memory. 2223 2211 */ 2224 - if (unlikely(tsk_is_oom_victim(current) || 2225 - fatal_signal_pending(current) || 2226 - current->flags & PF_EXITING)) 2212 + if (unlikely(should_force_charge())) 2227 2213 goto force; 2228 2214 2229 2215 /* ··· 2362 2352 2363 2353 static void lock_page_lru(struct page *page, int *isolated) 2364 2354 { 2365 - struct zone *zone = page_zone(page); 2355 + pg_data_t *pgdat = page_pgdat(page); 2366 2356 2367 - spin_lock_irq(zone_lru_lock(zone)); 2357 + spin_lock_irq(&pgdat->lru_lock); 2368 2358 if (PageLRU(page)) { 2369 2359 struct lruvec *lruvec; 2370 2360 2371 - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 2361 + lruvec = mem_cgroup_page_lruvec(page, pgdat); 2372 2362 ClearPageLRU(page); 2373 2363 del_page_from_lru_list(page, lruvec, page_lru(page)); 2374 2364 *isolated = 1; ··· 2378 2368 2379 2369 static void unlock_page_lru(struct page *page, int isolated) 2380 2370 { 2381 - struct zone *zone = page_zone(page); 2371 + pg_data_t *pgdat = page_pgdat(page); 2382 2372 2383 2373 if (isolated) { 2384 2374 struct lruvec *lruvec; 2385 2375 2386 - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 2376 + lruvec = mem_cgroup_page_lruvec(page, pgdat); 2387 2377 VM_BUG_ON_PAGE(PageLRU(page), page); 2388 2378 SetPageLRU(page); 2389 2379 add_page_to_lru_list(page, lruvec, page_lru(page)); 2390 2380 } 2391 - spin_unlock_irq(zone_lru_lock(zone)); 2381 + spin_unlock_irq(&pgdat->lru_lock); 2392 2382 } 2393 2383 2394 2384 static void commit_charge(struct page *page, struct mem_cgroup *memcg, ··· 2583 2573 } 2584 2574 2585 2575 /** 2586 - * memcg_kmem_charge_memcg: charge a kmem page 2576 + * __memcg_kmem_charge_memcg: charge a kmem page 2587 2577 * @page: page to charge 2588 2578 * @gfp: reclaim mode 2589 2579 * @order: allocation order ··· 2591 2581 * 2592 2582 * Returns 0 on success, an error code on failure. 
2593 2583 */ 2594 - int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, 2584 + int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, 2595 2585 struct mem_cgroup *memcg) 2596 2586 { 2597 2587 unsigned int nr_pages = 1 << order; ··· 2614 2604 } 2615 2605 2616 2606 /** 2617 - * memcg_kmem_charge: charge a kmem page to the current memory cgroup 2607 + * __memcg_kmem_charge: charge a kmem page to the current memory cgroup 2618 2608 * @page: page to charge 2619 2609 * @gfp: reclaim mode 2620 2610 * @order: allocation order 2621 2611 * 2622 2612 * Returns 0 on success, an error code on failure. 2623 2613 */ 2624 - int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) 2614 + int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) 2625 2615 { 2626 2616 struct mem_cgroup *memcg; 2627 2617 int ret = 0; 2628 2618 2629 - if (mem_cgroup_disabled() || memcg_kmem_bypass()) 2619 + if (memcg_kmem_bypass()) 2630 2620 return 0; 2631 2621 2632 2622 memcg = get_mem_cgroup_from_current(); 2633 2623 if (!mem_cgroup_is_root(memcg)) { 2634 - ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); 2624 + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); 2635 2625 if (!ret) 2636 2626 __SetPageKmemcg(page); 2637 2627 } ··· 2639 2629 return ret; 2640 2630 } 2641 2631 /** 2642 - * memcg_kmem_uncharge: uncharge a kmem page 2632 + * __memcg_kmem_uncharge: uncharge a kmem page 2643 2633 * @page: page to uncharge 2644 2634 * @order: allocation order 2645 2635 */ 2646 - void memcg_kmem_uncharge(struct page *page, int order) 2636 + void __memcg_kmem_uncharge(struct page *page, int order) 2647 2637 { 2648 2638 struct mem_cgroup *memcg = page->mem_cgroup; 2649 2639 unsigned int nr_pages = 1 << order; ··· 2674 2664 2675 2665 /* 2676 2666 * Because tail pages are not marked as "used", set it. We're under 2677 - * zone_lru_lock and migration entries setup in all page mappings. 2667 + * pgdat->lru_lock and migration entries setup in all page mappings. 2678 2668 */ 2679 2669 void mem_cgroup_split_huge_fixup(struct page *head) 2680 2670 { ··· 3347 3337 const struct numa_stat *stat; 3348 3338 int nid; 3349 3339 unsigned long nr; 3350 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3340 + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 3351 3341 3352 3342 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3353 3343 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); ··· 3398 3388 3399 3389 static int memcg_stat_show(struct seq_file *m, void *v) 3400 3390 { 3401 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3391 + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 3402 3392 unsigned long memory, memsw; 3403 3393 struct mem_cgroup *mi; 3404 3394 unsigned int i; ··· 3636 3626 size = thresholds->primary ? 
thresholds->primary->size + 1 : 1; 3637 3627 3638 3628 /* Allocate memory for new array of thresholds */ 3639 - new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 3640 - GFP_KERNEL); 3629 + new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 3641 3630 if (!new) { 3642 3631 ret = -ENOMEM; 3643 3632 goto unlock; ··· 3830 3821 3831 3822 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 3832 3823 { 3833 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 3824 + struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 3834 3825 3835 3826 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 3836 3827 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); ··· 4429 4420 static struct mem_cgroup *mem_cgroup_alloc(void) 4430 4421 { 4431 4422 struct mem_cgroup *memcg; 4432 - size_t size; 4423 + unsigned int size; 4433 4424 int node; 4434 4425 4435 4426 size = sizeof(struct mem_cgroup); ··· 5363 5354 root_mem_cgroup->use_hierarchy = false; 5364 5355 } 5365 5356 5357 + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 5358 + { 5359 + if (value == PAGE_COUNTER_MAX) 5360 + seq_puts(m, "max\n"); 5361 + else 5362 + seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 5363 + 5364 + return 0; 5365 + } 5366 + 5366 5367 static u64 memory_current_read(struct cgroup_subsys_state *css, 5367 5368 struct cftype *cft) 5368 5369 { ··· 5383 5364 5384 5365 static int memory_min_show(struct seq_file *m, void *v) 5385 5366 { 5386 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5387 - unsigned long min = READ_ONCE(memcg->memory.min); 5388 - 5389 - if (min == PAGE_COUNTER_MAX) 5390 - seq_puts(m, "max\n"); 5391 - else 5392 - seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE); 5393 - 5394 - return 0; 5367 + return seq_puts_memcg_tunable(m, 5368 + READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 5395 5369 } 5396 5370 5397 5371 static ssize_t memory_min_write(struct kernfs_open_file *of, ··· 5406 5394 5407 5395 static int memory_low_show(struct seq_file *m, void *v) 5408 5396 { 5409 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5410 - unsigned long low = READ_ONCE(memcg->memory.low); 5411 - 5412 - if (low == PAGE_COUNTER_MAX) 5413 - seq_puts(m, "max\n"); 5414 - else 5415 - seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); 5416 - 5417 - return 0; 5397 + return seq_puts_memcg_tunable(m, 5398 + READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 5418 5399 } 5419 5400 5420 5401 static ssize_t memory_low_write(struct kernfs_open_file *of, ··· 5429 5424 5430 5425 static int memory_high_show(struct seq_file *m, void *v) 5431 5426 { 5432 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5433 - unsigned long high = READ_ONCE(memcg->high); 5434 - 5435 - if (high == PAGE_COUNTER_MAX) 5436 - seq_puts(m, "max\n"); 5437 - else 5438 - seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); 5439 - 5440 - return 0; 5427 + return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high)); 5441 5428 } 5442 5429 5443 5430 static ssize_t memory_high_write(struct kernfs_open_file *of, ··· 5458 5461 5459 5462 static int memory_max_show(struct seq_file *m, void *v) 5460 5463 { 5461 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5462 - unsigned long max = READ_ONCE(memcg->memory.max); 5463 - 5464 - if (max == PAGE_COUNTER_MAX) 5465 - seq_puts(m, "max\n"); 5466 - else 5467 - seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); 5468 - 5469 - return 0; 5464 + return seq_puts_memcg_tunable(m, 5465 + 
READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 5470 5466 } 5471 5467 5472 5468 static ssize_t memory_max_write(struct kernfs_open_file *of, ··· 5513 5523 5514 5524 static int memory_events_show(struct seq_file *m, void *v) 5515 5525 { 5516 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5526 + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 5517 5527 5518 5528 seq_printf(m, "low %lu\n", 5519 5529 atomic_long_read(&memcg->memory_events[MEMCG_LOW])); ··· 5531 5541 5532 5542 static int memory_stat_show(struct seq_file *m, void *v) 5533 5543 { 5534 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5544 + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 5535 5545 struct accumulated_stats acc; 5536 5546 int i; 5537 5547 ··· 5572 5582 seq_printf(m, "file_writeback %llu\n", 5573 5583 (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); 5574 5584 5585 + /* 5586 + * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter 5587 + * with the NR_ANON_THP vm counter, but right now it's a pain in the 5588 + * arse because it requires migrating the work out of rmap to a place 5589 + * where the page->mem_cgroup is set up and stable. 5590 + */ 5591 + seq_printf(m, "anon_thp %llu\n", 5592 + (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE); 5593 + 5575 5594 for (i = 0; i < NR_LRU_LISTS; i++) 5576 5595 seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], 5577 5596 (u64)acc.lru_pages[i] * PAGE_SIZE); ··· 5612 5613 seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); 5613 5614 seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); 5614 5615 5616 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5617 + seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]); 5618 + seq_printf(m, "thp_collapse_alloc %lu\n", 5619 + acc.events[THP_COLLAPSE_ALLOC]); 5620 + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 5621 + 5615 5622 return 0; 5616 5623 } 5617 5624 5618 5625 static int memory_oom_group_show(struct seq_file *m, void *v) 5619 5626 { 5620 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5627 + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 5621 5628 5622 5629 seq_printf(m, "%d\n", memcg->oom_group); 5623 5630 ··· 5752 5747 * 5753 5748 * | memory.current, if memory.current < memory.low 5754 5749 * low_usage = | 5755 - | 0, otherwise. 5750 + * | 0, otherwise. 5756 5751 * 5757 5752 * 5758 5753 * Such definition of the effective memory.low provides the expected ··· 6606 6601 6607 6602 static int swap_max_show(struct seq_file *m, void *v) 6608 6603 { 6609 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 6610 - unsigned long max = READ_ONCE(memcg->swap.max); 6611 - 6612 - if (max == PAGE_COUNTER_MAX) 6613 - seq_puts(m, "max\n"); 6614 - else 6615 - seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); 6616 - 6617 - return 0; 6604 + return seq_puts_memcg_tunable(m, 6605 + READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); 6618 6606 } 6619 6607 6620 6608 static ssize_t swap_max_write(struct kernfs_open_file *of, ··· 6629 6631 6630 6632 static int swap_events_show(struct seq_file *m, void *v) 6631 6633 { 6632 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 6634 + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6633 6635 6634 6636 seq_printf(m, "max %lu\n", 6635 6637 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
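Among the memcontrol.c cleanups, the threshold-array allocation now uses struct_size(new, entries, size) instead of adding `size * sizeof(struct mem_cgroup_threshold)` by hand; the real helper lives in <linux/overflow.h> and saturates on overflow. Below is a rough userspace approximation of the sizing idiom for a structure ending in a flexible array member; the saturation behaviour is deliberately omitted, and the structure names are made up for the example.

/* Rough stand-in for the struct_size() idiom: size a structure with a
 * trailing flexible array member.  Overflow saturation is omitted here. */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct threshold {
	unsigned long value;
};

struct threshold_ary {
	size_t size;
	struct threshold entries[];	/* flexible array member */
};

#define STRUCT_SIZE(type, member, count) \
	(offsetof(type, member) + (count) * sizeof(((type *)0)->member[0]))

int main(void)
{
	size_t n = 4;
	struct threshold_ary *ary = malloc(STRUCT_SIZE(struct threshold_ary, entries, n));

	if (!ary)
		return 1;
	ary->size = n;
	for (size_t i = 0; i < n; i++)
		ary->entries[i].value = i * 100;
	printf("allocated %zu bytes for %zu entries\n",
	       STRUCT_SIZE(struct threshold_ary, entries, n), n);
	free(ary);
	return 0;
}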
+2 -1
mm/memfd.c
··· 131 131 #define F_ALL_SEALS (F_SEAL_SEAL | \ 132 132 F_SEAL_SHRINK | \ 133 133 F_SEAL_GROW | \ 134 - F_SEAL_WRITE) 134 + F_SEAL_WRITE | \ 135 + F_SEAL_FUTURE_WRITE) 135 136 136 137 static int memfd_add_seals(struct file *file, unsigned int seals) 137 138 {
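The memfd.c hunk adds F_SEAL_FUTURE_WRITE to the set of seals memfd_add_seals() will accept. Unlike F_SEAL_WRITE it only blocks new write access: write(2) and new shared writable mappings fail with EPERM, while mappings that already existed when the seal was applied keep working. A minimal sketch, assuming a kernel that supports the new seal and a libc with memfd_create(); the fallback #define mirrors the uapi value in case the installed headers predate it.

/* Minimal sketch of sealing a memfd against future writes. */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE	0x0010	/* as in include/uapi/linux/fcntl.h */
#endif

int main(void)
{
	int fd = memfd_create("sealed-demo", MFD_ALLOW_SEALING);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	(void)write(fd, "before seal", 11);

	if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0) {
		perror("F_ADD_SEALS");		/* older kernels reject the flag */
		return 1;
	}

	/* New write access is refused ... */
	if (write(fd, "after seal", 10) < 0)
		perror("write after F_SEAL_FUTURE_WRITE");
	if (mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0) == MAP_FAILED)
		perror("PROT_WRITE mmap after F_SEAL_FUTURE_WRITE");

	/* ... but the earlier contents stay readable. */
	char buf[12] = {0};
	(void)pread(fd, buf, 11, 0);
	printf("read back: %s\n", buf);

	close(fd);
	return 0;
}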
+6 -8
mm/memory-failure.c
··· 1825 1825 struct page *hpage = compound_head(page); 1826 1826 1827 1827 if (!PageHuge(page) && PageTransHuge(hpage)) { 1828 - lock_page(hpage); 1829 - if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { 1830 - unlock_page(hpage); 1831 - if (!PageAnon(hpage)) 1828 + lock_page(page); 1829 + if (!PageAnon(page) || unlikely(split_huge_page(page))) { 1830 + unlock_page(page); 1831 + if (!PageAnon(page)) 1832 1832 pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); 1833 1833 else 1834 1834 pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); 1835 - put_hwpoison_page(hpage); 1835 + put_hwpoison_page(page); 1836 1836 return -EBUSY; 1837 1837 } 1838 - unlock_page(hpage); 1839 - get_hwpoison_page(page); 1840 - put_hwpoison_page(hpage); 1838 + unlock_page(page); 1841 1839 } 1842 1840 1843 1841 /*
+48 -24
mm/memory.c
··· 69 69 #include <linux/userfaultfd_k.h> 70 70 #include <linux/dax.h> 71 71 #include <linux/oom.h> 72 + #include <linux/numa.h> 72 73 73 74 #include <asm/io.h> 74 75 #include <asm/mmu_context.h> ··· 1452 1451 spinlock_t *ptl; 1453 1452 1454 1453 retval = -EINVAL; 1455 - if (PageAnon(page)) 1454 + if (PageAnon(page) || PageSlab(page) || page_has_type(page)) 1456 1455 goto out; 1457 1456 retval = -ENOMEM; 1458 1457 flush_dcache_page(page); ··· 1504 1503 * under mm->mmap_sem write-lock, so it can change vma->vm_flags. 1505 1504 * Caller must set VM_MIXEDMAP on vma if it wants to call this 1506 1505 * function from other places, for example from page-fault handler. 1506 + * 1507 + * Return: %0 on success, negative error code otherwise. 1507 1508 */ 1508 1509 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 1509 1510 struct page *page) ··· 1833 1830 * @size: size of map area 1834 1831 * @prot: page protection flags for this mapping 1835 1832 * 1836 - * Note: this is only safe if the mm semaphore is held when called. 1833 + * Note: this is only safe if the mm semaphore is held when called. 1834 + * 1835 + * Return: %0 on success, negative error code otherwise. 1837 1836 */ 1838 1837 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 1839 1838 unsigned long pfn, unsigned long size, pgprot_t prot) ··· 1908 1903 * 1909 1904 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get 1910 1905 * whatever write-combining details or similar. 1906 + * 1907 + * Return: %0 on success, negative error code otherwise. 1911 1908 */ 1912 1909 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) 1913 1910 { ··· 2388 2381 * 2389 2382 * This function handles all that is needed to finish a write page fault in a 2390 2383 * shared mapping due to PTE being read-only once the mapped page is prepared. 2391 - * It handles locking of PTE and modifying it. The function returns 2392 - * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE 2393 - * lock. 2384 + * It handles locking of PTE and modifying it. 2394 2385 * 2395 2386 * The function expects the page to be locked or other protection against 2396 2387 * concurrent faults / writeback (such as DAX radix tree locks). 2388 + * 2389 + * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before 2390 + * we acquired PTE lock. 2397 2391 */ 2398 2392 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) 2399 2393 { ··· 2512 2504 * Take out anonymous pages first, anonymous shared vmas are 2513 2505 * not dirty accountable. 
2514 2506 */ 2515 - if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { 2507 + if (PageAnon(vmf->page)) { 2516 2508 int total_map_swapcount; 2509 + if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) || 2510 + page_count(vmf->page) != 1)) 2511 + goto copy; 2517 2512 if (!trylock_page(vmf->page)) { 2518 2513 get_page(vmf->page); 2519 2514 pte_unmap_unlock(vmf->pte, vmf->ptl); ··· 2530 2519 return 0; 2531 2520 } 2532 2521 put_page(vmf->page); 2522 + } 2523 + if (PageKsm(vmf->page)) { 2524 + bool reused = reuse_ksm_page(vmf->page, vmf->vma, 2525 + vmf->address); 2526 + unlock_page(vmf->page); 2527 + if (!reused) 2528 + goto copy; 2529 + wp_page_reuse(vmf); 2530 + return VM_FAULT_WRITE; 2533 2531 } 2534 2532 if (reuse_swap_page(vmf->page, &total_map_swapcount)) { 2535 2533 if (total_map_swapcount == 1) { ··· 2560 2540 (VM_WRITE|VM_SHARED))) { 2561 2541 return wp_page_shared(vmf); 2562 2542 } 2563 - 2543 + copy: 2564 2544 /* 2565 2545 * Ok, we need to copy. Oh, well.. 2566 2546 */ ··· 3221 3201 * 3222 3202 * Target users are page handler itself and implementations of 3223 3203 * vm_ops->map_pages. 3204 + * 3205 + * Return: %0 on success, %VM_FAULT_ code in case of error. 3224 3206 */ 3225 3207 vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, 3226 3208 struct page *page) ··· 3283 3261 * This function handles all that is needed to finish a page fault once the 3284 3262 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for 3285 3263 * given page, adds reverse page mapping, handles memcg charges and LRU 3286 - * addition. The function returns 0 on success, VM_FAULT_ code in case of 3287 - * error. 3264 + * addition. 3288 3265 * 3289 3266 * The function expects the page to be locked and on success it consumes a 3290 3267 * reference of a page being mapped (for the PTE which maps it). 3268 + * 3269 + * Return: %0 on success, %VM_FAULT_ code in case of error. 3291 3270 */ 3292 3271 vm_fault_t finish_fault(struct vm_fault *vmf) 3293 3272 { ··· 3344 3321 3345 3322 static int __init fault_around_debugfs(void) 3346 3323 { 3347 - void *ret; 3348 - 3349 - ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, 3350 - &fault_around_bytes_fops); 3351 - if (!ret) 3352 - pr_warn("Failed to create fault_around_bytes in debugfs"); 3324 + debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, 3325 + &fault_around_bytes_fops); 3353 3326 return 0; 3354 3327 } 3355 3328 late_initcall(fault_around_debugfs); ··· 3536 3517 * but allow concurrent faults). 3537 3518 * The mmap_sem may have been released depending on flags and our 3538 3519 * return value. See filemap_fault() and __lock_page_or_retry(). 3520 + * If mmap_sem is released, vma may become invalid (for example 3521 + * by other thread calling munmap()). 
3539 3522 */ 3540 3523 static vm_fault_t do_fault(struct vm_fault *vmf) 3541 3524 { 3542 3525 struct vm_area_struct *vma = vmf->vma; 3526 + struct mm_struct *vm_mm = vma->vm_mm; 3543 3527 vm_fault_t ret; 3544 3528 3545 3529 /* ··· 3583 3561 3584 3562 /* preallocated pagetable is unused: free it */ 3585 3563 if (vmf->prealloc_pte) { 3586 - pte_free(vma->vm_mm, vmf->prealloc_pte); 3564 + pte_free(vm_mm, vmf->prealloc_pte); 3587 3565 vmf->prealloc_pte = NULL; 3588 3566 } 3589 3567 return ret; ··· 3608 3586 { 3609 3587 struct vm_area_struct *vma = vmf->vma; 3610 3588 struct page *page = NULL; 3611 - int page_nid = -1; 3589 + int page_nid = NUMA_NO_NODE; 3612 3590 int last_cpupid; 3613 3591 int target_nid; 3614 3592 bool migrated = false; 3615 - pte_t pte; 3593 + pte_t pte, old_pte; 3616 3594 bool was_writable = pte_savedwrite(vmf->orig_pte); 3617 3595 int flags = 0; 3618 3596 ··· 3632 3610 * Make it present again, Depending on how arch implementes non 3633 3611 * accessible ptes, some can allow access by kernel mode. 3634 3612 */ 3635 - pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte); 3636 - pte = pte_modify(pte, vma->vm_page_prot); 3613 + old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); 3614 + pte = pte_modify(old_pte, vma->vm_page_prot); 3637 3615 pte = pte_mkyoung(pte); 3638 3616 if (was_writable) 3639 3617 pte = pte_mkwrite(pte); 3640 - ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte); 3618 + ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); 3641 3619 update_mmu_cache(vma, vmf->address, vmf->pte); 3642 3620 3643 3621 page = vm_normal_page(vma, vmf->address, pte); ··· 3675 3653 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, 3676 3654 &flags); 3677 3655 pte_unmap_unlock(vmf->pte, vmf->ptl); 3678 - if (target_nid == -1) { 3656 + if (target_nid == NUMA_NO_NODE) { 3679 3657 put_page(page); 3680 3658 goto out; 3681 3659 } ··· 3689 3667 flags |= TNF_MIGRATE_FAIL; 3690 3668 3691 3669 out: 3692 - if (page_nid != -1) 3670 + if (page_nid != NUMA_NO_NODE) 3693 3671 task_numa_fault(last_cpupid, page_nid, 1, flags); 3694 3672 return 0; 3695 3673 } ··· 4172 4150 * 4173 4151 * Only IO mappings and raw PFN mappings are allowed. 4174 4152 * 4175 - * Returns zero and the pfn at @pfn on success, -ve otherwise. 4153 + * Return: zero and the pfn at @pfn on success, -ve otherwise. 4176 4154 */ 4177 4155 int follow_pfn(struct vm_area_struct *vma, unsigned long address, 4178 4156 unsigned long *pfn) ··· 4322 4300 * @gup_flags: flags modifying lookup behaviour 4323 4301 * 4324 4302 * The caller must hold a reference on @mm. 4303 + * 4304 + * Return: number of bytes copied from source to destination. 4325 4305 */ 4326 4306 int access_remote_vm(struct mm_struct *mm, unsigned long addr, 4327 4307 void *buf, int len, unsigned int gup_flags)
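Several of the memory.c hunks (and the mempool.c, page-writeback.c and page_alloc.c ones below) convert free-form "Returns ..." sentences into kernel-doc "Return:" sections so the generated documentation picks them up. For reference, this is the shape of such a comment on a hypothetical helper; the function itself is made up and only the formatting is the point.

/**
 * demo_lookup() - look up a value in a hypothetical table
 * @key: key to search for
 *
 * Formatting example only for the kernel-doc style used in the hunks above;
 * demo_lookup() does not exist in the tree.
 *
 * Return: %0 on success, %-ENOENT if @key is not present.
 */
int demo_lookup(unsigned long key);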
+34 -21
mm/memory_hotplug.c
··· 47 47 * and restore_online_page_callback() for generic callback restore. 48 48 */ 49 49 50 - static void generic_online_page(struct page *page); 50 + static void generic_online_page(struct page *page, unsigned int order); 51 51 52 52 static online_page_callback_t online_page_callback = generic_online_page; 53 53 static DEFINE_MUTEX(online_page_callback_lock); ··· 656 656 } 657 657 EXPORT_SYMBOL_GPL(__online_page_free); 658 658 659 - static void generic_online_page(struct page *page) 659 + static void generic_online_page(struct page *page, unsigned int order) 660 660 { 661 - __online_page_set_limits(page); 662 - __online_page_increment_counters(page); 663 - __online_page_free(page); 661 + kernel_map_pages(page, 1 << order, 1); 662 + __free_pages_core(page, order); 663 + totalram_pages_add(1UL << order); 664 + #ifdef CONFIG_HIGHMEM 665 + if (PageHighMem(page)) 666 + totalhigh_pages_add(1UL << order); 667 + #endif 668 + } 669 + 670 + static int online_pages_blocks(unsigned long start, unsigned long nr_pages) 671 + { 672 + unsigned long end = start + nr_pages; 673 + int order, onlined_pages = 0; 674 + 675 + while (start < end) { 676 + order = min(MAX_ORDER - 1, 677 + get_order(PFN_PHYS(end) - PFN_PHYS(start))); 678 + (*online_page_callback)(pfn_to_page(start), order); 679 + 680 + onlined_pages += (1UL << order); 681 + start += (1UL << order); 682 + } 683 + return onlined_pages; 664 684 } 665 685 666 686 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 667 687 void *arg) 668 688 { 669 - unsigned long i; 670 689 unsigned long onlined_pages = *(unsigned long *)arg; 671 - struct page *page; 672 690 673 691 if (PageReserved(pfn_to_page(start_pfn))) 674 - for (i = 0; i < nr_pages; i++) { 675 - page = pfn_to_page(start_pfn + i); 676 - (*online_page_callback)(page); 677 - onlined_pages++; 678 - } 692 + onlined_pages += online_pages_blocks(start_pfn, nr_pages); 679 693 680 694 online_mem_sections(start_pfn, start_pfn + nr_pages); 681 695 ··· 703 689 { 704 690 int nid = zone_to_nid(zone); 705 691 706 - arg->status_change_nid = -1; 707 - arg->status_change_nid_normal = -1; 708 - arg->status_change_nid_high = -1; 692 + arg->status_change_nid = NUMA_NO_NODE; 693 + arg->status_change_nid_normal = NUMA_NO_NODE; 694 + arg->status_change_nid_high = NUMA_NO_NODE; 709 695 710 696 if (!node_state(nid, N_MEMORY)) 711 697 arg->status_change_nid = nid; ··· 1379 1365 1380 1366 if (PageHuge(page)) { 1381 1367 struct page *head = compound_head(page); 1382 - pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1383 1368 if (compound_order(head) > PFN_SECTION_SHIFT) { 1384 1369 ret = -EBUSY; 1385 1370 break; 1386 1371 } 1387 - isolate_huge_page(page, &source); 1372 + pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1373 + isolate_huge_page(head, &source); 1388 1374 continue; 1389 1375 } else if (PageTransHuge(page)) 1390 1376 pfn = page_to_pfn(compound_head(page)) ··· 1510 1496 unsigned long present_pages = 0; 1511 1497 enum zone_type zt; 1512 1498 1513 - arg->status_change_nid = -1; 1514 - arg->status_change_nid_normal = -1; 1515 - arg->status_change_nid_high = -1; 1499 + arg->status_change_nid = NUMA_NO_NODE; 1500 + arg->status_change_nid_normal = NUMA_NO_NODE; 1501 + arg->status_change_nid_high = NUMA_NO_NODE; 1516 1502 1517 1503 /* 1518 1504 * Check whether node_states[N_NORMAL_MEMORY] will be changed. 
··· 1626 1612 1627 1613 cond_resched(); 1628 1614 lru_add_drain_all(); 1629 - drain_all_pages(zone); 1630 1615 1631 1616 pfn = scan_movable_pages(pfn, end_pfn); 1632 1617 if (pfn) {
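With the memory_hotplug.c change, generic_online_page() takes an order and online_pages_blocks() walks the newly onlined range handing pages to the page allocator in large blocks (capped at MAX_ORDER - 1) instead of one page at a time. The standalone sketch below only visualizes that chunking; note that it rounds each block down to the largest power of two that fits the remaining span, whereas the kernel helper derives the order with get_order() on the remaining bytes.

/* Illustration only: carve a page range into power-of-two blocks capped at
 * MAX_ORDER - 1, the "free in big chunks" idea behind online_pages_blocks(). */
#include <stdio.h>

#define MAX_ORDER 11	/* typical value */

static unsigned int fitting_order(unsigned long nr_pages)
{
	unsigned int order = 0;

	while (order + 1 < MAX_ORDER && (2UL << order) <= nr_pages)
		order++;
	return order;
}

int main(void)
{
	unsigned long start = 0, end = 1UL << 15;	/* 32768 pages */

	while (start < end) {
		unsigned int order = fitting_order(end - start);

		printf("online pfn %lu..%lu as one order-%u block (%lu pages)\n",
		       start, start + (1UL << order) - 1, order, 1UL << order);
		start += 1UL << order;
	}
	return 0;
}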
+2 -2
mm/mempolicy.c
··· 350 350 { 351 351 if (!pol) 352 352 return; 353 - if (!mpol_store_user_nodemask(pol) && 353 + if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && 354 354 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 355 355 return; 356 356 ··· 2304 2304 unsigned long pgoff; 2305 2305 int thiscpu = raw_smp_processor_id(); 2306 2306 int thisnid = cpu_to_node(thiscpu); 2307 - int polnid = -1; 2307 + int polnid = NUMA_NO_NODE; 2308 2308 int ret = -1; 2309 2309 2310 2310 pol = get_vma_policy(vma, addr);
+8
mm/mempool.c
··· 222 222 * 223 223 * Like mempool_create(), but initializes the pool in (i.e. embedded in another 224 224 * structure). 225 + * 226 + * Return: %0 on success, negative error code otherwise. 225 227 */ 226 228 int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, 227 229 mempool_free_t *free_fn, void *pool_data) ··· 247 245 * functions. This function might sleep. Both the alloc_fn() and the free_fn() 248 246 * functions might sleep - as long as the mempool_alloc() function is not called 249 247 * from IRQ contexts. 248 + * 249 + * Return: pointer to the created memory pool object or %NULL on error. 250 250 */ 251 251 mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 252 252 mempool_free_t *free_fn, void *pool_data) ··· 293 289 * Note, the caller must guarantee that no mempool_destroy is called 294 290 * while this function is running. mempool_alloc() & mempool_free() 295 291 * might be called (eg. from IRQ contexts) while this function executes. 292 + * 293 + * Return: %0 on success, negative error code otherwise. 296 294 */ 297 295 int mempool_resize(mempool_t *pool, int new_min_nr) 298 296 { ··· 369 363 * *never* fails when called from process contexts. (it might 370 364 * fail if called from an IRQ context.) 371 365 * Note: using __GFP_ZERO is not supported. 366 + * 367 + * Return: pointer to the allocated element or %NULL on error. 372 368 */ 373 369 void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) 374 370 {
+7 -7
mm/migrate.c
··· 100 100 /* 101 101 * Check PageMovable before holding a PG_lock because page's owner 102 102 * assumes anybody doesn't touch PG_lock of newly allocated page 103 - * so unconditionally grapping the lock ruins page's owner side. 103 + * so unconditionally grabbing the lock ruins page's owner side. 104 104 */ 105 105 if (unlikely(!__PageMovable(page))) 106 106 goto out_putpage; ··· 374 374 } 375 375 #endif 376 376 377 - static int expected_page_refs(struct page *page) 377 + static int expected_page_refs(struct address_space *mapping, struct page *page) 378 378 { 379 379 int expected_count = 1; 380 380 ··· 384 384 */ 385 385 expected_count += is_device_private_page(page); 386 386 expected_count += is_device_public_page(page); 387 - if (page_mapping(page)) 387 + if (mapping) 388 388 expected_count += hpage_nr_pages(page) + page_has_private(page); 389 389 390 390 return expected_count; ··· 405 405 XA_STATE(xas, &mapping->i_pages, page_index(page)); 406 406 struct zone *oldzone, *newzone; 407 407 int dirty; 408 - int expected_count = expected_page_refs(page) + extra_count; 408 + int expected_count = expected_page_refs(mapping, page) + extra_count; 409 409 410 410 if (!mapping) { 411 411 /* Anonymous page without mapping */ ··· 750 750 return migrate_page(mapping, newpage, page, mode); 751 751 752 752 /* Check whether page does not have extra refs before we do more work */ 753 - expected_count = expected_page_refs(page); 753 + expected_count = expected_page_refs(mapping, page); 754 754 if (page_count(page) != expected_count) 755 755 return -EAGAIN; 756 756 ··· 911 911 */ 912 912 if (page_has_private(page) && 913 913 !try_to_release_page(page, GFP_KERNEL)) 914 - return -EAGAIN; 914 + return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; 915 915 916 916 return migrate_page(mapping, newpage, page, mode); 917 917 } ··· 1287 1287 struct anon_vma *anon_vma = NULL; 1288 1288 1289 1289 /* 1290 - * Movability of hugepages depends on architectures and hugepage size. 1290 + * Migratability of hugepages depends on architectures and their size. 1291 1291 * This check is necessary because some callers of hugepage migration 1292 1292 * like soft offline and memory hotremove don't walk through page 1293 1293 * tables or check whether the hugepage is pmd-based or not before
+7 -7
mm/mlock.c
··· 182 182 unsigned int munlock_vma_page(struct page *page) 183 183 { 184 184 int nr_pages; 185 - struct zone *zone = page_zone(page); 185 + pg_data_t *pgdat = page_pgdat(page); 186 186 187 187 /* For try_to_munlock() and to serialize with page migration */ 188 188 BUG_ON(!PageLocked(page)); ··· 194 194 * might otherwise copy PageMlocked to part of the tail pages before 195 195 * we clear it in the head page. It also stabilizes hpage_nr_pages(). 196 196 */ 197 - spin_lock_irq(zone_lru_lock(zone)); 197 + spin_lock_irq(&pgdat->lru_lock); 198 198 199 199 if (!TestClearPageMlocked(page)) { 200 200 /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ ··· 203 203 } 204 204 205 205 nr_pages = hpage_nr_pages(page); 206 - __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); 206 + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); 207 207 208 208 if (__munlock_isolate_lru_page(page, true)) { 209 - spin_unlock_irq(zone_lru_lock(zone)); 209 + spin_unlock_irq(&pgdat->lru_lock); 210 210 __munlock_isolated_page(page); 211 211 goto out; 212 212 } 213 213 __munlock_isolation_failed(page); 214 214 215 215 unlock_out: 216 - spin_unlock_irq(zone_lru_lock(zone)); 216 + spin_unlock_irq(&pgdat->lru_lock); 217 217 218 218 out: 219 219 return nr_pages - 1; ··· 298 298 pagevec_init(&pvec_putback); 299 299 300 300 /* Phase 1: page isolation */ 301 - spin_lock_irq(zone_lru_lock(zone)); 301 + spin_lock_irq(&zone->zone_pgdat->lru_lock); 302 302 for (i = 0; i < nr; i++) { 303 303 struct page *page = pvec->pages[i]; 304 304 ··· 325 325 pvec->pages[i] = NULL; 326 326 } 327 327 __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); 328 - spin_unlock_irq(zone_lru_lock(zone)); 328 + spin_unlock_irq(&zone->zone_pgdat->lru_lock); 329 329 330 330 /* Now we can release pins of pages that we are not munlocking */ 331 331 pagevec_release(&pvec_putback);
+7 -8
mm/mmap.c
··· 438 438 { 439 439 /* 440 440 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback 441 - * function that does exacltly what we want. 441 + * function that does exactly what we want. 442 442 */ 443 443 vma_gap_callbacks_propagate(&vma->vm_rb, NULL); 444 444 } ··· 1012 1012 * VM_SOFTDIRTY should not prevent from VMA merging, if we 1013 1013 * match the flags but dirty bit -- the caller should mark 1014 1014 * merged VMA as dirty. If dirty bit won't be excluded from 1015 - * comparison, we increase pressue on the memory system forcing 1015 + * comparison, we increase pressure on the memory system forcing 1016 1016 * the kernel to generate new VMAs when old one could be 1017 1017 * extended instead. 1018 1018 */ ··· 1115 1115 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN 1116 1116 * might become case 1 below case 2 below case 3 below 1117 1117 * 1118 - * It is important for case 8 that the the vma NNNN overlapping the 1118 + * It is important for case 8 that the vma NNNN overlapping the 1119 1119 * region AAAA is never going to extended over XXXX. Instead XXXX must 1120 1120 * be extended in region AAAA and NNNN must be removed. This way in 1121 1121 * all cases where vma_merge succeeds, the moment vma_adjust drops the ··· 1645 1645 #endif /* __ARCH_WANT_SYS_OLD_MMAP */ 1646 1646 1647 1647 /* 1648 - * Some shared mappigns will want the pages marked read-only 1648 + * Some shared mappings will want the pages marked read-only 1649 1649 * to track write events. If so, we'll downgrade vm_page_prot 1650 1650 * to the private version (using protection_map[] without the 1651 1651 * VM_SHARED bit). ··· 2126 2126 */ 2127 2127 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 2128 2128 unsigned long 2129 - arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, 2130 - const unsigned long len, const unsigned long pgoff, 2131 - const unsigned long flags) 2129 + arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, 2130 + unsigned long len, unsigned long pgoff, 2131 + unsigned long flags) 2132 2132 { 2133 2133 struct vm_area_struct *vma, *prev; 2134 2134 struct mm_struct *mm = current->mm; 2135 - unsigned long addr = addr0; 2136 2135 struct vm_unmapped_area_info info; 2137 2136 const unsigned long mmap_end = arch_get_mmap_end(addr); 2138 2137
+3 -3
mm/mprotect.c
··· 110 110 continue; 111 111 } 112 112 113 - ptent = ptep_modify_prot_start(mm, addr, pte); 114 - ptent = pte_modify(ptent, newprot); 113 + oldpte = ptep_modify_prot_start(vma, addr, pte); 114 + ptent = pte_modify(oldpte, newprot); 115 115 if (preserve_write) 116 116 ptent = pte_mk_savedwrite(ptent); 117 117 ··· 121 121 !(vma->vm_flags & VM_SOFTDIRTY))) { 122 122 ptent = pte_mkwrite(ptent); 123 123 } 124 - ptep_modify_prot_commit(mm, addr, pte, ptent); 124 + ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); 125 125 pages++; 126 126 } else if (IS_ENABLED(CONFIG_MIGRATION)) { 127 127 swp_entry_t entry = pte_to_swp_entry(oldpte);
+17
mm/mremap.c
··· 516 516 if (addr + old_len > new_addr && new_addr + new_len > addr) 517 517 goto out; 518 518 519 + /* 520 + * move_vma() need us to stay 4 maps below the threshold, otherwise 521 + * it will bail out at the very beginning. 522 + * That is a problem if we have already unmaped the regions here 523 + * (new_addr, and old_addr), because userspace will not know the 524 + * state of the vma's after it gets -ENOMEM. 525 + * So, to avoid such scenario we can pre-compute if the whole 526 + * operation has high chances to success map-wise. 527 + * Worst-scenario case is when both vma's (new_addr and old_addr) get 528 + * split in 3 before unmaping it. 529 + * That means 2 more maps (1 for each) to the ones we already hold. 530 + * Check whether current map count plus 2 still leads us to 4 maps below 531 + * the threshold, otherwise return -ENOMEM here to be more safe. 532 + */ 533 + if ((mm->map_count + 2) >= sysctl_max_map_count - 3) 534 + return -ENOMEM; 535 + 519 536 ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); 520 537 if (ret) 521 538 goto out;
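The mremap.c hunk makes mremap_to() fail early with -ENOMEM when the map count is already within a few entries of sysctl_max_map_count, because a later failure in move_vma() would leave userspace with both the source and the destination ranges already unmapped. The path being guarded is the MREMAP_FIXED variant of mremap(2); a minimal userspace sketch of such a call is below (the sizes and the PROT_NONE placeholder mapping are just for the example).

/* Minimal sketch of an mremap(MREMAP_FIXED) call, the path guarded by the
 * new map-count check.  The destination is reserved with a PROT_NONE mapping
 * only so the example has a known, valid target address. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	void *old = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *target = mmap(NULL, len, PROT_NONE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (old == MAP_FAILED || target == MAP_FAILED)
		return 1;

	memset(old, 0xaa, len);

	/* Move the mapping to the chosen address, replacing whatever is there.
	 * With the check above, an -ENOMEM failure leaves the old mapping intact. */
	void *new = mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, target);
	if (new == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("moved %zu bytes from %p to %p\n", len, old, new);
	return 0;
}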
+20 -61
mm/oom_kill.c
··· 843 843 return ret; 844 844 } 845 845 846 - static void __oom_kill_process(struct task_struct *victim) 846 + static void __oom_kill_process(struct task_struct *victim, const char *message) 847 847 { 848 848 struct task_struct *p; 849 849 struct mm_struct *mm; ··· 874 874 */ 875 875 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); 876 876 mark_oom_victim(victim); 877 - pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", 878 - task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), 877 + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", 878 + message, task_pid_nr(victim), victim->comm, 879 + K(victim->mm->total_vm), 879 880 K(get_mm_counter(victim->mm, MM_ANONPAGES)), 880 881 K(get_mm_counter(victim->mm, MM_FILEPAGES)), 881 882 K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); ··· 927 926 * Kill provided task unless it's secured by setting 928 927 * oom_score_adj to OOM_SCORE_ADJ_MIN. 929 928 */ 930 - static int oom_kill_memcg_member(struct task_struct *task, void *unused) 929 + static int oom_kill_memcg_member(struct task_struct *task, void *message) 931 930 { 932 - if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { 931 + if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN && 932 + !is_global_init(task)) { 933 933 get_task_struct(task); 934 - __oom_kill_process(task); 934 + __oom_kill_process(task, message); 935 935 } 936 936 return 0; 937 937 } 938 938 939 939 static void oom_kill_process(struct oom_control *oc, const char *message) 940 940 { 941 - struct task_struct *p = oc->chosen; 942 - unsigned int points = oc->chosen_points; 943 - struct task_struct *victim = p; 944 - struct task_struct *child; 945 - struct task_struct *t; 941 + struct task_struct *victim = oc->chosen; 946 942 struct mem_cgroup *oom_group; 947 - unsigned int victim_points = 0; 948 943 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, 949 944 DEFAULT_RATELIMIT_BURST); 950 945 ··· 949 952 * its children or threads, just give it access to memory reserves 950 953 * so it can die quickly 951 954 */ 952 - task_lock(p); 953 - if (task_will_free_mem(p)) { 954 - mark_oom_victim(p); 955 - wake_oom_reaper(p); 956 - task_unlock(p); 957 - put_task_struct(p); 955 + task_lock(victim); 956 + if (task_will_free_mem(victim)) { 957 + mark_oom_victim(victim); 958 + wake_oom_reaper(victim); 959 + task_unlock(victim); 960 + put_task_struct(victim); 958 961 return; 959 962 } 960 - task_unlock(p); 963 + task_unlock(victim); 961 964 962 965 if (__ratelimit(&oom_rs)) 963 - dump_header(oc, p); 964 - 965 - pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", 966 - message, task_pid_nr(p), p->comm, points); 967 - 968 - /* 969 - * If any of p's children has a different mm and is eligible for kill, 970 - * the one with the highest oom_badness() score is sacrificed for its 971 - * parent. This attempts to lose the minimal amount of work done while 972 - * still freeing memory. 973 - */ 974 - read_lock(&tasklist_lock); 975 - 976 - /* 977 - * The task 'p' might have already exited before reaching here. The 978 - * put_task_struct() will free task_struct 'p' while the loop still try 979 - * to access the field of 'p', so, get an extra reference. 
980 - */ 981 - get_task_struct(p); 982 - for_each_thread(p, t) { 983 - list_for_each_entry(child, &t->children, sibling) { 984 - unsigned int child_points; 985 - 986 - if (process_shares_mm(child, p->mm)) 987 - continue; 988 - /* 989 - * oom_badness() returns 0 if the thread is unkillable 990 - */ 991 - child_points = oom_badness(child, 992 - oc->memcg, oc->nodemask, oc->totalpages); 993 - if (child_points > victim_points) { 994 - put_task_struct(victim); 995 - victim = child; 996 - victim_points = child_points; 997 - get_task_struct(victim); 998 - } 999 - } 1000 - } 1001 - put_task_struct(p); 1002 - read_unlock(&tasklist_lock); 966 + dump_header(oc, victim); 1003 967 1004 968 /* 1005 969 * Do we need to kill the entire memory cgroup? ··· 969 1011 */ 970 1012 oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); 971 1013 972 - __oom_kill_process(victim); 1014 + __oom_kill_process(victim, message); 973 1015 974 1016 /* 975 1017 * If necessary, kill all tasks in the selected memory cgroup. 976 1018 */ 977 1019 if (oom_group) { 978 1020 mem_cgroup_print_oom_group(oom_group); 979 - mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL); 1021 + mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, 1022 + (void*)message); 980 1023 mem_cgroup_put(oom_group); 981 1024 } 982 1025 }
+16 -8
mm/page-writeback.c
··· 270 270 * node_dirtyable_memory - number of dirtyable pages in a node 271 271 * @pgdat: the node 272 272 * 273 - * Returns the node's number of pages potentially available for dirty 273 + * Return: the node's number of pages potentially available for dirty 274 274 * page cache. This is the base value for the per-node dirty limits. 275 275 */ 276 276 static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) ··· 355 355 /** 356 356 * global_dirtyable_memory - number of globally dirtyable pages 357 357 * 358 - * Returns the global number of pages potentially available for dirty 358 + * Return: the global number of pages potentially available for dirty 359 359 * page cache. This is the base value for the global dirty limits. 360 360 */ 361 361 static unsigned long global_dirtyable_memory(void) ··· 470 470 * node_dirty_limit - maximum number of dirty pages allowed in a node 471 471 * @pgdat: the node 472 472 * 473 - * Returns the maximum number of dirty pages allowed in a node, based 473 + * Return: the maximum number of dirty pages allowed in a node, based 474 474 * on the node's dirtyable memory. 475 475 */ 476 476 static unsigned long node_dirty_limit(struct pglist_data *pgdat) ··· 495 495 * node_dirty_ok - tells whether a node is within its dirty limits 496 496 * @pgdat: the node to check 497 497 * 498 - * Returns %true when the dirty pages in @pgdat are within the node's 498 + * Return: %true when the dirty pages in @pgdat are within the node's 499 499 * dirty limit, %false if the limit is exceeded. 500 500 */ 501 501 bool node_dirty_ok(struct pglist_data *pgdat) ··· 743 743 * __wb_calc_thresh - @wb's share of dirty throttling threshold 744 744 * @dtc: dirty_throttle_context of interest 745 745 * 746 - * Returns @wb's dirty limit in pages. The term "dirty" in the context of 747 - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. 748 - * 749 746 * Note that balance_dirty_pages() will only seriously take it as a hard limit 750 747 * when sleeping max_pause per page is not enough to keep the dirty pages under 751 748 * control. For example, when the device is completely stalled due to some error ··· 756 759 * 757 760 * The wb's share of dirty limit will be adapting to its throughput and 758 761 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. 762 + * 763 + * Return: @wb's dirty limit in pages. The term "dirty" in the context of 764 + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. 759 765 */ 760 766 static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) 761 767 { ··· 1918 1918 * @wb: bdi_writeback of interest 1919 1919 * 1920 1920 * Determines whether background writeback should keep writing @wb or it's 1921 - * clean enough. Returns %true if writeback should continue. 1921 + * clean enough. 1922 + * 1923 + * Return: %true if writeback should continue. 1922 1924 */ 1923 1925 bool wb_over_bg_thresh(struct bdi_writeback *wb) 1924 1926 { ··· 2149 2147 * lock/page writeback access order inversion - we should only ever lock 2150 2148 * multiple pages in ascending page->index order, and looping back to the start 2151 2149 * of the file violates that rule and causes deadlocks. 
2150 + * 2151 + * Return: %0 on success, negative error code otherwise 2152 2152 */ 2153 2153 int write_cache_pages(struct address_space *mapping, 2154 2154 struct writeback_control *wbc, writepage_t writepage, ··· 2309 2305 * 2310 2306 * This is a library function, which implements the writepages() 2311 2307 * address_space_operation. 2308 + * 2309 + * Return: %0 on success, negative error code otherwise 2312 2310 */ 2313 2311 int generic_writepages(struct address_space *mapping, 2314 2312 struct writeback_control *wbc) ··· 2357 2351 * 2358 2352 * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this 2359 2353 * function returns. 2354 + * 2355 + * Return: %0 on success, negative error code otherwise 2360 2356 */ 2361 2357 int write_one_page(struct page *page) 2362 2358 {
+115 -43
mm/page_alloc.c
··· 289 289 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 290 290 291 291 #if MAX_NUMNODES > 1 292 - int nr_node_ids __read_mostly = MAX_NUMNODES; 293 - int nr_online_nodes __read_mostly = 1; 292 + unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; 293 + unsigned int nr_online_nodes __read_mostly = 1; 294 294 EXPORT_SYMBOL(nr_node_ids); 295 295 EXPORT_SYMBOL(nr_online_nodes); 296 296 #endif ··· 789 789 return 0; 790 790 } 791 791 792 + #ifdef CONFIG_COMPACTION 793 + static inline struct capture_control *task_capc(struct zone *zone) 794 + { 795 + struct capture_control *capc = current->capture_control; 796 + 797 + return capc && 798 + !(current->flags & PF_KTHREAD) && 799 + !capc->page && 800 + capc->cc->zone == zone && 801 + capc->cc->direct_compaction ? capc : NULL; 802 + } 803 + 804 + static inline bool 805 + compaction_capture(struct capture_control *capc, struct page *page, 806 + int order, int migratetype) 807 + { 808 + if (!capc || order != capc->cc->order) 809 + return false; 810 + 811 + /* Do not accidentally pollute CMA or isolated regions*/ 812 + if (is_migrate_cma(migratetype) || 813 + is_migrate_isolate(migratetype)) 814 + return false; 815 + 816 + /* 817 + * Do not let lower order allocations polluate a movable pageblock. 818 + * This might let an unmovable request use a reclaimable pageblock 819 + * and vice-versa but no more than normal fallback logic which can 820 + * have trouble finding a high-order free page. 821 + */ 822 + if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) 823 + return false; 824 + 825 + capc->page = page; 826 + return true; 827 + } 828 + 829 + #else 830 + static inline struct capture_control *task_capc(struct zone *zone) 831 + { 832 + return NULL; 833 + } 834 + 835 + static inline bool 836 + compaction_capture(struct capture_control *capc, struct page *page, 837 + int order, int migratetype) 838 + { 839 + return false; 840 + } 841 + #endif /* CONFIG_COMPACTION */ 842 + 792 843 /* 793 844 * Freeing function for a buddy system allocator. 
794 845 * ··· 873 822 unsigned long uninitialized_var(buddy_pfn); 874 823 struct page *buddy; 875 824 unsigned int max_order; 825 + struct capture_control *capc = task_capc(zone); 876 826 877 827 max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); 878 828 ··· 889 837 890 838 continue_merging: 891 839 while (order < max_order - 1) { 840 + if (compaction_capture(capc, page, order, migratetype)) { 841 + __mod_zone_freepage_state(zone, -(1 << order), 842 + migratetype); 843 + return; 844 + } 892 845 buddy_pfn = __find_buddy_pfn(pfn, order); 893 846 buddy = page + (buddy_pfn - pfn); 894 847 ··· 1113 1056 if (PageMappingFlags(page)) 1114 1057 page->mapping = NULL; 1115 1058 if (memcg_kmem_enabled() && PageKmemcg(page)) 1116 - memcg_kmem_uncharge(page, order); 1059 + __memcg_kmem_uncharge(page, order); 1117 1060 if (check_free) 1118 1061 bad += free_pages_check(page); 1119 1062 if (bad) ··· 1360 1303 local_irq_restore(flags); 1361 1304 } 1362 1305 1363 - static void __init __free_pages_boot_core(struct page *page, unsigned int order) 1306 + void __free_pages_core(struct page *page, unsigned int order) 1364 1307 { 1365 1308 unsigned int nr_pages = 1 << order; 1366 1309 struct page *p = page; ··· 1439 1382 { 1440 1383 if (early_page_uninitialised(pfn)) 1441 1384 return; 1442 - return __free_pages_boot_core(page, order); 1385 + __free_pages_core(page, order); 1443 1386 } 1444 1387 1445 1388 /* ··· 1529 1472 if (nr_pages == pageblock_nr_pages && 1530 1473 (pfn & (pageblock_nr_pages - 1)) == 0) { 1531 1474 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1532 - __free_pages_boot_core(page, pageblock_order); 1475 + __free_pages_core(page, pageblock_order); 1533 1476 return; 1534 1477 } 1535 1478 1536 1479 for (i = 0; i < nr_pages; i++, page++, pfn++) { 1537 1480 if ((pfn & (pageblock_nr_pages - 1)) == 0) 1538 1481 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1539 - __free_pages_boot_core(page, 0); 1482 + __free_pages_core(page, 0); 1540 1483 } 1541 1484 } 1542 1485 ··· 2002 1945 2003 1946 arch_alloc_page(page, order); 2004 1947 kernel_map_pages(page, 1 << order, 1); 2005 - kernel_poison_pages(page, 1 << order, 1); 2006 1948 kasan_alloc_pages(page, order); 1949 + kernel_poison_pages(page, 1 << order, 1); 2007 1950 set_page_owner(page, order, gfp_flags); 2008 1951 } 2009 1952 ··· 3019 2962 * watermark, because we already know our high-order page 3020 2963 * exists. 
3021 2964 */ 3022 - watermark = min_wmark_pages(zone) + (1UL << order); 2965 + watermark = zone->_watermark[WMARK_MIN] + (1UL << order); 3023 2966 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) 3024 2967 return 0; 3025 2968 ··· 3230 3173 3231 3174 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 3232 3175 &fail_page_alloc.attr); 3233 - if (IS_ERR(dir)) 3234 - return PTR_ERR(dir); 3235 3176 3236 - if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 3237 - &fail_page_alloc.ignore_gfp_reclaim)) 3238 - goto fail; 3239 - if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 3240 - &fail_page_alloc.ignore_gfp_highmem)) 3241 - goto fail; 3242 - if (!debugfs_create_u32("min-order", mode, dir, 3243 - &fail_page_alloc.min_order)) 3244 - goto fail; 3177 + debugfs_create_bool("ignore-gfp-wait", mode, dir, 3178 + &fail_page_alloc.ignore_gfp_reclaim); 3179 + debugfs_create_bool("ignore-gfp-highmem", mode, dir, 3180 + &fail_page_alloc.ignore_gfp_highmem); 3181 + debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); 3245 3182 3246 3183 return 0; 3247 - fail: 3248 - debugfs_remove_recursive(dir); 3249 - 3250 - return -ENOMEM; 3251 3184 } 3252 3185 3253 3186 late_initcall(fail_page_alloc_debugfs); ··· 3757 3710 unsigned int alloc_flags, const struct alloc_context *ac, 3758 3711 enum compact_priority prio, enum compact_result *compact_result) 3759 3712 { 3760 - struct page *page; 3713 + struct page *page = NULL; 3761 3714 unsigned long pflags; 3762 3715 unsigned int noreclaim_flag; 3763 3716 ··· 3768 3721 noreclaim_flag = memalloc_noreclaim_save(); 3769 3722 3770 3723 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, 3771 - prio); 3724 + prio, &page); 3772 3725 3773 3726 memalloc_noreclaim_restore(noreclaim_flag); 3774 3727 psi_memstall_leave(&pflags); 3775 3728 3776 - if (*compact_result <= COMPACT_INACTIVE) 3729 + if (*compact_result <= COMPACT_INACTIVE) { 3730 + WARN_ON_ONCE(page); 3777 3731 return NULL; 3732 + } 3778 3733 3779 3734 /* 3780 3735 * At least in one zone compaction wasn't deferred or skipped, so let's ··· 3784 3735 */ 3785 3736 count_vm_event(COMPACTSTALL); 3786 3737 3787 - page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3738 + /* Prep a captured page if available */ 3739 + if (page) 3740 + prep_new_page(page, order, gfp_mask, alloc_flags); 3741 + 3742 + /* Try get a page from the freelist if available */ 3743 + if (!page) 3744 + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3788 3745 3789 3746 if (page) { 3790 3747 struct zone *zone = page_zone(page); ··· 4623 4568 4624 4569 out: 4625 4570 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && 4626 - unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { 4571 + unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) { 4627 4572 __free_pages(page, order); 4628 4573 page = NULL; 4629 4574 } ··· 4816 4761 * This function is also limited by MAX_ORDER. 4817 4762 * 4818 4763 * Memory allocated by this function must be released by free_pages_exact(). 4764 + * 4765 + * Return: pointer to the allocated area or %NULL in case of error. 4819 4766 */ 4820 4767 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 4821 4768 { ··· 4838 4781 * 4839 4782 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 4840 4783 * back. 4784 + * 4785 + * Return: pointer to the allocated area or %NULL in case of error. 
4841 4786 */ 4842 4787 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 4843 4788 { ··· 4873 4814 * nr_free_zone_pages - count number of pages beyond high watermark 4874 4815 * @offset: The zone index of the highest zone 4875 4816 * 4876 - * nr_free_zone_pages() counts the number of counts pages which are beyond the 4817 + * nr_free_zone_pages() counts the number of pages which are beyond the 4877 4818 * high watermark within all zones at or below a given zone index. For each 4878 4819 * zone, the number of pages is calculated as: 4879 4820 * 4880 4821 * nr_free_zone_pages = managed_pages - high_pages 4822 + * 4823 + * Return: number of pages beyond high watermark. 4881 4824 */ 4882 4825 static unsigned long nr_free_zone_pages(int offset) 4883 4826 { ··· 4906 4845 * 4907 4846 * nr_free_buffer_pages() counts the number of pages which are beyond the high 4908 4847 * watermark within ZONE_DMA and ZONE_NORMAL. 4848 + * 4849 + * Return: number of pages beyond high watermark within ZONE_DMA and 4850 + * ZONE_NORMAL. 4909 4851 */ 4910 4852 unsigned long nr_free_buffer_pages(void) 4911 4853 { ··· 4921 4857 * 4922 4858 * nr_free_pagecache_pages() counts the number of pages which are beyond the 4923 4859 * high watermark within all zones. 4860 + * 4861 + * Return: number of pages beyond high watermark within all zones. 4924 4862 */ 4925 4863 unsigned long nr_free_pagecache_pages(void) 4926 4864 { ··· 5369 5303 * from each node to each node in the system), and should also prefer nodes 5370 5304 * with no CPUs, since presumably they'll have very little allocation pressure 5371 5305 * on them otherwise. 5372 - * It returns -1 if no node is found. 5306 + * 5307 + * Return: node id of the found node or %NUMA_NO_NODE if no node is found. 5373 5308 */ 5374 5309 static int find_next_best_node(int node, nodemask_t *used_node_mask) 5375 5310 { ··· 5676 5609 else 5677 5610 page_group_by_mobility_disabled = 0; 5678 5611 5679 - pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", 5612 + pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", 5680 5613 nr_online_nodes, 5681 5614 page_group_by_mobility_disabled ? "off" : "on", 5682 5615 vm_total_pages); ··· 6083 6016 return state->last_nid; 6084 6017 6085 6018 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 6086 - if (nid != -1) { 6019 + if (nid != NUMA_NO_NODE) { 6087 6020 state->last_start = start_pfn; 6088 6021 state->last_end = end_pfn; 6089 6022 state->last_nid = nid; ··· 6281 6214 * @start_pfn: The start PFN to start searching for holes 6282 6215 * @end_pfn: The end PFN to stop searching for holes 6283 6216 * 6284 - * It returns the number of pages frames in memory holes within a range. 6217 + * Return: the number of pages frames in memory holes within a range. 
6285 6218 */ 6286 6219 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 6287 6220 unsigned long end_pfn) ··· 6443 6376 { 6444 6377 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 6445 6378 zone->pageblock_flags = NULL; 6446 - if (usemapsize) 6379 + if (usemapsize) { 6447 6380 zone->pageblock_flags = 6448 6381 memblock_alloc_node_nopanic(usemapsize, 6449 6382 pgdat->node_id); 6383 + if (!zone->pageblock_flags) 6384 + panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", 6385 + usemapsize, zone->name, pgdat->node_id); 6386 + } 6450 6387 } 6451 6388 #else 6452 6389 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, ··· 6680 6609 end = ALIGN(end, MAX_ORDER_NR_PAGES); 6681 6610 size = (end - start) * sizeof(struct page); 6682 6611 map = memblock_alloc_node_nopanic(size, pgdat->node_id); 6612 + if (!map) 6613 + panic("Failed to allocate %ld bytes for node %d memory map\n", 6614 + size, pgdat->node_id); 6683 6615 pgdat->node_mem_map = map + offset; 6684 6616 } 6685 6617 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", ··· 6838 6764 * model has fine enough granularity to avoid incorrect mapping for the 6839 6765 * populated node map. 6840 6766 * 6841 - * Returns the determined alignment in pfn's. 0 if there is no alignment 6767 + * Return: the determined alignment in pfn's. 0 if there is no alignment 6842 6768 * requirement (single node). 6843 6769 */ 6844 6770 unsigned long __init node_map_pfn_alignment(void) 6845 6771 { 6846 6772 unsigned long accl_mask = 0, last_end = 0; 6847 6773 unsigned long start, end, mask; 6848 - int last_nid = -1; 6774 + int last_nid = NUMA_NO_NODE; 6849 6775 int i, nid; 6850 6776 6851 6777 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { ··· 6893 6819 /** 6894 6820 * find_min_pfn_with_active_regions - Find the minimum PFN registered 6895 6821 * 6896 - * It returns the minimum PFN based on information provided via 6822 + * Return: the minimum PFN based on information provided via 6897 6823 * memblock_set_node(). 6898 6824 */ 6899 6825 unsigned long __init find_min_pfn_with_active_regions(void) ··· 7341 7267 7342 7268 return pages; 7343 7269 } 7344 - EXPORT_SYMBOL(free_reserved_area); 7345 7270 7346 7271 #ifdef CONFIG_HIGHMEM 7347 7272 void free_highmem_page(struct page *page) ··· 7569 7496 * value here. 7570 7497 * 7571 7498 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 7572 - * deltas control asynch page reclaim, and so should 7499 + * deltas control async page reclaim, and so should 7573 7500 * not be capped for highmem. 7574 7501 */ 7575 7502 unsigned long min_pages; ··· 8046 7973 8047 7974 /* 8048 7975 * Hugepages are not in LRU lists, but they're movable. 8049 - * We need not scan over tail pages bacause we don't 7976 + * We need not scan over tail pages because we don't 8050 7977 * handle each tail page individually in migration. 8051 7978 */ 8052 7979 if (PageHuge(page)) { ··· 8185 8112 * pageblocks in the range. Once isolated, the pageblocks should not 8186 8113 * be modified by others. 8187 8114 * 8188 - * Returns zero on success or negative error code. On success all 8115 + * Return: zero on success or negative error code. On success all 8189 8116 * pages which PFN is in [start, end) are allocated for the caller and 8190 8117 * need to be freed with free_contig_range(). 8191 8118 */ ··· 8269 8196 */ 8270 8197 8271 8198 lru_add_drain_all(); 8272 - drain_all_pages(cc.zone); 8273 8199 8274 8200 order = 0; 8275 8201 outer_start = start;
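A side note on the fail_page_alloc hunk above: the return values of debugfs_create_bool()/debugfs_create_u32() are no longer checked because debugfs copes with registration errors internally, so fault-injection knobs can be set up fire-and-forget (the same pattern shows up again in mm/page_owner.c below). A minimal sketch of that style, using a hypothetical "my_fault" directory rather than the kernel's own attributes:

#include <linux/debugfs.h>
#include <linux/init.h>

static bool my_ignore_gfp_wait;
static u32  my_min_order = 1;

static int __init my_fault_debugfs(void)
{
        /* No error checking: debugfs handles failures internally,
         * so the returned dentries can simply be ignored. */
        struct dentry *dir = debugfs_create_dir("my_fault", NULL);

        debugfs_create_bool("ignore-gfp-wait", 0600, dir, &my_ignore_gfp_wait);
        debugfs_create_u32("min-order", 0600, dir, &my_min_order);
        return 0;
}
late_initcall(my_fault_debugfs);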
+2 -1
mm/page_ext.c
··· 273 273 table_size = get_entry_size() * PAGES_PER_SECTION; 274 274 275 275 BUG_ON(PageReserved(page)); 276 + kmemleak_free(addr); 276 277 free_pages_exact(addr, table_size); 277 278 } 278 279 } ··· 301 300 start = SECTION_ALIGN_DOWN(start_pfn); 302 301 end = SECTION_ALIGN_UP(start_pfn + nr_pages); 303 302 304 - if (nid == -1) { 303 + if (nid == NUMA_NO_NODE) { 305 304 /* 306 305 * In this case, "nid" already exists and contains valid memory. 307 306 * "start_pfn" passed to us is a pfn which is an arg for
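The kmemleak_free() added just before free_pages_exact() restores the usual pairing: an area that was handed to kmemleak when it was allocated has to be dropped from tracking before the backing pages are returned, otherwise kmemleak is left scanning memory that no longer belongs to the object. A sketch of that pairing with hypothetical table_alloc()/table_free() helpers (not the page_ext code itself):

#include <linux/gfp.h>
#include <linux/kmemleak.h>

static void *table_alloc(size_t size)
{
        void *addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN);

        if (addr)
                kmemleak_alloc(addr, size, 1, GFP_KERNEL);  /* start tracking */
        return addr;
}

static void table_free(void *addr, size_t size)
{
        kmemleak_free(addr);             /* stop tracking first ... */
        free_pages_exact(addr, size);    /* ... then release the pages */
}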
+4 -4
mm/page_idle.c
··· 31 31 static struct page *page_idle_get_page(unsigned long pfn) 32 32 { 33 33 struct page *page; 34 - struct zone *zone; 34 + pg_data_t *pgdat; 35 35 36 36 if (!pfn_valid(pfn)) 37 37 return NULL; ··· 41 41 !get_page_unless_zero(page)) 42 42 return NULL; 43 43 44 - zone = page_zone(page); 45 - spin_lock_irq(zone_lru_lock(zone)); 44 + pgdat = page_pgdat(page); 45 + spin_lock_irq(&pgdat->lru_lock); 46 46 if (unlikely(!PageLRU(page))) { 47 47 put_page(page); 48 48 page = NULL; 49 49 } 50 - spin_unlock_irq(zone_lru_lock(zone)); 50 + spin_unlock_irq(&pgdat->lru_lock); 51 51 return page; 52 52 } 53 53
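The zone_lru_lock() wrapper is gone throughout this series, so LRU users now take ->lru_lock straight from the node; the same substitution appears in mm/rmap.c and mm/swap.c below. Reduced to its core in a sketch (hypothetical isolate_from_lru() helper, assuming the caller holds a reference on the page, mirroring __page_cache_release() further down):

#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>

static void isolate_from_lru(struct page *page)
{
        pg_data_t *pgdat = page_pgdat(page);
        struct lruvec *lruvec;

        spin_lock_irq(&pgdat->lru_lock);
        lruvec = mem_cgroup_page_lruvec(page, pgdat);
        if (PageLRU(page)) {
                __ClearPageLRU(page);
                del_page_from_lru_list(page, lruvec, page_off_lru(page));
        }
        spin_unlock_irq(&pgdat->lru_lock);
}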
+3 -5
mm/page_owner.c
··· 625 625 626 626 static int __init pageowner_init(void) 627 627 { 628 - struct dentry *dentry; 629 - 630 628 if (!static_branch_unlikely(&page_owner_inited)) { 631 629 pr_info("page_owner is disabled\n"); 632 630 return 0; 633 631 } 634 632 635 - dentry = debugfs_create_file("page_owner", 0400, NULL, 636 - NULL, &proc_page_owner_operations); 633 + debugfs_create_file("page_owner", 0400, NULL, NULL, 634 + &proc_page_owner_operations); 637 635 638 - return PTR_ERR_OR_ZERO(dentry); 636 + return 0; 639 637 } 640 638 late_initcall(pageowner_init)
+4
mm/page_poison.c
··· 6 6 #include <linux/page_ext.h> 7 7 #include <linux/poison.h> 8 8 #include <linux/ratelimit.h> 9 + #include <linux/kasan.h> 9 10 10 11 static bool want_page_poisoning __read_mostly; 11 12 ··· 41 40 { 42 41 void *addr = kmap_atomic(page); 43 42 43 + /* KASAN still think the page is in-use, so skip it. */ 44 + kasan_disable_current(); 44 45 memset(addr, PAGE_POISON, PAGE_SIZE); 46 + kasan_enable_current(); 45 47 kunmap_atomic(addr); 46 48 } 47 49
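The kasan_disable_current()/kasan_enable_current() pair suppresses KASAN reports for the current task while the poison pattern is written, since that access is deliberate. The same bracketing works anywhere an intentional access would otherwise trip the sanitizer; a sketch with a hypothetical scrub_page() helper:

#include <linux/highmem.h>
#include <linux/kasan.h>
#include <linux/string.h>

static void scrub_page(struct page *page)
{
        void *addr = kmap_atomic(page);

        kasan_disable_current();        /* deliberate access, no report wanted */
        memset(addr, 0xaa, PAGE_SIZE);
        kasan_enable_current();
        kunmap_atomic(addr);
}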
+2
mm/readahead.c
··· 81 81 * @data: private data for the callback routine. 82 82 * 83 83 * Hides the details of the LRU cache etc from the filesystems. 84 + * 85 + * Returns: %0 on success, error return by @filler otherwise 84 86 */ 85 87 int read_cache_pages(struct address_space *mapping, struct list_head *pages, 86 88 int (*filler)(void *, struct page *), void *data)
+1 -1
mm/rmap.c
··· 27 27 * mapping->i_mmap_rwsem 28 28 * anon_vma->rwsem 29 29 * mm->page_table_lock or pte_lock 30 - * zone_lru_lock (in mark_page_accessed, isolate_lru_page) 30 + * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page) 31 31 * swap_lock (in swap_duplicate, swap_info_get) 32 32 * mmlist_lock (in mmput, drain_mmlist and others) 33 33 * mapping->private_lock (in __set_page_dirty_buffers)
+422 -337
mm/shmem.c
··· 36 36 #include <linux/uio.h> 37 37 #include <linux/khugepaged.h> 38 38 #include <linux/hugetlb.h> 39 + #include <linux/frontswap.h> 39 40 40 41 #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ 41 42 ··· 124 123 static bool shmem_should_replace_page(struct page *page, gfp_t gfp); 125 124 static int shmem_replace_page(struct page **pagep, gfp_t gfp, 126 125 struct shmem_inode_info *info, pgoff_t index); 126 + static int shmem_swapin_page(struct inode *inode, pgoff_t index, 127 + struct page **pagep, enum sgp_type sgp, 128 + gfp_t gfp, struct vm_area_struct *vma, 129 + vm_fault_t *fault_type); 127 130 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 128 131 struct page **pagep, enum sgp_type sgp, 129 132 gfp_t gfp, struct vm_area_struct *vma, ··· 1094 1089 clear_inode(inode); 1095 1090 } 1096 1091 1097 - static unsigned long find_swap_entry(struct xarray *xa, void *item) 1092 + extern struct swap_info_struct *swap_info[]; 1093 + 1094 + static int shmem_find_swap_entries(struct address_space *mapping, 1095 + pgoff_t start, unsigned int nr_entries, 1096 + struct page **entries, pgoff_t *indices, 1097 + bool frontswap) 1098 1098 { 1099 - XA_STATE(xas, xa, 0); 1100 - unsigned int checked = 0; 1101 - void *entry; 1099 + XA_STATE(xas, &mapping->i_pages, start); 1100 + struct page *page; 1101 + unsigned int ret = 0; 1102 + 1103 + if (!nr_entries) 1104 + return 0; 1102 1105 1103 1106 rcu_read_lock(); 1104 - xas_for_each(&xas, entry, ULONG_MAX) { 1105 - if (xas_retry(&xas, entry)) 1107 + xas_for_each(&xas, page, ULONG_MAX) { 1108 + if (xas_retry(&xas, page)) 1106 1109 continue; 1107 - if (entry == item) 1110 + 1111 + if (!xa_is_value(page)) 1112 + continue; 1113 + 1114 + if (frontswap) { 1115 + swp_entry_t entry = radix_to_swp_entry(page); 1116 + 1117 + if (!frontswap_test(swap_info[swp_type(entry)], 1118 + swp_offset(entry))) 1119 + continue; 1120 + } 1121 + 1122 + indices[ret] = xas.xa_index; 1123 + entries[ret] = page; 1124 + 1125 + if (need_resched()) { 1126 + xas_pause(&xas); 1127 + cond_resched_rcu(); 1128 + } 1129 + if (++ret == nr_entries) 1108 1130 break; 1109 - checked++; 1110 - if ((checked % XA_CHECK_SCHED) != 0) 1111 - continue; 1112 - xas_pause(&xas); 1113 - cond_resched_rcu(); 1114 1131 } 1115 1132 rcu_read_unlock(); 1116 1133 1117 - return entry ? xas.xa_index : -1; 1134 + return ret; 1135 + } 1136 + 1137 + /* 1138 + * Move the swapped pages for an inode to page cache. Returns the count 1139 + * of pages swapped in, or the error in case of failure. 1140 + */ 1141 + static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec, 1142 + pgoff_t *indices) 1143 + { 1144 + int i = 0; 1145 + int ret = 0; 1146 + int error = 0; 1147 + struct address_space *mapping = inode->i_mapping; 1148 + 1149 + for (i = 0; i < pvec.nr; i++) { 1150 + struct page *page = pvec.pages[i]; 1151 + 1152 + if (!xa_is_value(page)) 1153 + continue; 1154 + error = shmem_swapin_page(inode, indices[i], 1155 + &page, SGP_CACHE, 1156 + mapping_gfp_mask(mapping), 1157 + NULL, NULL); 1158 + if (error == 0) { 1159 + unlock_page(page); 1160 + put_page(page); 1161 + ret++; 1162 + } 1163 + if (error == -ENOMEM) 1164 + break; 1165 + error = 0; 1166 + } 1167 + return error ? error : ret; 1118 1168 } 1119 1169 1120 1170 /* 1121 1171 * If swap found in inode, free it and move page from swapcache to filecache. 
1122 1172 */ 1123 - static int shmem_unuse_inode(struct shmem_inode_info *info, 1124 - swp_entry_t swap, struct page **pagep) 1173 + static int shmem_unuse_inode(struct inode *inode, unsigned int type, 1174 + bool frontswap, unsigned long *fs_pages_to_unuse) 1125 1175 { 1126 - struct address_space *mapping = info->vfs_inode.i_mapping; 1127 - void *radswap; 1128 - pgoff_t index; 1129 - gfp_t gfp; 1130 - int error = 0; 1176 + struct address_space *mapping = inode->i_mapping; 1177 + pgoff_t start = 0; 1178 + struct pagevec pvec; 1179 + pgoff_t indices[PAGEVEC_SIZE]; 1180 + bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0); 1181 + int ret = 0; 1131 1182 1132 - radswap = swp_to_radix_entry(swap); 1133 - index = find_swap_entry(&mapping->i_pages, radswap); 1134 - if (index == -1) 1135 - return -EAGAIN; /* tell shmem_unuse we found nothing */ 1183 + pagevec_init(&pvec); 1184 + do { 1185 + unsigned int nr_entries = PAGEVEC_SIZE; 1136 1186 1137 - /* 1138 - * Move _head_ to start search for next from here. 1139 - * But be careful: shmem_evict_inode checks list_empty without taking 1140 - * mutex, and there's an instant in list_move_tail when info->swaplist 1141 - * would appear empty, if it were the only one on shmem_swaplist. 1142 - */ 1143 - if (shmem_swaplist.next != &info->swaplist) 1144 - list_move_tail(&shmem_swaplist, &info->swaplist); 1187 + if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE) 1188 + nr_entries = *fs_pages_to_unuse; 1145 1189 1146 - gfp = mapping_gfp_mask(mapping); 1147 - if (shmem_should_replace_page(*pagep, gfp)) { 1148 - mutex_unlock(&shmem_swaplist_mutex); 1149 - error = shmem_replace_page(pagep, gfp, info, index); 1150 - mutex_lock(&shmem_swaplist_mutex); 1151 - /* 1152 - * We needed to drop mutex to make that restrictive page 1153 - * allocation, but the inode might have been freed while we 1154 - * dropped it: although a racing shmem_evict_inode() cannot 1155 - * complete without emptying the page cache, our page lock 1156 - * on this swapcache page is not enough to prevent that - 1157 - * free_swap_and_cache() of our swap entry will only 1158 - * trylock_page(), removing swap from page cache whatever. 1159 - * 1160 - * We must not proceed to shmem_add_to_page_cache() if the 1161 - * inode has been freed, but of course we cannot rely on 1162 - * inode or mapping or info to check that. However, we can 1163 - * safely check if our swap entry is still in use (and here 1164 - * it can't have got reused for another page): if it's still 1165 - * in use, then the inode cannot have been freed yet, and we 1166 - * can safely proceed (if it's no longer in use, that tells 1167 - * nothing about the inode, but we don't need to unuse swap). 1168 - */ 1169 - if (!page_swapcount(*pagep)) 1170 - error = -ENOENT; 1171 - } 1172 - 1173 - /* 1174 - * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 1175 - * but also to hold up shmem_evict_inode(): so inode cannot be freed 1176 - * beneath us (pagelock doesn't help until the page is in pagecache). 1177 - */ 1178 - if (!error) 1179 - error = shmem_add_to_page_cache(*pagep, mapping, index, 1180 - radswap, gfp); 1181 - if (error != -ENOMEM) { 1182 - /* 1183 - * Truncation and eviction use free_swap_and_cache(), which 1184 - * only does trylock page: if we raced, best clean up here. 
1185 - */ 1186 - delete_from_swap_cache(*pagep); 1187 - set_page_dirty(*pagep); 1188 - if (!error) { 1189 - spin_lock_irq(&info->lock); 1190 - info->swapped--; 1191 - spin_unlock_irq(&info->lock); 1192 - swap_free(swap); 1190 + pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, 1191 + pvec.pages, indices, 1192 + frontswap); 1193 + if (pvec.nr == 0) { 1194 + ret = 0; 1195 + break; 1193 1196 } 1194 - } 1195 - return error; 1197 + 1198 + ret = shmem_unuse_swap_entries(inode, pvec, indices); 1199 + if (ret < 0) 1200 + break; 1201 + 1202 + if (frontswap_partial) { 1203 + *fs_pages_to_unuse -= ret; 1204 + if (*fs_pages_to_unuse == 0) { 1205 + ret = FRONTSWAP_PAGES_UNUSED; 1206 + break; 1207 + } 1208 + } 1209 + 1210 + start = indices[pvec.nr - 1]; 1211 + } while (true); 1212 + 1213 + return ret; 1196 1214 } 1197 1215 1198 1216 /* 1199 - * Search through swapped inodes to find and replace swap by page. 1217 + * Read all the shared memory data that resides in the swap 1218 + * device 'type' back into memory, so the swap device can be 1219 + * unused. 1200 1220 */ 1201 - int shmem_unuse(swp_entry_t swap, struct page *page) 1221 + int shmem_unuse(unsigned int type, bool frontswap, 1222 + unsigned long *fs_pages_to_unuse) 1202 1223 { 1203 - struct list_head *this, *next; 1204 - struct shmem_inode_info *info; 1205 - struct mem_cgroup *memcg; 1224 + struct shmem_inode_info *info, *next; 1225 + struct inode *inode; 1226 + struct inode *prev_inode = NULL; 1206 1227 int error = 0; 1207 1228 1208 - /* 1209 - * There's a faint possibility that swap page was replaced before 1210 - * caller locked it: caller will come back later with the right page. 1211 - */ 1212 - if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) 1213 - goto out; 1214 - 1215 - /* 1216 - * Charge page using GFP_KERNEL while we can wait, before taking 1217 - * the shmem_swaplist_mutex which might hold up shmem_writepage(). 1218 - * Charged back to the user (not to caller) when swap account is used. 1219 - */ 1220 - error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, 1221 - &memcg, false); 1222 - if (error) 1223 - goto out; 1224 - /* No memory allocation: swap entry occupies the slot for the page */ 1225 - error = -EAGAIN; 1229 + if (list_empty(&shmem_swaplist)) 1230 + return 0; 1226 1231 1227 1232 mutex_lock(&shmem_swaplist_mutex); 1228 - list_for_each_safe(this, next, &shmem_swaplist) { 1229 - info = list_entry(this, struct shmem_inode_info, swaplist); 1230 - if (info->swapped) 1231 - error = shmem_unuse_inode(info, swap, &page); 1232 - else 1233 + 1234 + /* 1235 + * The extra refcount on the inode is necessary to safely dereference 1236 + * p->next after re-acquiring the lock. New shmem inodes with swap 1237 + * get added to the end of the list and we will scan them all. 
1238 + */ 1239 + list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { 1240 + if (!info->swapped) { 1233 1241 list_del_init(&info->swaplist); 1242 + continue; 1243 + } 1244 + 1245 + inode = igrab(&info->vfs_inode); 1246 + if (!inode) 1247 + continue; 1248 + 1249 + mutex_unlock(&shmem_swaplist_mutex); 1250 + if (prev_inode) 1251 + iput(prev_inode); 1252 + prev_inode = inode; 1253 + 1254 + error = shmem_unuse_inode(inode, type, frontswap, 1255 + fs_pages_to_unuse); 1234 1256 cond_resched(); 1235 - if (error != -EAGAIN) 1257 + 1258 + mutex_lock(&shmem_swaplist_mutex); 1259 + next = list_next_entry(info, swaplist); 1260 + if (!info->swapped) 1261 + list_del_init(&info->swaplist); 1262 + if (error) 1236 1263 break; 1237 - /* found nothing in this: move on to search the next */ 1238 1264 } 1239 1265 mutex_unlock(&shmem_swaplist_mutex); 1240 1266 1241 - if (error) { 1242 - if (error != -ENOMEM) 1243 - error = 0; 1244 - mem_cgroup_cancel_charge(page, memcg, false); 1245 - } else 1246 - mem_cgroup_commit_charge(page, memcg, true, false); 1247 - out: 1248 - unlock_page(page); 1249 - put_page(page); 1267 + if (prev_inode) 1268 + iput(prev_inode); 1269 + 1250 1270 return error; 1251 1271 } 1252 1272 ··· 1355 1325 */ 1356 1326 mutex_lock(&shmem_swaplist_mutex); 1357 1327 if (list_empty(&info->swaplist)) 1358 - list_add_tail(&info->swaplist, &shmem_swaplist); 1328 + list_add(&info->swaplist, &shmem_swaplist); 1359 1329 1360 1330 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 1361 1331 spin_lock_irq(&info->lock); ··· 1606 1576 } 1607 1577 1608 1578 /* 1579 + * Swap in the page pointed to by *pagep. 1580 + * Caller has to make sure that *pagep contains a valid swapped page. 1581 + * Returns 0 and the page in pagep if success. On failure, returns the 1582 + * the error code and NULL in *pagep. 1583 + */ 1584 + static int shmem_swapin_page(struct inode *inode, pgoff_t index, 1585 + struct page **pagep, enum sgp_type sgp, 1586 + gfp_t gfp, struct vm_area_struct *vma, 1587 + vm_fault_t *fault_type) 1588 + { 1589 + struct address_space *mapping = inode->i_mapping; 1590 + struct shmem_inode_info *info = SHMEM_I(inode); 1591 + struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm; 1592 + struct mem_cgroup *memcg; 1593 + struct page *page; 1594 + swp_entry_t swap; 1595 + int error; 1596 + 1597 + VM_BUG_ON(!*pagep || !xa_is_value(*pagep)); 1598 + swap = radix_to_swp_entry(*pagep); 1599 + *pagep = NULL; 1600 + 1601 + /* Look it up and read it in.. */ 1602 + page = lookup_swap_cache(swap, NULL, 0); 1603 + if (!page) { 1604 + /* Or update major stats only when swapin succeeds?? 
*/ 1605 + if (fault_type) { 1606 + *fault_type |= VM_FAULT_MAJOR; 1607 + count_vm_event(PGMAJFAULT); 1608 + count_memcg_event_mm(charge_mm, PGMAJFAULT); 1609 + } 1610 + /* Here we actually start the io */ 1611 + page = shmem_swapin(swap, gfp, info, index); 1612 + if (!page) { 1613 + error = -ENOMEM; 1614 + goto failed; 1615 + } 1616 + } 1617 + 1618 + /* We have to do this with page locked to prevent races */ 1619 + lock_page(page); 1620 + if (!PageSwapCache(page) || page_private(page) != swap.val || 1621 + !shmem_confirm_swap(mapping, index, swap)) { 1622 + error = -EEXIST; 1623 + goto unlock; 1624 + } 1625 + if (!PageUptodate(page)) { 1626 + error = -EIO; 1627 + goto failed; 1628 + } 1629 + wait_on_page_writeback(page); 1630 + 1631 + if (shmem_should_replace_page(page, gfp)) { 1632 + error = shmem_replace_page(&page, gfp, info, index); 1633 + if (error) 1634 + goto failed; 1635 + } 1636 + 1637 + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, 1638 + false); 1639 + if (!error) { 1640 + error = shmem_add_to_page_cache(page, mapping, index, 1641 + swp_to_radix_entry(swap), gfp); 1642 + /* 1643 + * We already confirmed swap under page lock, and make 1644 + * no memory allocation here, so usually no possibility 1645 + * of error; but free_swap_and_cache() only trylocks a 1646 + * page, so it is just possible that the entry has been 1647 + * truncated or holepunched since swap was confirmed. 1648 + * shmem_undo_range() will have done some of the 1649 + * unaccounting, now delete_from_swap_cache() will do 1650 + * the rest. 1651 + */ 1652 + if (error) { 1653 + mem_cgroup_cancel_charge(page, memcg, false); 1654 + delete_from_swap_cache(page); 1655 + } 1656 + } 1657 + if (error) 1658 + goto failed; 1659 + 1660 + mem_cgroup_commit_charge(page, memcg, true, false); 1661 + 1662 + spin_lock_irq(&info->lock); 1663 + info->swapped--; 1664 + shmem_recalc_inode(inode); 1665 + spin_unlock_irq(&info->lock); 1666 + 1667 + if (sgp == SGP_WRITE) 1668 + mark_page_accessed(page); 1669 + 1670 + delete_from_swap_cache(page); 1671 + set_page_dirty(page); 1672 + swap_free(swap); 1673 + 1674 + *pagep = page; 1675 + return 0; 1676 + failed: 1677 + if (!shmem_confirm_swap(mapping, index, swap)) 1678 + error = -EEXIST; 1679 + unlock: 1680 + if (page) { 1681 + unlock_page(page); 1682 + put_page(page); 1683 + } 1684 + 1685 + return error; 1686 + } 1687 + 1688 + /* 1609 1689 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 1610 1690 * 1611 1691 * If we allocate a new one we do not mark it dirty. That's up to the ··· 1736 1596 struct mm_struct *charge_mm; 1737 1597 struct mem_cgroup *memcg; 1738 1598 struct page *page; 1739 - swp_entry_t swap; 1740 1599 enum sgp_type sgp_huge = sgp; 1741 1600 pgoff_t hindex = index; 1742 1601 int error; ··· 1747 1608 if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) 1748 1609 sgp = SGP_CACHE; 1749 1610 repeat: 1750 - swap.val = 0; 1751 - page = find_lock_entry(mapping, index); 1752 - if (xa_is_value(page)) { 1753 - swap = radix_to_swp_entry(page); 1754 - page = NULL; 1755 - } 1756 - 1757 1611 if (sgp <= SGP_CACHE && 1758 1612 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { 1759 - error = -EINVAL; 1760 - goto unlock; 1613 + return -EINVAL; 1614 + } 1615 + 1616 + sbinfo = SHMEM_SB(inode->i_sb); 1617 + charge_mm = vma ? 
vma->vm_mm : current->mm; 1618 + 1619 + page = find_lock_entry(mapping, index); 1620 + if (xa_is_value(page)) { 1621 + error = shmem_swapin_page(inode, index, &page, 1622 + sgp, gfp, vma, fault_type); 1623 + if (error == -EEXIST) 1624 + goto repeat; 1625 + 1626 + *pagep = page; 1627 + return error; 1761 1628 } 1762 1629 1763 1630 if (page && sgp == SGP_WRITE) ··· 1777 1632 put_page(page); 1778 1633 page = NULL; 1779 1634 } 1780 - if (page || (sgp == SGP_READ && !swap.val)) { 1635 + if (page || sgp == SGP_READ) { 1781 1636 *pagep = page; 1782 1637 return 0; 1783 1638 } ··· 1786 1641 * Fast cache lookup did not find it: 1787 1642 * bring it back from swap or allocate. 1788 1643 */ 1789 - sbinfo = SHMEM_SB(inode->i_sb); 1790 - charge_mm = vma ? vma->vm_mm : current->mm; 1791 1644 1792 - if (swap.val) { 1793 - /* Look it up and read it in.. */ 1794 - page = lookup_swap_cache(swap, NULL, 0); 1795 - if (!page) { 1796 - /* Or update major stats only when swapin succeeds?? */ 1797 - if (fault_type) { 1798 - *fault_type |= VM_FAULT_MAJOR; 1799 - count_vm_event(PGMAJFAULT); 1800 - count_memcg_event_mm(charge_mm, PGMAJFAULT); 1801 - } 1802 - /* Here we actually start the io */ 1803 - page = shmem_swapin(swap, gfp, info, index); 1804 - if (!page) { 1805 - error = -ENOMEM; 1806 - goto failed; 1807 - } 1808 - } 1645 + if (vma && userfaultfd_missing(vma)) { 1646 + *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); 1647 + return 0; 1648 + } 1809 1649 1810 - /* We have to do this with page locked to prevent races */ 1811 - lock_page(page); 1812 - if (!PageSwapCache(page) || page_private(page) != swap.val || 1813 - !shmem_confirm_swap(mapping, index, swap)) { 1814 - error = -EEXIST; /* try again */ 1815 - goto unlock; 1816 - } 1817 - if (!PageUptodate(page)) { 1818 - error = -EIO; 1819 - goto failed; 1820 - } 1821 - wait_on_page_writeback(page); 1822 - 1823 - if (shmem_should_replace_page(page, gfp)) { 1824 - error = shmem_replace_page(&page, gfp, info, index); 1825 - if (error) 1826 - goto failed; 1827 - } 1828 - 1829 - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, 1830 - false); 1831 - if (!error) { 1832 - error = shmem_add_to_page_cache(page, mapping, index, 1833 - swp_to_radix_entry(swap), gfp); 1834 - /* 1835 - * We already confirmed swap under page lock, and make 1836 - * no memory allocation here, so usually no possibility 1837 - * of error; but free_swap_and_cache() only trylocks a 1838 - * page, so it is just possible that the entry has been 1839 - * truncated or holepunched since swap was confirmed. 1840 - * shmem_undo_range() will have done some of the 1841 - * unaccounting, now delete_from_swap_cache() will do 1842 - * the rest. 1843 - * Reset swap.val? No, leave it so "failed" goes back to 1844 - * "repeat": reading a hole and writing should succeed. 
1845 - */ 1846 - if (error) { 1847 - mem_cgroup_cancel_charge(page, memcg, false); 1848 - delete_from_swap_cache(page); 1849 - } 1850 - } 1851 - if (error) 1852 - goto failed; 1853 - 1854 - mem_cgroup_commit_charge(page, memcg, true, false); 1855 - 1856 - spin_lock_irq(&info->lock); 1857 - info->swapped--; 1858 - shmem_recalc_inode(inode); 1859 - spin_unlock_irq(&info->lock); 1860 - 1861 - if (sgp == SGP_WRITE) 1862 - mark_page_accessed(page); 1863 - 1864 - delete_from_swap_cache(page); 1865 - set_page_dirty(page); 1866 - swap_free(swap); 1867 - 1868 - } else { 1869 - if (vma && userfaultfd_missing(vma)) { 1870 - *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); 1871 - return 0; 1872 - } 1873 - 1874 - /* shmem_symlink() */ 1875 - if (mapping->a_ops != &shmem_aops) 1876 - goto alloc_nohuge; 1877 - if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) 1878 - goto alloc_nohuge; 1879 - if (shmem_huge == SHMEM_HUGE_FORCE) 1650 + /* shmem_symlink() */ 1651 + if (mapping->a_ops != &shmem_aops) 1652 + goto alloc_nohuge; 1653 + if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) 1654 + goto alloc_nohuge; 1655 + if (shmem_huge == SHMEM_HUGE_FORCE) 1656 + goto alloc_huge; 1657 + switch (sbinfo->huge) { 1658 + loff_t i_size; 1659 + pgoff_t off; 1660 + case SHMEM_HUGE_NEVER: 1661 + goto alloc_nohuge; 1662 + case SHMEM_HUGE_WITHIN_SIZE: 1663 + off = round_up(index, HPAGE_PMD_NR); 1664 + i_size = round_up(i_size_read(inode), PAGE_SIZE); 1665 + if (i_size >= HPAGE_PMD_SIZE && 1666 + i_size >> PAGE_SHIFT >= off) 1880 1667 goto alloc_huge; 1881 - switch (sbinfo->huge) { 1882 - loff_t i_size; 1883 - pgoff_t off; 1884 - case SHMEM_HUGE_NEVER: 1885 - goto alloc_nohuge; 1886 - case SHMEM_HUGE_WITHIN_SIZE: 1887 - off = round_up(index, HPAGE_PMD_NR); 1888 - i_size = round_up(i_size_read(inode), PAGE_SIZE); 1889 - if (i_size >= HPAGE_PMD_SIZE && 1890 - i_size >> PAGE_SHIFT >= off) 1891 - goto alloc_huge; 1892 - /* fallthrough */ 1893 - case SHMEM_HUGE_ADVISE: 1894 - if (sgp_huge == SGP_HUGE) 1895 - goto alloc_huge; 1896 - /* TODO: implement fadvise() hints */ 1897 - goto alloc_nohuge; 1898 - } 1668 + /* fallthrough */ 1669 + case SHMEM_HUGE_ADVISE: 1670 + if (sgp_huge == SGP_HUGE) 1671 + goto alloc_huge; 1672 + /* TODO: implement fadvise() hints */ 1673 + goto alloc_nohuge; 1674 + } 1899 1675 1900 1676 alloc_huge: 1901 - page = shmem_alloc_and_acct_page(gfp, inode, index, true); 1902 - if (IS_ERR(page)) { 1903 - alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, 1904 - index, false); 1905 - } 1906 - if (IS_ERR(page)) { 1907 - int retry = 5; 1908 - error = PTR_ERR(page); 1909 - page = NULL; 1910 - if (error != -ENOSPC) 1911 - goto failed; 1912 - /* 1913 - * Try to reclaim some spece by splitting a huge page 1914 - * beyond i_size on the filesystem. 
1915 - */ 1916 - while (retry--) { 1917 - int ret; 1918 - ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); 1919 - if (ret == SHRINK_STOP) 1920 - break; 1921 - if (ret) 1922 - goto alloc_nohuge; 1923 - } 1924 - goto failed; 1925 - } 1677 + page = shmem_alloc_and_acct_page(gfp, inode, index, true); 1678 + if (IS_ERR(page)) { 1679 + alloc_nohuge: 1680 + page = shmem_alloc_and_acct_page(gfp, inode, 1681 + index, false); 1682 + } 1683 + if (IS_ERR(page)) { 1684 + int retry = 5; 1926 1685 1927 - if (PageTransHuge(page)) 1928 - hindex = round_down(index, HPAGE_PMD_NR); 1929 - else 1930 - hindex = index; 1931 - 1932 - if (sgp == SGP_WRITE) 1933 - __SetPageReferenced(page); 1934 - 1935 - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, 1936 - PageTransHuge(page)); 1937 - if (error) 1938 - goto unacct; 1939 - error = shmem_add_to_page_cache(page, mapping, hindex, 1940 - NULL, gfp & GFP_RECLAIM_MASK); 1941 - if (error) { 1942 - mem_cgroup_cancel_charge(page, memcg, 1943 - PageTransHuge(page)); 1944 - goto unacct; 1945 - } 1946 - mem_cgroup_commit_charge(page, memcg, false, 1947 - PageTransHuge(page)); 1948 - lru_cache_add_anon(page); 1949 - 1950 - spin_lock_irq(&info->lock); 1951 - info->alloced += 1 << compound_order(page); 1952 - inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); 1953 - shmem_recalc_inode(inode); 1954 - spin_unlock_irq(&info->lock); 1955 - alloced = true; 1956 - 1957 - if (PageTransHuge(page) && 1958 - DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < 1959 - hindex + HPAGE_PMD_NR - 1) { 1960 - /* 1961 - * Part of the huge page is beyond i_size: subject 1962 - * to shrink under memory pressure. 1963 - */ 1964 - spin_lock(&sbinfo->shrinklist_lock); 1965 - /* 1966 - * _careful to defend against unlocked access to 1967 - * ->shrink_list in shmem_unused_huge_shrink() 1968 - */ 1969 - if (list_empty_careful(&info->shrinklist)) { 1970 - list_add_tail(&info->shrinklist, 1971 - &sbinfo->shrinklist); 1972 - sbinfo->shrinklist_len++; 1973 - } 1974 - spin_unlock(&sbinfo->shrinklist_lock); 1975 - } 1976 - 1686 + error = PTR_ERR(page); 1687 + page = NULL; 1688 + if (error != -ENOSPC) 1689 + goto unlock; 1977 1690 /* 1978 - * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. 1691 + * Try to reclaim some space by splitting a huge page 1692 + * beyond i_size on the filesystem. 
1979 1693 */ 1980 - if (sgp == SGP_FALLOC) 1981 - sgp = SGP_WRITE; 1694 + while (retry--) { 1695 + int ret; 1696 + 1697 + ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); 1698 + if (ret == SHRINK_STOP) 1699 + break; 1700 + if (ret) 1701 + goto alloc_nohuge; 1702 + } 1703 + goto unlock; 1704 + } 1705 + 1706 + if (PageTransHuge(page)) 1707 + hindex = round_down(index, HPAGE_PMD_NR); 1708 + else 1709 + hindex = index; 1710 + 1711 + if (sgp == SGP_WRITE) 1712 + __SetPageReferenced(page); 1713 + 1714 + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, 1715 + PageTransHuge(page)); 1716 + if (error) 1717 + goto unacct; 1718 + error = shmem_add_to_page_cache(page, mapping, hindex, 1719 + NULL, gfp & GFP_RECLAIM_MASK); 1720 + if (error) { 1721 + mem_cgroup_cancel_charge(page, memcg, 1722 + PageTransHuge(page)); 1723 + goto unacct; 1724 + } 1725 + mem_cgroup_commit_charge(page, memcg, false, 1726 + PageTransHuge(page)); 1727 + lru_cache_add_anon(page); 1728 + 1729 + spin_lock_irq(&info->lock); 1730 + info->alloced += 1 << compound_order(page); 1731 + inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); 1732 + shmem_recalc_inode(inode); 1733 + spin_unlock_irq(&info->lock); 1734 + alloced = true; 1735 + 1736 + if (PageTransHuge(page) && 1737 + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < 1738 + hindex + HPAGE_PMD_NR - 1) { 1739 + /* 1740 + * Part of the huge page is beyond i_size: subject 1741 + * to shrink under memory pressure. 1742 + */ 1743 + spin_lock(&sbinfo->shrinklist_lock); 1744 + /* 1745 + * _careful to defend against unlocked access to 1746 + * ->shrink_list in shmem_unused_huge_shrink() 1747 + */ 1748 + if (list_empty_careful(&info->shrinklist)) { 1749 + list_add_tail(&info->shrinklist, 1750 + &sbinfo->shrinklist); 1751 + sbinfo->shrinklist_len++; 1752 + } 1753 + spin_unlock(&sbinfo->shrinklist_lock); 1754 + } 1755 + 1756 + /* 1757 + * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. 1758 + */ 1759 + if (sgp == SGP_FALLOC) 1760 + sgp = SGP_WRITE; 1982 1761 clear: 1983 - /* 1984 - * Let SGP_WRITE caller clear ends if write does not fill page; 1985 - * but SGP_FALLOC on a page fallocated earlier must initialize 1986 - * it now, lest undo on failure cancel our earlier guarantee. 1987 - */ 1988 - if (sgp != SGP_WRITE && !PageUptodate(page)) { 1989 - struct page *head = compound_head(page); 1990 - int i; 1762 + /* 1763 + * Let SGP_WRITE caller clear ends if write does not fill page; 1764 + * but SGP_FALLOC on a page fallocated earlier must initialize 1765 + * it now, lest undo on failure cancel our earlier guarantee. 
1766 + */ 1767 + if (sgp != SGP_WRITE && !PageUptodate(page)) { 1768 + struct page *head = compound_head(page); 1769 + int i; 1991 1770 1992 - for (i = 0; i < (1 << compound_order(head)); i++) { 1993 - clear_highpage(head + i); 1994 - flush_dcache_page(head + i); 1995 - } 1996 - SetPageUptodate(head); 1771 + for (i = 0; i < (1 << compound_order(head)); i++) { 1772 + clear_highpage(head + i); 1773 + flush_dcache_page(head + i); 1997 1774 } 1775 + SetPageUptodate(head); 1998 1776 } 1999 1777 2000 1778 /* Perhaps the file has been truncated since we checked */ ··· 1947 1879 put_page(page); 1948 1880 goto alloc_nohuge; 1949 1881 } 1950 - failed: 1951 - if (swap.val && !shmem_confirm_swap(mapping, index, swap)) 1952 - error = -EEXIST; 1953 1882 unlock: 1954 1883 if (page) { 1955 1884 unlock_page(page); ··· 2190 2125 2191 2126 static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 2192 2127 { 2128 + struct shmem_inode_info *info = SHMEM_I(file_inode(file)); 2129 + 2130 + if (info->seals & F_SEAL_FUTURE_WRITE) { 2131 + /* 2132 + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when 2133 + * "future write" seal active. 2134 + */ 2135 + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) 2136 + return -EPERM; 2137 + 2138 + /* 2139 + * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED 2140 + * read-only mapping, take care to not allow mprotect to revert 2141 + * protections. 2142 + */ 2143 + vma->vm_flags &= ~(VM_MAYWRITE); 2144 + } 2145 + 2193 2146 file_accessed(file); 2194 2147 vma->vm_ops = &shmem_vm_ops; 2195 2148 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && ··· 2458 2375 pgoff_t index = pos >> PAGE_SHIFT; 2459 2376 2460 2377 /* i_mutex is held by caller */ 2461 - if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { 2462 - if (info->seals & F_SEAL_WRITE) 2378 + if (unlikely(info->seals & (F_SEAL_GROW | 2379 + F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { 2380 + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) 2463 2381 return -EPERM; 2464 2382 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) 2465 2383 return -EPERM; ··· 2723 2639 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); 2724 2640 2725 2641 /* protected by i_mutex */ 2726 - if (info->seals & F_SEAL_WRITE) { 2642 + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { 2727 2643 error = -EPERM; 2728 2644 goto out; 2729 2645 } ··· 3931 3847 return 0; 3932 3848 } 3933 3849 3934 - int shmem_unuse(swp_entry_t swap, struct page *page) 3850 + int shmem_unuse(unsigned int type, bool frontswap, 3851 + unsigned long *fs_pages_to_unuse) 3935 3852 { 3936 3853 return 0; 3937 3854 }
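The F_SEAL_FUTURE_WRITE checks threaded through shmem_mmap(), shmem_write_begin() and the fallocate path above are the shmem side of the new memfd seal: existing writable mappings keep working, but new shared writable mmap()s and write()s are refused. The other half of this hunk, shmem_unuse() walking each swapped inode by swap type, pairs with the try_to_unuse() rewrite in mm/swapfile.c further down. A userspace sketch of the seal (assumes a libc that knows memfd_create(); on older toolchains F_SEAL_FUTURE_WRITE may have to come from <linux/fcntl.h>):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = memfd_create("buf", MFD_CLOEXEC | MFD_ALLOW_SEALING);

        ftruncate(fd, 4096);
        fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE);

        /* The seal forbids *new* shared writable mappings and write(2)s. */
        if (mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0) == MAP_FAILED)
                perror("mmap");         /* expected: Operation not permitted */
        return 0;
}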
+24 -10
mm/slab.c
··· 550 550 551 551 static void init_arraycache(struct array_cache *ac, int limit, int batch) 552 552 { 553 - /* 554 - * The array_cache structures contain pointers to free object. 555 - * However, when such objects are allocated or transferred to another 556 - * cache the pointers are not cleared and they could be counted as 557 - * valid references during a kmemleak scan. Therefore, kmemleak must 558 - * not scan such objects. 559 - */ 560 - kmemleak_no_scan(ac); 561 553 if (ac) { 562 554 ac->avail = 0; 563 555 ac->limit = limit; ··· 565 573 struct array_cache *ac = NULL; 566 574 567 575 ac = kmalloc_node(memsize, gfp, node); 576 + /* 577 + * The array_cache structures contain pointers to free object. 578 + * However, when such objects are allocated or transferred to another 579 + * cache the pointers are not cleared and they could be counted as 580 + * valid references during a kmemleak scan. Therefore, kmemleak must 581 + * not scan such objects. 582 + */ 583 + kmemleak_no_scan(ac); 568 584 init_arraycache(ac, entries, batchcount); 569 585 return ac; 570 586 } ··· 667 667 668 668 alc = kmalloc_node(memsize, gfp, node); 669 669 if (alc) { 670 + kmemleak_no_scan(alc); 670 671 init_arraycache(&alc->ac, entries, batch); 671 672 spin_lock_init(&alc->lock); 672 673 } ··· 677 676 static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 678 677 { 679 678 struct alien_cache **alc_ptr; 680 - size_t memsize = sizeof(void *) * nr_node_ids; 681 679 int i; 682 680 683 681 if (limit > 1) 684 682 limit = 12; 685 - alc_ptr = kzalloc_node(memsize, gfp, node); 683 + alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node); 686 684 if (!alc_ptr) 687 685 return NULL; 688 686 ··· 1727 1727 * This could be made much more intelligent. For now, try to avoid using 1728 1728 * high order pages for slabs. When the gfp() functions are more friendly 1729 1729 * towards high-order requests, this should be changed. 1730 + * 1731 + * Return: number of left-over bytes in a slab 1730 1732 */ 1731 1733 static size_t calculate_slab_order(struct kmem_cache *cachep, 1732 1734 size_t size, slab_flags_t flags) ··· 1977 1975 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 1978 1976 * cacheline. This can be beneficial if you're counting cycles as closely 1979 1977 * as davem. 1978 + * 1979 + * Return: a pointer to the created cache or %NULL in case of error 1980 1980 */ 1981 1981 int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) 1982 1982 { ··· 3546 3542 * 3547 3543 * Allocate an object from this cache. The flags are only relevant 3548 3544 * if the cache has no available objects. 3545 + * 3546 + * Return: pointer to the new object or %NULL in case of error 3549 3547 */ 3550 3548 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3551 3549 { ··· 3637 3631 * node, which can improve the performance for cpu bound structures. 3638 3632 * 3639 3633 * Fallback to other node is possible if __GFP_THISNODE is not set. 3634 + * 3635 + * Return: pointer to the new object or %NULL in case of error 3640 3636 */ 3641 3637 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3642 3638 { ··· 3707 3699 * @size: how many bytes of memory are required. 3708 3700 * @flags: the type of memory to allocate (see kmalloc). 
3709 3701 * @caller: function caller for debug tracking of the caller 3702 + * 3703 + * Return: pointer to the allocated memory or %NULL in case of error 3710 3704 */ 3711 3705 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3712 3706 unsigned long caller) ··· 4174 4164 * @buffer: user buffer 4175 4165 * @count: data length 4176 4166 * @ppos: unused 4167 + * 4168 + * Return: %0 on success, negative error code otherwise. 4177 4169 */ 4178 4170 ssize_t slabinfo_write(struct file *file, const char __user *buffer, 4179 4171 size_t count, loff_t *ppos) ··· 4469 4457 * The caller must guarantee that objp points to a valid object previously 4470 4458 * allocated with either kmalloc() or kmem_cache_alloc(). The object 4471 4459 * must not be freed during the duration of the call. 4460 + * 4461 + * Return: size of the actual memory used by @objp in bytes 4472 4462 */ 4473 4463 size_t ksize(const void *objp) 4474 4464 {
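Besides moving kmemleak_no_scan() to the point where an array_cache has actually been allocated, the slab hunk replaces an open-coded 'sizeof(void *) * nr_node_ids' kzalloc_node() with kcalloc_node(), which performs the same zeroed, node-local allocation but with an overflow-checked multiplication. The general shape, with a hypothetical helper and caller-supplied 'n' and 'node':

#include <linux/slab.h>

static void **alloc_ptr_array(size_t n, int node)
{
        /* n zeroed pointers on the given node; the n * sizeof(void *)
         * multiplication is overflow-checked inside kcalloc_node(). */
        return kcalloc_node(n, sizeof(void *), GFP_KERNEL, node);
}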
-4
mm/slab.h
··· 276 276 gfp_t gfp, int order, 277 277 struct kmem_cache *s) 278 278 { 279 - if (!memcg_kmem_enabled()) 280 - return 0; 281 279 if (is_root_cache(s)) 282 280 return 0; 283 281 return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); ··· 284 286 static __always_inline void memcg_uncharge_slab(struct page *page, int order, 285 287 struct kmem_cache *s) 286 288 { 287 - if (!memcg_kmem_enabled()) 288 - return; 289 289 memcg_kmem_uncharge(page, order); 290 290 } 291 291
+9 -3
mm/slab_common.c
··· 939 939 * 940 940 * Releases as many slabs as possible for a cache. 941 941 * To help debugging, a zero exit status indicates all slabs were released. 942 + * 943 + * Return: %0 if all slabs were released, non-zero otherwise 942 944 */ 943 945 int kmem_cache_shrink(struct kmem_cache *cachep) 944 946 { ··· 1427 1425 #if defined(CONFIG_MEMCG) 1428 1426 void *memcg_slab_start(struct seq_file *m, loff_t *pos) 1429 1427 { 1430 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 1428 + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 1431 1429 1432 1430 mutex_lock(&slab_mutex); 1433 1431 return seq_list_start(&memcg->kmem_caches, *pos); ··· 1435 1433 1436 1434 void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos) 1437 1435 { 1438 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 1436 + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 1439 1437 1440 1438 return seq_list_next(p, &memcg->kmem_caches, pos); 1441 1439 } ··· 1449 1447 { 1450 1448 struct kmem_cache *s = list_entry(p, struct kmem_cache, 1451 1449 memcg_params.kmem_caches_node); 1452 - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 1450 + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 1453 1451 1454 1452 if (p == memcg->kmem_caches.next) 1455 1453 print_slabinfo_header(m); ··· 1530 1528 * This function is like krealloc() except it never frees the originally 1531 1529 * allocated buffer. Use this if you don't want to free the buffer immediately 1532 1530 * like, for example, with RCU. 1531 + * 1532 + * Return: pointer to the allocated memory or %NULL in case of error 1533 1533 */ 1534 1534 void *__krealloc(const void *p, size_t new_size, gfp_t flags) 1535 1535 { ··· 1553 1549 * lesser of the new and old sizes. If @p is %NULL, krealloc() 1554 1550 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a 1555 1551 * %NULL pointer, the object pointed to is freed. 1552 + * 1553 + * Return: pointer to the allocated memory or %NULL in case of error 1556 1554 */ 1557 1555 void *krealloc(const void *p, size_t new_size, gfp_t flags) 1558 1556 {
+7 -9
mm/slub.c
··· 1093 1093 } 1094 1094 1095 1095 static inline int alloc_consistency_checks(struct kmem_cache *s, 1096 - struct page *page, 1097 - void *object, unsigned long addr) 1096 + struct page *page, void *object) 1098 1097 { 1099 1098 if (!check_slab(s, page)) 1100 1099 return 0; ··· 1114 1115 void *object, unsigned long addr) 1115 1116 { 1116 1117 if (s->flags & SLAB_CONSISTENCY_CHECKS) { 1117 - if (!alloc_consistency_checks(s, page, object, addr)) 1118 + if (!alloc_consistency_checks(s, page, object)) 1118 1119 goto bad; 1119 1120 } 1120 1121 ··· 2129 2130 if (!lock) { 2130 2131 lock = 1; 2131 2132 /* 2132 - * Taking the spinlock removes the possiblity 2133 + * Taking the spinlock removes the possibility 2133 2134 * that acquire_slab() will see a slab page that 2134 2135 * is frozen 2135 2136 */ ··· 2253 2254 } 2254 2255 2255 2256 /* 2256 - * Put a page that was just frozen (in __slab_free) into a partial page 2257 - * slot if available. 2257 + * Put a page that was just frozen (in __slab_free|get_partial_node) into a 2258 + * partial page slot if available. 2258 2259 * 2259 2260 * If we did not find a slot then simply move all the partials to the 2260 2261 * per node partial list. ··· 2481 2482 stat(s, ALLOC_SLAB); 2482 2483 c->page = page; 2483 2484 *pc = c; 2484 - } else 2485 - freelist = NULL; 2485 + } 2486 2486 2487 2487 return freelist; 2488 2488 } ··· 4262 4264 cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, 4263 4265 slub_cpu_dead); 4264 4266 4265 - pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n", 4267 + pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n", 4266 4268 cache_line_size(), 4267 4269 slub_min_order, slub_max_order, slub_min_objects, 4268 4270 nr_cpu_ids, nr_node_ids);
+1 -1
mm/sparse.c
··· 197 197 } 198 198 #define for_each_present_section_nr(start, section_nr) \ 199 199 for (section_nr = next_present_section_nr(start-1); \ 200 - ((section_nr >= 0) && \ 200 + ((section_nr != -1) && \ 201 201 (section_nr <= __highest_present_section_nr)); \ 202 202 section_nr = next_present_section_nr(section_nr)) 203 203
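The sparse.c change looks cosmetic but is not: the section counters used with this macro are unsigned, so 'section_nr >= 0' was always true and that half of the termination test was dead code, while 'section_nr != -1' really does catch the "no more sections" sentinel, because the -1 literal converts to the same all-ones value. A tiny userspace illustration of the difference:

#include <stdio.h>

int main(void)
{
        unsigned long section_nr = -1;  /* the "no more sections" sentinel */

        if (section_nr >= 0)            /* always true: unsigned is never negative */
                printf("the old test cannot reject the sentinel\n");
        if (section_nr != -1)           /* false: -1 converts to ULONG_MAX here too */
                printf("never printed\n");
        return 0;
}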
+8 -8
mm/swap.c
··· 58 58 static void __page_cache_release(struct page *page) 59 59 { 60 60 if (PageLRU(page)) { 61 - struct zone *zone = page_zone(page); 61 + pg_data_t *pgdat = page_pgdat(page); 62 62 struct lruvec *lruvec; 63 63 unsigned long flags; 64 64 65 - spin_lock_irqsave(zone_lru_lock(zone), flags); 66 - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 65 + spin_lock_irqsave(&pgdat->lru_lock, flags); 66 + lruvec = mem_cgroup_page_lruvec(page, pgdat); 67 67 VM_BUG_ON_PAGE(!PageLRU(page), page); 68 68 __ClearPageLRU(page); 69 69 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 70 - spin_unlock_irqrestore(zone_lru_lock(zone), flags); 70 + spin_unlock_irqrestore(&pgdat->lru_lock, flags); 71 71 } 72 72 __ClearPageWaiters(page); 73 73 mem_cgroup_uncharge(page); ··· 322 322 323 323 void activate_page(struct page *page) 324 324 { 325 - struct zone *zone = page_zone(page); 325 + pg_data_t *pgdat = page_pgdat(page); 326 326 327 327 page = compound_head(page); 328 - spin_lock_irq(zone_lru_lock(zone)); 329 - __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL); 330 - spin_unlock_irq(zone_lru_lock(zone)); 328 + spin_lock_irq(&pgdat->lru_lock); 329 + __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL); 330 + spin_unlock_irq(&pgdat->lru_lock); 331 331 } 332 332 #endif 333 333
+22 -1
mm/swap_state.c
··· 523 523 * This has been extended to use the NUMA policies from the mm triggering 524 524 * the readahead. 525 525 * 526 - * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL. 526 + * Caller must hold read mmap_sem if vmf->vma is not NULL. 527 527 */ 528 528 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, 529 529 struct vm_fault *vmf) ··· 542 542 mask = swapin_nr_pages(offset) - 1; 543 543 if (!mask) 544 544 goto skip; 545 + 546 + /* Test swap type to make sure the dereference is safe */ 547 + if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) { 548 + struct inode *inode = si->swap_file->f_mapping->host; 549 + if (inode_read_congested(inode)) 550 + goto skip; 551 + } 545 552 546 553 do_poll = false; 547 554 /* Read a page_cluster sized and aligned cluster around offset. */ ··· 698 691 pte_unmap(orig_pte); 699 692 } 700 693 694 + /** 695 + * swap_vma_readahead - swap in pages in hope we need them soon 696 + * @entry: swap entry of this memory 697 + * @gfp_mask: memory allocation flags 698 + * @vmf: fault information 699 + * 700 + * Returns the struct page for entry and addr, after queueing swapin. 701 + * 702 + * Primitive swap readahead code. We simply read in a few pages whoes 703 + * virtual addresses are around the fault address in the same vma. 704 + * 705 + * Caller must hold read mmap_sem if vmf->vma is not NULL. 706 + * 707 + */ 701 708 static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, 702 709 struct vm_fault *vmf) 703 710 {
+200 -301
mm/swapfile.c
··· 98 98 99 99 atomic_t nr_rotate_swap = ATOMIC_INIT(0); 100 100 101 + static struct swap_info_struct *swap_type_to_swap_info(int type) 102 + { 103 + if (type >= READ_ONCE(nr_swapfiles)) 104 + return NULL; 105 + 106 + smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */ 107 + return READ_ONCE(swap_info[type]); 108 + } 109 + 101 110 static inline unsigned char swap_count(unsigned char ent) 102 111 { 103 112 return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ ··· 1053 1044 /* The only caller of this function is now suspend routine */ 1054 1045 swp_entry_t get_swap_page_of_type(int type) 1055 1046 { 1056 - struct swap_info_struct *si; 1047 + struct swap_info_struct *si = swap_type_to_swap_info(type); 1057 1048 pgoff_t offset; 1058 1049 1059 - si = swap_info[type]; 1050 + if (!si) 1051 + goto fail; 1052 + 1060 1053 spin_lock(&si->lock); 1061 - if (si && (si->flags & SWP_WRITEOK)) { 1054 + if (si->flags & SWP_WRITEOK) { 1062 1055 atomic_long_dec(&nr_swap_pages); 1063 1056 /* This is called for allocating swap entry, not cache */ 1064 1057 offset = scan_swap_map(si, 1); ··· 1071 1060 atomic_long_inc(&nr_swap_pages); 1072 1061 } 1073 1062 spin_unlock(&si->lock); 1063 + fail: 1074 1064 return (swp_entry_t) {0}; 1075 1065 } 1076 1066 ··· 1083 1071 if (!entry.val) 1084 1072 goto out; 1085 1073 type = swp_type(entry); 1086 - if (type >= nr_swapfiles) 1074 + p = swap_type_to_swap_info(type); 1075 + if (!p) 1087 1076 goto bad_nofile; 1088 - p = swap_info[type]; 1089 1077 if (!(p->flags & SWP_USED)) 1090 1078 goto bad_device; 1091 1079 offset = swp_offset(entry); ··· 1709 1697 sector_t swapdev_block(int type, pgoff_t offset) 1710 1698 { 1711 1699 struct block_device *bdev; 1700 + struct swap_info_struct *si = swap_type_to_swap_info(type); 1712 1701 1713 - if ((unsigned int)type >= nr_swapfiles) 1714 - return 0; 1715 - if (!(swap_info[type]->flags & SWP_WRITEOK)) 1702 + if (!si || !(si->flags & SWP_WRITEOK)) 1716 1703 return 0; 1717 1704 return map_swap_entry(swp_entry(type, offset), &bdev); 1718 1705 } ··· 1810 1799 } 1811 1800 1812 1801 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 1813 - unsigned long addr, unsigned long end, 1814 - swp_entry_t entry, struct page *page) 1802 + unsigned long addr, unsigned long end, 1803 + unsigned int type, bool frontswap, 1804 + unsigned long *fs_pages_to_unuse) 1815 1805 { 1816 - pte_t swp_pte = swp_entry_to_pte(entry); 1806 + struct page *page; 1807 + swp_entry_t entry; 1817 1808 pte_t *pte; 1809 + struct swap_info_struct *si; 1810 + unsigned long offset; 1818 1811 int ret = 0; 1812 + volatile unsigned char *swap_map; 1819 1813 1820 - /* 1821 - * We don't actually need pte lock while scanning for swp_pte: since 1822 - * we hold page lock and mmap_sem, swp_pte cannot be inserted into the 1823 - * page table while we're scanning; though it could get zapped, and on 1824 - * some architectures (e.g. x86_32 with PAE) we might catch a glimpse 1825 - * of unmatched parts which look like swp_pte, so unuse_pte must 1826 - * recheck under pte lock. Scanning without pte lock lets it be 1827 - * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. 1828 - */ 1814 + si = swap_info[type]; 1829 1815 pte = pte_offset_map(pmd, addr); 1830 1816 do { 1831 - /* 1832 - * swapoff spends a _lot_ of time in this loop! 1833 - * Test inline before going to call unuse_pte. 
1834 - */ 1835 - if (unlikely(pte_same_as_swp(*pte, swp_pte))) { 1836 - pte_unmap(pte); 1837 - ret = unuse_pte(vma, pmd, addr, entry, page); 1838 - if (ret) 1839 - goto out; 1840 - pte = pte_offset_map(pmd, addr); 1817 + struct vm_fault vmf; 1818 + 1819 + if (!is_swap_pte(*pte)) 1820 + continue; 1821 + 1822 + entry = pte_to_swp_entry(*pte); 1823 + if (swp_type(entry) != type) 1824 + continue; 1825 + 1826 + offset = swp_offset(entry); 1827 + if (frontswap && !frontswap_test(si, offset)) 1828 + continue; 1829 + 1830 + pte_unmap(pte); 1831 + swap_map = &si->swap_map[offset]; 1832 + vmf.vma = vma; 1833 + vmf.address = addr; 1834 + vmf.pmd = pmd; 1835 + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); 1836 + if (!page) { 1837 + if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) 1838 + goto try_next; 1839 + return -ENOMEM; 1841 1840 } 1841 + 1842 + lock_page(page); 1843 + wait_on_page_writeback(page); 1844 + ret = unuse_pte(vma, pmd, addr, entry, page); 1845 + if (ret < 0) { 1846 + unlock_page(page); 1847 + put_page(page); 1848 + goto out; 1849 + } 1850 + 1851 + try_to_free_swap(page); 1852 + unlock_page(page); 1853 + put_page(page); 1854 + 1855 + if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) { 1856 + ret = FRONTSWAP_PAGES_UNUSED; 1857 + goto out; 1858 + } 1859 + try_next: 1860 + pte = pte_offset_map(pmd, addr); 1842 1861 } while (pte++, addr += PAGE_SIZE, addr != end); 1843 1862 pte_unmap(pte - 1); 1863 + 1864 + ret = 0; 1844 1865 out: 1845 1866 return ret; 1846 1867 } 1847 1868 1848 1869 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 1849 1870 unsigned long addr, unsigned long end, 1850 - swp_entry_t entry, struct page *page) 1871 + unsigned int type, bool frontswap, 1872 + unsigned long *fs_pages_to_unuse) 1851 1873 { 1852 1874 pmd_t *pmd; 1853 1875 unsigned long next; ··· 1892 1848 next = pmd_addr_end(addr, end); 1893 1849 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 1894 1850 continue; 1895 - ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 1851 + ret = unuse_pte_range(vma, pmd, addr, next, type, 1852 + frontswap, fs_pages_to_unuse); 1896 1853 if (ret) 1897 1854 return ret; 1898 1855 } while (pmd++, addr = next, addr != end); ··· 1902 1857 1903 1858 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, 1904 1859 unsigned long addr, unsigned long end, 1905 - swp_entry_t entry, struct page *page) 1860 + unsigned int type, bool frontswap, 1861 + unsigned long *fs_pages_to_unuse) 1906 1862 { 1907 1863 pud_t *pud; 1908 1864 unsigned long next; ··· 1914 1868 next = pud_addr_end(addr, end); 1915 1869 if (pud_none_or_clear_bad(pud)) 1916 1870 continue; 1917 - ret = unuse_pmd_range(vma, pud, addr, next, entry, page); 1871 + ret = unuse_pmd_range(vma, pud, addr, next, type, 1872 + frontswap, fs_pages_to_unuse); 1918 1873 if (ret) 1919 1874 return ret; 1920 1875 } while (pud++, addr = next, addr != end); ··· 1924 1877 1925 1878 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, 1926 1879 unsigned long addr, unsigned long end, 1927 - swp_entry_t entry, struct page *page) 1880 + unsigned int type, bool frontswap, 1881 + unsigned long *fs_pages_to_unuse) 1928 1882 { 1929 1883 p4d_t *p4d; 1930 1884 unsigned long next; ··· 1936 1888 next = p4d_addr_end(addr, end); 1937 1889 if (p4d_none_or_clear_bad(p4d)) 1938 1890 continue; 1939 - ret = unuse_pud_range(vma, p4d, addr, next, entry, page); 1891 + ret = unuse_pud_range(vma, p4d, addr, next, type, 1892 + frontswap, fs_pages_to_unuse); 1940 1893 if (ret) 1941 1894 
return ret; 1942 1895 } while (p4d++, addr = next, addr != end); 1943 1896 return 0; 1944 1897 } 1945 1898 1946 - static int unuse_vma(struct vm_area_struct *vma, 1947 - swp_entry_t entry, struct page *page) 1899 + static int unuse_vma(struct vm_area_struct *vma, unsigned int type, 1900 + bool frontswap, unsigned long *fs_pages_to_unuse) 1948 1901 { 1949 1902 pgd_t *pgd; 1950 1903 unsigned long addr, end, next; 1951 1904 int ret; 1952 1905 1953 - if (page_anon_vma(page)) { 1954 - addr = page_address_in_vma(page, vma); 1955 - if (addr == -EFAULT) 1956 - return 0; 1957 - else 1958 - end = addr + PAGE_SIZE; 1959 - } else { 1960 - addr = vma->vm_start; 1961 - end = vma->vm_end; 1962 - } 1906 + addr = vma->vm_start; 1907 + end = vma->vm_end; 1963 1908 1964 1909 pgd = pgd_offset(vma->vm_mm, addr); 1965 1910 do { 1966 1911 next = pgd_addr_end(addr, end); 1967 1912 if (pgd_none_or_clear_bad(pgd)) 1968 1913 continue; 1969 - ret = unuse_p4d_range(vma, pgd, addr, next, entry, page); 1914 + ret = unuse_p4d_range(vma, pgd, addr, next, type, 1915 + frontswap, fs_pages_to_unuse); 1970 1916 if (ret) 1971 1917 return ret; 1972 1918 } while (pgd++, addr = next, addr != end); 1973 1919 return 0; 1974 1920 } 1975 1921 1976 - static int unuse_mm(struct mm_struct *mm, 1977 - swp_entry_t entry, struct page *page) 1922 + static int unuse_mm(struct mm_struct *mm, unsigned int type, 1923 + bool frontswap, unsigned long *fs_pages_to_unuse) 1978 1924 { 1979 1925 struct vm_area_struct *vma; 1980 1926 int ret = 0; 1981 1927 1982 - if (!down_read_trylock(&mm->mmap_sem)) { 1983 - /* 1984 - * Activate page so shrink_inactive_list is unlikely to unmap 1985 - * its ptes while lock is dropped, so swapoff can make progress. 1986 - */ 1987 - activate_page(page); 1988 - unlock_page(page); 1989 - down_read(&mm->mmap_sem); 1990 - lock_page(page); 1991 - } 1928 + down_read(&mm->mmap_sem); 1992 1929 for (vma = mm->mmap; vma; vma = vma->vm_next) { 1993 - if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) 1994 - break; 1930 + if (vma->anon_vma) { 1931 + ret = unuse_vma(vma, type, frontswap, 1932 + fs_pages_to_unuse); 1933 + if (ret) 1934 + break; 1935 + } 1995 1936 cond_resched(); 1996 1937 } 1997 1938 up_read(&mm->mmap_sem); 1998 - return (ret < 0)? ret: 0; 1939 + return ret; 1999 1940 } 2000 1941 2001 1942 /* 2002 1943 * Scan swap_map (or frontswap_map if frontswap parameter is true) 2003 - * from current position to next entry still in use. 2004 - * Recycle to start on reaching the end, returning 0 when empty. 1944 + * from current position to next entry still in use. Return 0 1945 + * if there are no inuse entries after prev till end of the map. 2005 1946 */ 2006 1947 static unsigned int find_next_to_unuse(struct swap_info_struct *si, 2007 1948 unsigned int prev, bool frontswap) 2008 1949 { 2009 - unsigned int max = si->max; 2010 - unsigned int i = prev; 1950 + unsigned int i; 2011 1951 unsigned char count; 2012 1952 2013 1953 /* ··· 2004 1968 * hits are okay, and sys_swapoff() has already prevented new 2005 1969 * allocations from this area (while holding swap_lock). 2006 1970 */ 2007 - for (;;) { 2008 - if (++i >= max) { 2009 - if (!prev) { 2010 - i = 0; 2011 - break; 2012 - } 2013 - /* 2014 - * No entries in use at top of swap_map, 2015 - * loop back to start and recheck there. 
2016 - */ 2017 - max = prev + 1; 2018 - prev = 0; 2019 - i = 1; 2020 - } 1971 + for (i = prev + 1; i < si->max; i++) { 2021 1972 count = READ_ONCE(si->swap_map[i]); 2022 1973 if (count && swap_count(count) != SWAP_MAP_BAD) 2023 1974 if (!frontswap || frontswap_test(si, i)) ··· 2012 1989 if ((i % LATENCY_LIMIT) == 0) 2013 1990 cond_resched(); 2014 1991 } 1992 + 1993 + if (i == si->max) 1994 + i = 0; 1995 + 2015 1996 return i; 2016 1997 } 2017 1998 2018 1999 /* 2019 - * We completely avoid races by reading each swap page in advance, 2020 - * and then search for the process using it. All the necessary 2021 - * page table adjustments can then be made atomically. 2022 - * 2023 - * if the boolean frontswap is true, only unuse pages_to_unuse pages; 2000 + * If the boolean frontswap is true, only unuse pages_to_unuse pages; 2024 2001 * pages_to_unuse==0 means all pages; ignored if frontswap is false 2025 2002 */ 2003 + #define SWAP_UNUSE_MAX_TRIES 3 2026 2004 int try_to_unuse(unsigned int type, bool frontswap, 2027 2005 unsigned long pages_to_unuse) 2028 2006 { 2007 + struct mm_struct *prev_mm; 2008 + struct mm_struct *mm; 2009 + struct list_head *p; 2010 + int retval = 0; 2029 2011 struct swap_info_struct *si = swap_info[type]; 2030 - struct mm_struct *start_mm; 2031 - volatile unsigned char *swap_map; /* swap_map is accessed without 2032 - * locking. Mark it as volatile 2033 - * to prevent compiler doing 2034 - * something odd. 2035 - */ 2036 - unsigned char swcount; 2037 2012 struct page *page; 2038 2013 swp_entry_t entry; 2039 - unsigned int i = 0; 2040 - int retval = 0; 2014 + unsigned int i; 2015 + int retries = 0; 2041 2016 2042 - /* 2043 - * When searching mms for an entry, a good strategy is to 2044 - * start at the first mm we freed the previous entry from 2045 - * (though actually we don't notice whether we or coincidence 2046 - * freed the entry). Initialize this start_mm with a hold. 2047 - * 2048 - * A simpler strategy would be to start at the last mm we 2049 - * freed the previous entry from; but that would take less 2050 - * advantage of mmlist ordering, which clusters forked mms 2051 - * together, child after parent. If we race with dup_mmap(), we 2052 - * prefer to resolve parent before child, lest we miss entries 2053 - * duplicated after we scanned child: using last mm would invert 2054 - * that. 2055 - */ 2056 - start_mm = &init_mm; 2057 - mmget(&init_mm); 2017 + if (!si->inuse_pages) 2018 + return 0; 2058 2019 2059 - /* 2060 - * Keep on scanning until all entries have gone. Usually, 2061 - * one pass through swap_map is enough, but not necessarily: 2062 - * there are races when an instance of an entry might be missed. 2063 - */ 2064 - while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { 2020 + if (!frontswap) 2021 + pages_to_unuse = 0; 2022 + 2023 + retry: 2024 + retval = shmem_unuse(type, frontswap, &pages_to_unuse); 2025 + if (retval) 2026 + goto out; 2027 + 2028 + prev_mm = &init_mm; 2029 + mmget(prev_mm); 2030 + 2031 + spin_lock(&mmlist_lock); 2032 + p = &init_mm.mmlist; 2033 + while ((p = p->next) != &init_mm.mmlist) { 2065 2034 if (signal_pending(current)) { 2066 2035 retval = -EINTR; 2067 2036 break; 2068 2037 } 2069 2038 2070 - /* 2071 - * Get a page for the entry, using the existing swap 2072 - * cache page if there is one. Otherwise, get a clean 2073 - * page and read the swap into it. 
2074 - */ 2075 - swap_map = &si->swap_map[i]; 2076 - entry = swp_entry(type, i); 2077 - page = read_swap_cache_async(entry, 2078 - GFP_HIGHUSER_MOVABLE, NULL, 0, false); 2079 - if (!page) { 2080 - /* 2081 - * Either swap_duplicate() failed because entry 2082 - * has been freed independently, and will not be 2083 - * reused since sys_swapoff() already disabled 2084 - * allocation from here, or alloc_page() failed. 2085 - */ 2086 - swcount = *swap_map; 2087 - /* 2088 - * We don't hold lock here, so the swap entry could be 2089 - * SWAP_MAP_BAD (when the cluster is discarding). 2090 - * Instead of fail out, We can just skip the swap 2091 - * entry because swapoff will wait for discarding 2092 - * finish anyway. 2093 - */ 2094 - if (!swcount || swcount == SWAP_MAP_BAD) 2095 - continue; 2096 - retval = -ENOMEM; 2097 - break; 2098 - } 2099 - 2100 - /* 2101 - * Don't hold on to start_mm if it looks like exiting. 2102 - */ 2103 - if (atomic_read(&start_mm->mm_users) == 1) { 2104 - mmput(start_mm); 2105 - start_mm = &init_mm; 2106 - mmget(&init_mm); 2107 - } 2108 - 2109 - /* 2110 - * Wait for and lock page. When do_swap_page races with 2111 - * try_to_unuse, do_swap_page can handle the fault much 2112 - * faster than try_to_unuse can locate the entry. This 2113 - * apparently redundant "wait_on_page_locked" lets try_to_unuse 2114 - * defer to do_swap_page in such a case - in some tests, 2115 - * do_swap_page and try_to_unuse repeatedly compete. 2116 - */ 2117 - wait_on_page_locked(page); 2118 - wait_on_page_writeback(page); 2119 - lock_page(page); 2120 - wait_on_page_writeback(page); 2121 - 2122 - /* 2123 - * Remove all references to entry. 2124 - */ 2125 - swcount = *swap_map; 2126 - if (swap_count(swcount) == SWAP_MAP_SHMEM) { 2127 - retval = shmem_unuse(entry, page); 2128 - /* page has already been unlocked and released */ 2129 - if (retval < 0) 2130 - break; 2039 + mm = list_entry(p, struct mm_struct, mmlist); 2040 + if (!mmget_not_zero(mm)) 2131 2041 continue; 2132 - } 2133 - if (swap_count(swcount) && start_mm != &init_mm) 2134 - retval = unuse_mm(start_mm, entry, page); 2042 + spin_unlock(&mmlist_lock); 2043 + mmput(prev_mm); 2044 + prev_mm = mm; 2045 + retval = unuse_mm(mm, type, frontswap, &pages_to_unuse); 2135 2046 2136 - if (swap_count(*swap_map)) { 2137 - int set_start_mm = (*swap_map >= swcount); 2138 - struct list_head *p = &start_mm->mmlist; 2139 - struct mm_struct *new_start_mm = start_mm; 2140 - struct mm_struct *prev_mm = start_mm; 2141 - struct mm_struct *mm; 2142 - 2143 - mmget(new_start_mm); 2144 - mmget(prev_mm); 2145 - spin_lock(&mmlist_lock); 2146 - while (swap_count(*swap_map) && !retval && 2147 - (p = p->next) != &start_mm->mmlist) { 2148 - mm = list_entry(p, struct mm_struct, mmlist); 2149 - if (!mmget_not_zero(mm)) 2150 - continue; 2151 - spin_unlock(&mmlist_lock); 2152 - mmput(prev_mm); 2153 - prev_mm = mm; 2154 - 2155 - cond_resched(); 2156 - 2157 - swcount = *swap_map; 2158 - if (!swap_count(swcount)) /* any usage ? 
*/ 2159 - ; 2160 - else if (mm == &init_mm) 2161 - set_start_mm = 1; 2162 - else 2163 - retval = unuse_mm(mm, entry, page); 2164 - 2165 - if (set_start_mm && *swap_map < swcount) { 2166 - mmput(new_start_mm); 2167 - mmget(mm); 2168 - new_start_mm = mm; 2169 - set_start_mm = 0; 2170 - } 2171 - spin_lock(&mmlist_lock); 2172 - } 2173 - spin_unlock(&mmlist_lock); 2174 - mmput(prev_mm); 2175 - mmput(start_mm); 2176 - start_mm = new_start_mm; 2177 - } 2178 2047 if (retval) { 2179 - unlock_page(page); 2180 - put_page(page); 2181 - break; 2048 + mmput(prev_mm); 2049 + goto out; 2182 2050 } 2183 - 2184 - /* 2185 - * If a reference remains (rare), we would like to leave 2186 - * the page in the swap cache; but try_to_unmap could 2187 - * then re-duplicate the entry once we drop page lock, 2188 - * so we might loop indefinitely; also, that page could 2189 - * not be swapped out to other storage meanwhile. So: 2190 - * delete from cache even if there's another reference, 2191 - * after ensuring that the data has been saved to disk - 2192 - * since if the reference remains (rarer), it will be 2193 - * read from disk into another page. Splitting into two 2194 - * pages would be incorrect if swap supported "shared 2195 - * private" pages, but they are handled by tmpfs files. 2196 - * 2197 - * Given how unuse_vma() targets one particular offset 2198 - * in an anon_vma, once the anon_vma has been determined, 2199 - * this splitting happens to be just what is needed to 2200 - * handle where KSM pages have been swapped out: re-reading 2201 - * is unnecessarily slow, but we can fix that later on. 2202 - */ 2203 - if (swap_count(*swap_map) && 2204 - PageDirty(page) && PageSwapCache(page)) { 2205 - struct writeback_control wbc = { 2206 - .sync_mode = WB_SYNC_NONE, 2207 - }; 2208 - 2209 - swap_writepage(compound_head(page), &wbc); 2210 - lock_page(page); 2211 - wait_on_page_writeback(page); 2212 - } 2213 - 2214 - /* 2215 - * It is conceivable that a racing task removed this page from 2216 - * swap cache just before we acquired the page lock at the top, 2217 - * or while we dropped it in unuse_mm(). The page might even 2218 - * be back in swap cache on another swap area: that we must not 2219 - * delete, since it may not have been written out to swap yet. 2220 - */ 2221 - if (PageSwapCache(page) && 2222 - likely(page_private(page) == entry.val) && 2223 - (!PageTransCompound(page) || 2224 - !swap_page_trans_huge_swapped(si, entry))) 2225 - delete_from_swap_cache(compound_head(page)); 2226 - 2227 - /* 2228 - * So we could skip searching mms once swap count went 2229 - * to 1, we did not mark any present ptes as dirty: must 2230 - * mark page dirty so shrink_page_list will preserve it. 2231 - */ 2232 - SetPageDirty(page); 2233 - unlock_page(page); 2234 - put_page(page); 2235 2051 2236 2052 /* 2237 2053 * Make sure that we aren't completely killing 2238 2054 * interactive performance. 2239 2055 */ 2240 2056 cond_resched(); 2241 - if (frontswap && pages_to_unuse > 0) { 2242 - if (!--pages_to_unuse) 2243 - break; 2244 - } 2057 + spin_lock(&mmlist_lock); 2058 + } 2059 + spin_unlock(&mmlist_lock); 2060 + 2061 + mmput(prev_mm); 2062 + 2063 + i = 0; 2064 + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { 2065 + 2066 + entry = swp_entry(type, i); 2067 + page = find_get_page(swap_address_space(entry), i); 2068 + if (!page) 2069 + continue; 2070 + 2071 + /* 2072 + * It is conceivable that a racing task removed this page from 2073 + * swap cache just before we acquired the page lock. 
The page 2074 + * might even be back in swap cache on another swap area. But 2075 + * that is okay, try_to_free_swap() only removes stale pages. 2076 + */ 2077 + lock_page(page); 2078 + wait_on_page_writeback(page); 2079 + try_to_free_swap(page); 2080 + unlock_page(page); 2081 + put_page(page); 2082 + 2083 + /* 2084 + * For frontswap, we just need to unuse pages_to_unuse, if 2085 + * it was specified. Need not check frontswap again here as 2086 + * we already zeroed out pages_to_unuse if not frontswap. 2087 + */ 2088 + if (pages_to_unuse && --pages_to_unuse == 0) 2089 + goto out; 2245 2090 } 2246 2091 2247 - mmput(start_mm); 2248 - return retval; 2092 + /* 2093 + * Lets check again to see if there are still swap entries in the map. 2094 + * If yes, we would need to do retry the unuse logic again. 2095 + * Under global memory pressure, swap entries can be reinserted back 2096 + * into process space after the mmlist loop above passes over them. 2097 + * Its not worth continuosuly retrying to unuse the swap in this case. 2098 + * So we try SWAP_UNUSE_MAX_TRIES times. 2099 + */ 2100 + if (++retries >= SWAP_UNUSE_MAX_TRIES) 2101 + retval = -EBUSY; 2102 + else if (si->inuse_pages) 2103 + goto retry; 2104 + 2105 + out: 2106 + return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval; 2249 2107 } 2250 2108 2251 2109 /* ··· 2162 2258 struct swap_extent *se; 2163 2259 pgoff_t offset; 2164 2260 2165 - sis = swap_info[swp_type(entry)]; 2261 + sis = swp_swap_info(entry); 2166 2262 *bdev = sis->bdev; 2167 2263 2168 2264 offset = swp_offset(entry); ··· 2604 2700 if (!l) 2605 2701 return SEQ_START_TOKEN; 2606 2702 2607 - for (type = 0; type < nr_swapfiles; type++) { 2608 - smp_rmb(); /* read nr_swapfiles before swap_info[type] */ 2609 - si = swap_info[type]; 2703 + for (type = 0; (si = swap_type_to_swap_info(type)); type++) { 2610 2704 if (!(si->flags & SWP_USED) || !si->swap_map) 2611 2705 continue; 2612 2706 if (!--l) ··· 2624 2722 else 2625 2723 type = si->type + 1; 2626 2724 2627 - for (; type < nr_swapfiles; type++) { 2628 - smp_rmb(); /* read nr_swapfiles before swap_info[type] */ 2629 - si = swap_info[type]; 2725 + for (; (si = swap_type_to_swap_info(type)); type++) { 2630 2726 if (!(si->flags & SWP_USED) || !si->swap_map) 2631 2727 continue; 2632 2728 ++*pos; ··· 2713 2813 struct swap_info_struct *p; 2714 2814 unsigned int type; 2715 2815 int i; 2716 - int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node); 2717 2816 2718 - p = kvzalloc(size, GFP_KERNEL); 2817 + p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); 2719 2818 if (!p) 2720 2819 return ERR_PTR(-ENOMEM); 2721 2820 ··· 2730 2831 } 2731 2832 if (type >= nr_swapfiles) { 2732 2833 p->type = type; 2733 - swap_info[type] = p; 2834 + WRITE_ONCE(swap_info[type], p); 2734 2835 /* 2735 2836 * Write swap_info[type] before nr_swapfiles, in case a 2736 2837 * racing procfs swap_start() or swap_next() is reading them. 2737 2838 * (We never shrink nr_swapfiles, we never free this entry.) 
2738 2839 */ 2739 2840 smp_wmb(); 2740 - nr_swapfiles++; 2841 + WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1); 2741 2842 } else { 2742 2843 kvfree(p); 2743 2844 p = swap_info[type]; ··· 3257 3358 { 3258 3359 struct swap_info_struct *p; 3259 3360 struct swap_cluster_info *ci; 3260 - unsigned long offset, type; 3361 + unsigned long offset; 3261 3362 unsigned char count; 3262 3363 unsigned char has_cache; 3263 3364 int err = -EINVAL; ··· 3265 3366 if (non_swap_entry(entry)) 3266 3367 goto out; 3267 3368 3268 - type = swp_type(entry); 3269 - if (type >= nr_swapfiles) 3369 + p = swp_swap_info(entry); 3370 + if (!p) 3270 3371 goto bad_file; 3271 - p = swap_info[type]; 3372 + 3272 3373 offset = swp_offset(entry); 3273 3374 if (unlikely(offset >= p->max)) 3274 3375 goto out; ··· 3365 3466 3366 3467 struct swap_info_struct *swp_swap_info(swp_entry_t entry) 3367 3468 { 3368 - return swap_info[swp_type(entry)]; 3469 + return swap_type_to_swap_info(swp_type(entry)); 3369 3470 } 3370 3471 3371 3472 struct swap_info_struct *page_swap_info(struct page *page)
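The bulk of the swapfile.c churn above replaces the old per-entry search in try_to_unuse() with a single walk over every mm on init_mm.mmlist (plus a shmem pass), followed by a sweep of whatever is left in the swap cache, bounded by SWAP_UNUSE_MAX_TRIES retries. A visible side effect is that find_next_to_unuse() no longer wraps around to recheck earlier slots: it scans forward from prev + 1 and reports 0 once nothing after prev is in use. Below is a minimal userspace sketch of that forward-only scan; the array contents, sizes and names are invented purely for illustration.

    #include <stdio.h>

    /*
     * Return the index of the next non-zero slot after 'prev', or 0 if no
     * slot after 'prev' is in use -- mirroring the simplified scan in the
     * hunks above, which no longer loops back to recheck earlier entries.
     */
    static unsigned int next_in_use(const unsigned char *map, unsigned int max,
                                    unsigned int prev)
    {
            unsigned int i;

            for (i = prev + 1; i < max; i++) {
                    if (map[i])
                            return i;
            }
            return 0;
    }

    int main(void)
    {
            unsigned char map[8] = { 0, 1, 0, 0, 1, 0, 0, 0 };
            unsigned int i = 0;

            while ((i = next_in_use(map, 8, i)) != 0)
                    printf("slot %u is in use\n", i);   /* prints 1, then 4 */
            return 0;
    }

In the kernel code the same "returns 0 when exhausted" convention is what terminates the swap-cache sweep loop in try_to_unuse().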
+4 -2
mm/truncate.c
··· 539 539 * invalidate_mapping_pages() will not block on IO activity. It will not 540 540 * invalidate pages which are dirty, locked, under writeback or mapped into 541 541 * pagetables. 542 + * 543 + * Return: the number of the pages that were invalidated 542 544 */ 543 545 unsigned long invalidate_mapping_pages(struct address_space *mapping, 544 546 pgoff_t start, pgoff_t end) ··· 666 664 * Any pages which are found to be mapped into pagetables are unmapped prior to 667 665 * invalidation. 668 666 * 669 - * Returns -EBUSY if any pages could not be invalidated. 667 + * Return: -EBUSY if any pages could not be invalidated. 670 668 */ 671 669 int invalidate_inode_pages2_range(struct address_space *mapping, 672 670 pgoff_t start, pgoff_t end) ··· 763 761 * Any pages which are found to be mapped into pagetables are unmapped prior to 764 762 * invalidation. 765 763 * 766 - * Returns -EBUSY if any pages could not be invalidated. 764 + * Return: -EBUSY if any pages could not be invalidated. 767 765 */ 768 766 int invalidate_inode_pages2(struct address_space *mapping) 769 767 {
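The truncate.c hunks (and most of the util.c ones that follow) are documentation conversions: free-form "Returns ..." sentences become the structured "Return:" section that kernel-doc parses into its own output section. For reference, the shape of such a comment on a hypothetical helper looks like this:

    #include <stdio.h>

    /**
     * count_even - count the even values in an array
     * @vals: values to inspect
     * @n: number of elements in @vals
     *
     * Iterates over @vals once and does not modify the array.
     *
     * Return: the number of elements in @vals that are even
     */
    static unsigned int count_even(const int *vals, unsigned int n)
    {
            unsigned int i, even = 0;

            for (i = 0; i < n; i++)
                    if ((vals[i] % 2) == 0)
                            even++;
            return even;
    }

    int main(void)
    {
            int vals[] = { 2, 3, 4, 5 };

            printf("%u even values\n", count_even(vals, 4));
            return 0;
    }

The function itself is made up; only the comment layout reflects the convention the patches above are adopting, which is why lines such as "Returns -EBUSY if ..." become "Return: -EBUSY if ...".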
+26 -11
mm/util.c
··· 36 36 * kstrdup - allocate space for and copy an existing string 37 37 * @s: the string to duplicate 38 38 * @gfp: the GFP mask used in the kmalloc() call when allocating memory 39 + * 40 + * Return: newly allocated copy of @s or %NULL in case of error 39 41 */ 40 42 char *kstrdup(const char *s, gfp_t gfp) 41 43 { ··· 60 58 * @s: the string to duplicate 61 59 * @gfp: the GFP mask used in the kmalloc() call when allocating memory 62 60 * 63 - * Function returns source string if it is in .rodata section otherwise it 64 - * fallbacks to kstrdup. 65 - * Strings allocated by kstrdup_const should be freed by kfree_const. 61 + * Note: Strings allocated by kstrdup_const should be freed by kfree_const. 62 + * 63 + * Return: source string if it is in .rodata section otherwise 64 + * fallback to kstrdup. 66 65 */ 67 66 const char *kstrdup_const(const char *s, gfp_t gfp) 68 67 { ··· 81 78 * @gfp: the GFP mask used in the kmalloc() call when allocating memory 82 79 * 83 80 * Note: Use kmemdup_nul() instead if the size is known exactly. 81 + * 82 + * Return: newly allocated copy of @s or %NULL in case of error 84 83 */ 85 84 char *kstrndup(const char *s, size_t max, gfp_t gfp) 86 85 { ··· 108 103 * @src: memory region to duplicate 109 104 * @len: memory region length 110 105 * @gfp: GFP mask to use 106 + * 107 + * Return: newly allocated copy of @src or %NULL in case of error 111 108 */ 112 109 void *kmemdup(const void *src, size_t len, gfp_t gfp) 113 110 { ··· 127 120 * @s: The data to stringify 128 121 * @len: The size of the data 129 122 * @gfp: the GFP mask used in the kmalloc() call when allocating memory 123 + * 124 + * Return: newly allocated copy of @s with NUL-termination or %NULL in 125 + * case of error 130 126 */ 131 127 char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) 132 128 { ··· 153 143 * @src: source address in user space 154 144 * @len: number of bytes to copy 155 145 * 156 - * Returns an ERR_PTR() on failure. Result is physically 146 + * Return: an ERR_PTR() on failure. Result is physically 157 147 * contiguous, to be freed by kfree(). 158 148 */ 159 149 void *memdup_user(const void __user *src, size_t len) ··· 179 169 * @src: source address in user space 180 170 * @len: number of bytes to copy 181 171 * 182 - * Returns an ERR_PTR() on failure. Result may be not 172 + * Return: an ERR_PTR() on failure. Result may be not 183 173 * physically contiguous. Use kvfree() to free. 184 174 */ 185 175 void *vmemdup_user(const void __user *src, size_t len) ··· 203 193 * strndup_user - duplicate an existing string from user space 204 194 * @s: The string to duplicate 205 195 * @n: Maximum number of bytes to copy, including the trailing NUL. 196 + * 197 + * Return: newly allocated copy of @s or %NULL in case of error 206 198 */ 207 199 char *strndup_user(const char __user *s, long n) 208 200 { ··· 236 224 * @src: source address in user space 237 225 * @len: number of bytes to copy 238 226 * 239 - * Returns an ERR_PTR() on failure. 227 + * Return: an ERR_PTR() on failure. 240 228 */ 241 229 void *memdup_user_nul(const void __user *src, size_t len) 242 230 { ··· 322 310 * @pages: array that receives pointers to the pages pinned. 323 311 * Should be at least nr_pages long. 324 312 * 325 - * Returns number of pages pinned. This may be fewer than the number 326 - * requested. If nr_pages is 0 or negative, returns 0. If no pages 327 - * were pinned, returns -errno. 
328 - * 329 313 * get_user_pages_fast provides equivalent functionality to get_user_pages, 330 314 * operating on current and current->mm, with force=0 and vma=NULL. However 331 315 * unlike get_user_pages, it must be called without mmap_sem held. ··· 333 325 * pages have to be faulted in, it may turn out to be slightly slower so 334 326 * callers need to carefully consider what to use. On many architectures, 335 327 * get_user_pages_fast simply falls back to get_user_pages. 328 + * 329 + * Return: number of pages pinned. This may be fewer than the number 330 + * requested. If nr_pages is 0 or negative, returns 0. If no pages 331 + * were pinned, returns -errno. 336 332 */ 337 333 int __weak get_user_pages_fast(unsigned long start, 338 334 int nr_pages, int write, struct page **pages) ··· 398 386 * 399 387 * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not 400 388 * fall back to vmalloc. 389 + * 390 + * Return: pointer to the allocated memory of %NULL in case of failure 401 391 */ 402 392 void *kvmalloc_node(size_t size, gfp_t flags, int node) 403 393 { ··· 743 729 * @buffer: the buffer to copy to. 744 730 * @buflen: the length of the buffer. Larger cmdline values are truncated 745 731 * to this length. 746 - * Returns the size of the cmdline field copied. Note that the copy does 732 + * 733 + * Return: the size of the cmdline field copied. Note that the copy does 747 734 * not guarantee an ending NULL byte. 748 735 */ 749 736 int get_cmdline(struct task_struct *task, char *buffer, int buflen)
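Beyond the Return: conversions, the util.c comments now spell out the contract of the duplication helpers: a newly allocated copy on success, %NULL (or an ERR_PTR for the *_user variants) on failure. A userspace analogue of kmemdup(), shown only to illustrate that "copy or NULL" contract and not the kernel implementation:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /*
     * Userspace analogue of kmemdup(): duplicate 'len' bytes of 'src' into
     * a freshly allocated buffer, or return NULL if allocation fails.
     */
    static void *memdup(const void *src, size_t len)
    {
            void *p = malloc(len);

            if (p)
                    memcpy(p, src, len);
            return p;
    }

    int main(void)
    {
            const char msg[] = "swapoff";
            char *copy = memdup(msg, sizeof(msg));

            if (copy)
                    printf("%s\n", copy);
            free(copy);
            return 0;
    }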
+233 -204
mm/vmalloc.c
··· 498 498 } 499 499 500 500 found: 501 - if (addr + size > vend) 501 + /* 502 + * Check also calculated address against the vstart, 503 + * because it can be 0 because of big align request. 504 + */ 505 + if (addr + size > vend || addr < vstart) 502 506 goto overflow; 503 507 504 508 va->va_start = addr; ··· 844 840 * @order: how many 2^order pages should be occupied in newly allocated block 845 841 * @gfp_mask: flags for the page level allocator 846 842 * 847 - * Returns: virtual address in a newly allocated block or ERR_PTR(-errno) 843 + * Return: virtual address in a newly allocated block or ERR_PTR(-errno) 848 844 */ 849 845 static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) 850 846 { ··· 1191 1187 EXPORT_SYMBOL(vm_map_ram); 1192 1188 1193 1189 static struct vm_struct *vmlist __initdata; 1190 + 1194 1191 /** 1195 1192 * vm_area_add_early - add vmap area early during boot 1196 1193 * @vm: vm_struct to add ··· 1426 1421 } 1427 1422 1428 1423 /** 1429 - * get_vm_area - reserve a contiguous kernel virtual area 1430 - * @size: size of the area 1431 - * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 1424 + * get_vm_area - reserve a contiguous kernel virtual area 1425 + * @size: size of the area 1426 + * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 1432 1427 * 1433 - * Search an area of @size in the kernel virtual mapping area, 1434 - * and reserved it for out purposes. Returns the area descriptor 1435 - * on success or %NULL on failure. 1428 + * Search an area of @size in the kernel virtual mapping area, 1429 + * and reserved it for out purposes. Returns the area descriptor 1430 + * on success or %NULL on failure. 1431 + * 1432 + * Return: the area descriptor on success or %NULL on failure. 1436 1433 */ 1437 1434 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 1438 1435 { ··· 1451 1444 } 1452 1445 1453 1446 /** 1454 - * find_vm_area - find a continuous kernel virtual area 1455 - * @addr: base address 1447 + * find_vm_area - find a continuous kernel virtual area 1448 + * @addr: base address 1456 1449 * 1457 - * Search for the kernel VM area starting at @addr, and return it. 1458 - * It is up to the caller to do all required locking to keep the returned 1459 - * pointer valid. 1450 + * Search for the kernel VM area starting at @addr, and return it. 1451 + * It is up to the caller to do all required locking to keep the returned 1452 + * pointer valid. 1453 + * 1454 + * Return: pointer to the found area or %NULL on faulure 1460 1455 */ 1461 1456 struct vm_struct *find_vm_area(const void *addr) 1462 1457 { ··· 1472 1463 } 1473 1464 1474 1465 /** 1475 - * remove_vm_area - find and remove a continuous kernel virtual area 1476 - * @addr: base address 1466 + * remove_vm_area - find and remove a continuous kernel virtual area 1467 + * @addr: base address 1477 1468 * 1478 - * Search for the kernel VM area starting at @addr, and remove it. 1479 - * This function returns the found VM area, but using it is NOT safe 1480 - * on SMP machines, except for its size or flags. 1469 + * Search for the kernel VM area starting at @addr, and remove it. 1470 + * This function returns the found VM area, but using it is NOT safe 1471 + * on SMP machines, except for its size or flags. 
1472 + * 1473 + * Return: pointer to the found area or %NULL on faulure 1481 1474 */ 1482 1475 struct vm_struct *remove_vm_area(const void *addr) 1483 1476 { ··· 1516 1505 addr)) 1517 1506 return; 1518 1507 1519 - area = find_vmap_area((unsigned long)addr)->vm; 1508 + area = find_vm_area(addr); 1520 1509 if (unlikely(!area)) { 1521 1510 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 1522 1511 addr); ··· 1559 1548 } 1560 1549 1561 1550 /** 1562 - * vfree_atomic - release memory allocated by vmalloc() 1563 - * @addr: memory base address 1551 + * vfree_atomic - release memory allocated by vmalloc() 1552 + * @addr: memory base address 1564 1553 * 1565 - * This one is just like vfree() but can be called in any atomic context 1566 - * except NMIs. 1554 + * This one is just like vfree() but can be called in any atomic context 1555 + * except NMIs. 1567 1556 */ 1568 1557 void vfree_atomic(const void *addr) 1569 1558 { ··· 1576 1565 __vfree_deferred(addr); 1577 1566 } 1578 1567 1568 + static void __vfree(const void *addr) 1569 + { 1570 + if (unlikely(in_interrupt())) 1571 + __vfree_deferred(addr); 1572 + else 1573 + __vunmap(addr, 1); 1574 + } 1575 + 1579 1576 /** 1580 - * vfree - release memory allocated by vmalloc() 1581 - * @addr: memory base address 1577 + * vfree - release memory allocated by vmalloc() 1578 + * @addr: memory base address 1582 1579 * 1583 - * Free the virtually continuous memory area starting at @addr, as 1584 - * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is 1585 - * NULL, no operation is performed. 1580 + * Free the virtually continuous memory area starting at @addr, as 1581 + * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is 1582 + * NULL, no operation is performed. 1586 1583 * 1587 - * Must not be called in NMI context (strictly speaking, only if we don't 1588 - * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 1589 - * conventions for vfree() arch-depenedent would be a really bad idea) 1584 + * Must not be called in NMI context (strictly speaking, only if we don't 1585 + * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 1586 + * conventions for vfree() arch-depenedent would be a really bad idea) 1590 1587 * 1591 - * May sleep if called *not* from interrupt context. 1588 + * May sleep if called *not* from interrupt context. 1592 1589 * 1593 - * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) 1590 + * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) 1594 1591 */ 1595 1592 void vfree(const void *addr) 1596 1593 { ··· 1610 1591 1611 1592 if (!addr) 1612 1593 return; 1613 - if (unlikely(in_interrupt())) 1614 - __vfree_deferred(addr); 1615 - else 1616 - __vunmap(addr, 1); 1594 + 1595 + __vfree(addr); 1617 1596 } 1618 1597 EXPORT_SYMBOL(vfree); 1619 1598 1620 1599 /** 1621 - * vunmap - release virtual mapping obtained by vmap() 1622 - * @addr: memory base address 1600 + * vunmap - release virtual mapping obtained by vmap() 1601 + * @addr: memory base address 1623 1602 * 1624 - * Free the virtually contiguous memory area starting at @addr, 1625 - * which was created from the page array passed to vmap(). 1603 + * Free the virtually contiguous memory area starting at @addr, 1604 + * which was created from the page array passed to vmap(). 1626 1605 * 1627 - * Must not be called in interrupt context. 1606 + * Must not be called in interrupt context. 
1628 1607 */ 1629 1608 void vunmap(const void *addr) 1630 1609 { ··· 1634 1617 EXPORT_SYMBOL(vunmap); 1635 1618 1636 1619 /** 1637 - * vmap - map an array of pages into virtually contiguous space 1638 - * @pages: array of page pointers 1639 - * @count: number of pages to map 1640 - * @flags: vm_area->flags 1641 - * @prot: page protection for the mapping 1620 + * vmap - map an array of pages into virtually contiguous space 1621 + * @pages: array of page pointers 1622 + * @count: number of pages to map 1623 + * @flags: vm_area->flags 1624 + * @prot: page protection for the mapping 1642 1625 * 1643 - * Maps @count pages from @pages into contiguous kernel virtual 1644 - * space. 1626 + * Maps @count pages from @pages into contiguous kernel virtual 1627 + * space. 1628 + * 1629 + * Return: the address of the area or %NULL on failure 1645 1630 */ 1646 1631 void *vmap(struct page **pages, unsigned int count, 1647 - unsigned long flags, pgprot_t prot) 1632 + unsigned long flags, pgprot_t prot) 1648 1633 { 1649 1634 struct vm_struct *area; 1650 1635 unsigned long size; /* In bytes */ ··· 1728 1709 warn_alloc(gfp_mask, NULL, 1729 1710 "vmalloc: allocation failure, allocated %ld of %ld bytes", 1730 1711 (area->nr_pages*PAGE_SIZE), area->size); 1731 - vfree(area->addr); 1712 + __vfree(area->addr); 1732 1713 return NULL; 1733 1714 } 1734 1715 1735 1716 /** 1736 - * __vmalloc_node_range - allocate virtually contiguous memory 1737 - * @size: allocation size 1738 - * @align: desired alignment 1739 - * @start: vm area range start 1740 - * @end: vm area range end 1741 - * @gfp_mask: flags for the page level allocator 1742 - * @prot: protection mask for the allocated pages 1743 - * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) 1744 - * @node: node to use for allocation or NUMA_NO_NODE 1745 - * @caller: caller's return address 1717 + * __vmalloc_node_range - allocate virtually contiguous memory 1718 + * @size: allocation size 1719 + * @align: desired alignment 1720 + * @start: vm area range start 1721 + * @end: vm area range end 1722 + * @gfp_mask: flags for the page level allocator 1723 + * @prot: protection mask for the allocated pages 1724 + * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) 1725 + * @node: node to use for allocation or NUMA_NO_NODE 1726 + * @caller: caller's return address 1746 1727 * 1747 - * Allocate enough pages to cover @size from the page level 1748 - * allocator with @gfp_mask flags. Map them into contiguous 1749 - * kernel virtual space, using a pagetable protection of @prot. 1728 + * Allocate enough pages to cover @size from the page level 1729 + * allocator with @gfp_mask flags. Map them into contiguous 1730 + * kernel virtual space, using a pagetable protection of @prot. 1731 + * 1732 + * Return: the address of the area or %NULL on failure 1750 1733 */ 1751 1734 void *__vmalloc_node_range(unsigned long size, unsigned long align, 1752 1735 unsigned long start, unsigned long end, gfp_t gfp_mask, ··· 1789 1768 return NULL; 1790 1769 } 1791 1770 1771 + /* 1772 + * This is only for performance analysis of vmalloc and stress purpose. 1773 + * It is required by vmalloc test module, therefore do not use it other 1774 + * than that. 
1775 + */ 1776 + #ifdef CONFIG_TEST_VMALLOC_MODULE 1777 + EXPORT_SYMBOL_GPL(__vmalloc_node_range); 1778 + #endif 1779 + 1792 1780 /** 1793 - * __vmalloc_node - allocate virtually contiguous memory 1794 - * @size: allocation size 1795 - * @align: desired alignment 1796 - * @gfp_mask: flags for the page level allocator 1797 - * @prot: protection mask for the allocated pages 1798 - * @node: node to use for allocation or NUMA_NO_NODE 1799 - * @caller: caller's return address 1781 + * __vmalloc_node - allocate virtually contiguous memory 1782 + * @size: allocation size 1783 + * @align: desired alignment 1784 + * @gfp_mask: flags for the page level allocator 1785 + * @prot: protection mask for the allocated pages 1786 + * @node: node to use for allocation or NUMA_NO_NODE 1787 + * @caller: caller's return address 1800 1788 * 1801 - * Allocate enough pages to cover @size from the page level 1802 - * allocator with @gfp_mask flags. Map them into contiguous 1803 - * kernel virtual space, using a pagetable protection of @prot. 1789 + * Allocate enough pages to cover @size from the page level 1790 + * allocator with @gfp_mask flags. Map them into contiguous 1791 + * kernel virtual space, using a pagetable protection of @prot. 1804 1792 * 1805 - * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL 1806 - * and __GFP_NOFAIL are not supported 1793 + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL 1794 + * and __GFP_NOFAIL are not supported 1807 1795 * 1808 - * Any use of gfp flags outside of GFP_KERNEL should be consulted 1809 - * with mm people. 1796 + * Any use of gfp flags outside of GFP_KERNEL should be consulted 1797 + * with mm people. 1810 1798 * 1799 + * Return: pointer to the allocated memory or %NULL on error 1811 1800 */ 1812 1801 static void *__vmalloc_node(unsigned long size, unsigned long align, 1813 1802 gfp_t gfp_mask, pgprot_t prot, ··· 1849 1818 } 1850 1819 1851 1820 /** 1852 - * vmalloc - allocate virtually contiguous memory 1853 - * @size: allocation size 1854 - * Allocate enough pages to cover @size from the page level 1855 - * allocator and map them into contiguous kernel virtual space. 1821 + * vmalloc - allocate virtually contiguous memory 1822 + * @size: allocation size 1856 1823 * 1857 - * For tight control over page level allocator and protection flags 1858 - * use __vmalloc() instead. 1824 + * Allocate enough pages to cover @size from the page level 1825 + * allocator and map them into contiguous kernel virtual space. 1826 + * 1827 + * For tight control over page level allocator and protection flags 1828 + * use __vmalloc() instead. 1829 + * 1830 + * Return: pointer to the allocated memory or %NULL on error 1859 1831 */ 1860 1832 void *vmalloc(unsigned long size) 1861 1833 { ··· 1868 1834 EXPORT_SYMBOL(vmalloc); 1869 1835 1870 1836 /** 1871 - * vzalloc - allocate virtually contiguous memory with zero fill 1872 - * @size: allocation size 1873 - * Allocate enough pages to cover @size from the page level 1874 - * allocator and map them into contiguous kernel virtual space. 1875 - * The memory allocated is set to zero. 1837 + * vzalloc - allocate virtually contiguous memory with zero fill 1838 + * @size: allocation size 1876 1839 * 1877 - * For tight control over page level allocator and protection flags 1878 - * use __vmalloc() instead. 1840 + * Allocate enough pages to cover @size from the page level 1841 + * allocator and map them into contiguous kernel virtual space. 1842 + * The memory allocated is set to zero. 
1843 + * 1844 + * For tight control over page level allocator and protection flags 1845 + * use __vmalloc() instead. 1846 + * 1847 + * Return: pointer to the allocated memory or %NULL on error 1879 1848 */ 1880 1849 void *vzalloc(unsigned long size) 1881 1850 { ··· 1893 1856 * 1894 1857 * The resulting memory area is zeroed so it can be mapped to userspace 1895 1858 * without leaking data. 1859 + * 1860 + * Return: pointer to the allocated memory or %NULL on error 1896 1861 */ 1897 1862 void *vmalloc_user(unsigned long size) 1898 1863 { 1899 - struct vm_struct *area; 1900 - void *ret; 1901 - 1902 - ret = __vmalloc_node(size, SHMLBA, 1903 - GFP_KERNEL | __GFP_ZERO, 1904 - PAGE_KERNEL, NUMA_NO_NODE, 1905 - __builtin_return_address(0)); 1906 - if (ret) { 1907 - area = find_vm_area(ret); 1908 - area->flags |= VM_USERMAP; 1909 - } 1910 - return ret; 1864 + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 1865 + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, 1866 + VM_USERMAP, NUMA_NO_NODE, 1867 + __builtin_return_address(0)); 1911 1868 } 1912 1869 EXPORT_SYMBOL(vmalloc_user); 1913 1870 1914 1871 /** 1915 - * vmalloc_node - allocate memory on a specific node 1916 - * @size: allocation size 1917 - * @node: numa node 1872 + * vmalloc_node - allocate memory on a specific node 1873 + * @size: allocation size 1874 + * @node: numa node 1918 1875 * 1919 - * Allocate enough pages to cover @size from the page level 1920 - * allocator and map them into contiguous kernel virtual space. 1876 + * Allocate enough pages to cover @size from the page level 1877 + * allocator and map them into contiguous kernel virtual space. 1921 1878 * 1922 - * For tight control over page level allocator and protection flags 1923 - * use __vmalloc() instead. 1879 + * For tight control over page level allocator and protection flags 1880 + * use __vmalloc() instead. 1881 + * 1882 + * Return: pointer to the allocated memory or %NULL on error 1924 1883 */ 1925 1884 void *vmalloc_node(unsigned long size, int node) 1926 1885 { ··· 1936 1903 * 1937 1904 * For tight control over page level allocator and protection flags 1938 1905 * use __vmalloc_node() instead. 1906 + * 1907 + * Return: pointer to the allocated memory or %NULL on error 1939 1908 */ 1940 1909 void *vzalloc_node(unsigned long size, int node) 1941 1910 { ··· 1947 1912 EXPORT_SYMBOL(vzalloc_node); 1948 1913 1949 1914 /** 1950 - * vmalloc_exec - allocate virtually contiguous, executable memory 1951 - * @size: allocation size 1915 + * vmalloc_exec - allocate virtually contiguous, executable memory 1916 + * @size: allocation size 1952 1917 * 1953 - * Kernel-internal function to allocate enough pages to cover @size 1954 - * the page level allocator and map them into contiguous and 1955 - * executable kernel virtual space. 1918 + * Kernel-internal function to allocate enough pages to cover @size 1919 + * the page level allocator and map them into contiguous and 1920 + * executable kernel virtual space. 1956 1921 * 1957 - * For tight control over page level allocator and protection flags 1958 - * use __vmalloc() instead. 1922 + * For tight control over page level allocator and protection flags 1923 + * use __vmalloc() instead. 
1924 + * 1925 + * Return: pointer to the allocated memory or %NULL on error 1959 1926 */ 1960 - 1961 1927 void *vmalloc_exec(unsigned long size) 1962 1928 { 1963 1929 return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, ··· 1978 1942 #endif 1979 1943 1980 1944 /** 1981 - * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 1982 - * @size: allocation size 1945 + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 1946 + * @size: allocation size 1983 1947 * 1984 - * Allocate enough 32bit PA addressable pages to cover @size from the 1985 - * page level allocator and map them into contiguous kernel virtual space. 1948 + * Allocate enough 32bit PA addressable pages to cover @size from the 1949 + * page level allocator and map them into contiguous kernel virtual space. 1950 + * 1951 + * Return: pointer to the allocated memory or %NULL on error 1986 1952 */ 1987 1953 void *vmalloc_32(unsigned long size) 1988 1954 { ··· 1995 1957 1996 1958 /** 1997 1959 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 1998 - * @size: allocation size 1960 + * @size: allocation size 1999 1961 * 2000 1962 * The resulting memory area is 32bit addressable and zeroed so it can be 2001 1963 * mapped to userspace without leaking data. 1964 + * 1965 + * Return: pointer to the allocated memory or %NULL on error 2002 1966 */ 2003 1967 void *vmalloc_32_user(unsigned long size) 2004 1968 { 2005 - struct vm_struct *area; 2006 - void *ret; 2007 - 2008 - ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 2009 - NUMA_NO_NODE, __builtin_return_address(0)); 2010 - if (ret) { 2011 - area = find_vm_area(ret); 2012 - area->flags |= VM_USERMAP; 2013 - } 2014 - return ret; 1969 + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 1970 + GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 1971 + VM_USERMAP, NUMA_NO_NODE, 1972 + __builtin_return_address(0)); 2015 1973 } 2016 1974 EXPORT_SYMBOL(vmalloc_32_user); 2017 1975 ··· 2093 2059 } 2094 2060 2095 2061 /** 2096 - * vread() - read vmalloc area in a safe way. 2097 - * @buf: buffer for reading data 2098 - * @addr: vm address. 2099 - * @count: number of bytes to be read. 2062 + * vread() - read vmalloc area in a safe way. 2063 + * @buf: buffer for reading data 2064 + * @addr: vm address. 2065 + * @count: number of bytes to be read. 2100 2066 * 2101 - * Returns # of bytes which addr and buf should be increased. 2102 - * (same number to @count). Returns 0 if [addr...addr+count) doesn't 2103 - * includes any intersect with alive vmalloc area. 2067 + * This function checks that addr is a valid vmalloc'ed area, and 2068 + * copy data from that area to a given buffer. If the given memory range 2069 + * of [addr...addr+count) includes some valid address, data is copied to 2070 + * proper area of @buf. If there are memory holes, they'll be zero-filled. 2071 + * IOREMAP area is treated as memory hole and no copy is done. 2104 2072 * 2105 - * This function checks that addr is a valid vmalloc'ed area, and 2106 - * copy data from that area to a given buffer. If the given memory range 2107 - * of [addr...addr+count) includes some valid address, data is copied to 2108 - * proper area of @buf. If there are memory holes, they'll be zero-filled. 2109 - * IOREMAP area is treated as memory hole and no copy is done. 2073 + * If [addr...addr+count) doesn't includes any intersects with alive 2074 + * vm_struct area, returns 0. @buf should be kernel's buffer. 
2110 2075 * 2111 - * If [addr...addr+count) doesn't includes any intersects with alive 2112 - * vm_struct area, returns 0. @buf should be kernel's buffer. 2076 + * Note: In usual ops, vread() is never necessary because the caller 2077 + * should know vmalloc() area is valid and can use memcpy(). 2078 + * This is for routines which have to access vmalloc area without 2079 + * any informaion, as /dev/kmem. 2113 2080 * 2114 - * Note: In usual ops, vread() is never necessary because the caller 2115 - * should know vmalloc() area is valid and can use memcpy(). 2116 - * This is for routines which have to access vmalloc area without 2117 - * any informaion, as /dev/kmem. 2118 - * 2081 + * Return: number of bytes for which addr and buf should be increased 2082 + * (same number as @count) or %0 if [addr...addr+count) doesn't 2083 + * include any intersection with valid vmalloc area 2119 2084 */ 2120 - 2121 2085 long vread(char *buf, char *addr, unsigned long count) 2122 2086 { 2123 2087 struct vmap_area *va; ··· 2172 2140 } 2173 2141 2174 2142 /** 2175 - * vwrite() - write vmalloc area in a safe way. 2176 - * @buf: buffer for source data 2177 - * @addr: vm address. 2178 - * @count: number of bytes to be read. 2143 + * vwrite() - write vmalloc area in a safe way. 2144 + * @buf: buffer for source data 2145 + * @addr: vm address. 2146 + * @count: number of bytes to be read. 2179 2147 * 2180 - * Returns # of bytes which addr and buf should be incresed. 2181 - * (same number to @count). 2182 - * If [addr...addr+count) doesn't includes any intersect with valid 2183 - * vmalloc area, returns 0. 2148 + * This function checks that addr is a valid vmalloc'ed area, and 2149 + * copy data from a buffer to the given addr. If specified range of 2150 + * [addr...addr+count) includes some valid address, data is copied from 2151 + * proper area of @buf. If there are memory holes, no copy to hole. 2152 + * IOREMAP area is treated as memory hole and no copy is done. 2184 2153 * 2185 - * This function checks that addr is a valid vmalloc'ed area, and 2186 - * copy data from a buffer to the given addr. If specified range of 2187 - * [addr...addr+count) includes some valid address, data is copied from 2188 - * proper area of @buf. If there are memory holes, no copy to hole. 2189 - * IOREMAP area is treated as memory hole and no copy is done. 2154 + * If [addr...addr+count) doesn't includes any intersects with alive 2155 + * vm_struct area, returns 0. @buf should be kernel's buffer. 2190 2156 * 2191 - * If [addr...addr+count) doesn't includes any intersects with alive 2192 - * vm_struct area, returns 0. @buf should be kernel's buffer. 2157 + * Note: In usual ops, vwrite() is never necessary because the caller 2158 + * should know vmalloc() area is valid and can use memcpy(). 2159 + * This is for routines which have to access vmalloc area without 2160 + * any informaion, as /dev/kmem. 2193 2161 * 2194 - * Note: In usual ops, vwrite() is never necessary because the caller 2195 - * should know vmalloc() area is valid and can use memcpy(). 2196 - * This is for routines which have to access vmalloc area without 2197 - * any informaion, as /dev/kmem. 
2162 + * Return: number of bytes for which addr and buf should be 2163 + * increased (same number as @count) or %0 if [addr...addr+count) 2164 + * doesn't include any intersection with valid vmalloc area 2198 2165 */ 2199 - 2200 2166 long vwrite(char *buf, char *addr, unsigned long count) 2201 2167 { 2202 2168 struct vmap_area *va; ··· 2246 2216 } 2247 2217 2248 2218 /** 2249 - * remap_vmalloc_range_partial - map vmalloc pages to userspace 2250 - * @vma: vma to cover 2251 - * @uaddr: target user address to start at 2252 - * @kaddr: virtual address of vmalloc kernel memory 2253 - * @size: size of map area 2219 + * remap_vmalloc_range_partial - map vmalloc pages to userspace 2220 + * @vma: vma to cover 2221 + * @uaddr: target user address to start at 2222 + * @kaddr: virtual address of vmalloc kernel memory 2223 + * @size: size of map area 2254 2224 * 2255 - * Returns: 0 for success, -Exxx on failure 2225 + * Returns: 0 for success, -Exxx on failure 2256 2226 * 2257 - * This function checks that @kaddr is a valid vmalloc'ed area, 2258 - * and that it is big enough to cover the range starting at 2259 - * @uaddr in @vma. Will return failure if that criteria isn't 2260 - * met. 2227 + * This function checks that @kaddr is a valid vmalloc'ed area, 2228 + * and that it is big enough to cover the range starting at 2229 + * @uaddr in @vma. Will return failure if that criteria isn't 2230 + * met. 2261 2231 * 2262 - * Similar to remap_pfn_range() (see mm/memory.c) 2232 + * Similar to remap_pfn_range() (see mm/memory.c) 2263 2233 */ 2264 2234 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, 2265 2235 void *kaddr, unsigned long size) ··· 2278 2248 if (!(area->flags & VM_USERMAP)) 2279 2249 return -EINVAL; 2280 2250 2281 - if (kaddr + size > area->addr + area->size) 2251 + if (kaddr + size > area->addr + get_vm_area_size(area)) 2282 2252 return -EINVAL; 2283 2253 2284 2254 do { ··· 2301 2271 EXPORT_SYMBOL(remap_vmalloc_range_partial); 2302 2272 2303 2273 /** 2304 - * remap_vmalloc_range - map vmalloc pages to userspace 2305 - * @vma: vma to cover (map full range of vma) 2306 - * @addr: vmalloc memory 2307 - * @pgoff: number of pages into addr before first page to map 2274 + * remap_vmalloc_range - map vmalloc pages to userspace 2275 + * @vma: vma to cover (map full range of vma) 2276 + * @addr: vmalloc memory 2277 + * @pgoff: number of pages into addr before first page to map 2308 2278 * 2309 - * Returns: 0 for success, -Exxx on failure 2279 + * Returns: 0 for success, -Exxx on failure 2310 2280 * 2311 - * This function checks that addr is a valid vmalloc'ed area, and 2312 - * that it is big enough to cover the vma. Will return failure if 2313 - * that criteria isn't met. 2281 + * This function checks that addr is a valid vmalloc'ed area, and 2282 + * that it is big enough to cover the vma. Will return failure if 2283 + * that criteria isn't met. 
2314 2284 * 2315 - * Similar to remap_pfn_range() (see mm/memory.c) 2285 + * Similar to remap_pfn_range() (see mm/memory.c) 2316 2286 */ 2317 2287 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 2318 2288 unsigned long pgoff) ··· 2344 2314 } 2345 2315 2346 2316 /** 2347 - * alloc_vm_area - allocate a range of kernel address space 2348 - * @size: size of the area 2349 - * @ptes: returns the PTEs for the address space 2317 + * alloc_vm_area - allocate a range of kernel address space 2318 + * @size: size of the area 2319 + * @ptes: returns the PTEs for the address space 2350 2320 * 2351 - * Returns: NULL on failure, vm_struct on success 2321 + * Returns: NULL on failure, vm_struct on success 2352 2322 * 2353 - * This function reserves a range of kernel address space, and 2354 - * allocates pagetables to map that range. No actual mappings 2355 - * are created. 2323 + * This function reserves a range of kernel address space, and 2324 + * allocates pagetables to map that range. No actual mappings 2325 + * are created. 2356 2326 * 2357 - * If @ptes is non-NULL, pointers to the PTEs (in init_mm) 2358 - * allocated for the VM area are returned. 2327 + * If @ptes is non-NULL, pointers to the PTEs (in init_mm) 2328 + * allocated for the VM area are returned. 2359 2329 */ 2360 2330 struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) 2361 2331 { ··· 2781 2751 module_init(proc_vmalloc_init); 2782 2752 2783 2753 #endif 2784 -
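Two of the vmalloc.c changes are functional rather than cosmetic: vmalloc_user() and vmalloc_32_user() now pass VM_USERMAP straight into __vmalloc_node_range() instead of patching the flag onto the area afterwards, and alloc_vmap_area() additionally checks the aligned address against vstart because, as the new comment notes, a large alignment request can push the calculated address to 0. The underlying unsigned wrap-around is easy to demonstrate in plain C; the constants below are arbitrary and a 64-bit unsigned long is assumed.

    #include <stdio.h>

    #define ALIGN(x, a)     (((x) + ((a) - 1)) & ~((a) - 1))

    int main(void)
    {
            unsigned long addr  = 0xffffffffffff0000UL;     /* near the top of the space */
            unsigned long align = 0x100000UL;               /* 1 MiB alignment */

            /* Wraps around to 0, i.e. below any plausible vstart. */
            printf("ALIGN() result: %#lx\n", ALIGN(addr, align));
            return 0;
    }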
+31 -55
mm/vmscan.c
··· 374 374 */ 375 375 int prealloc_shrinker(struct shrinker *shrinker) 376 376 { 377 - size_t size = sizeof(*shrinker->nr_deferred); 377 + unsigned int size = sizeof(*shrinker->nr_deferred); 378 378 379 379 if (shrinker->flags & SHRINKER_NUMA_AWARE) 380 380 size *= nr_node_ids; ··· 952 952 */ 953 953 if (reclaimed && page_is_file_cache(page) && 954 954 !mapping_exiting(mapping) && !dax_mapping(mapping)) 955 - shadow = workingset_eviction(mapping, page); 955 + shadow = workingset_eviction(page); 956 956 __delete_from_page_cache(page, shadow); 957 957 xa_unlock_irqrestore(&mapping->i_pages, flags); 958 958 ··· 1106 1106 { 1107 1107 LIST_HEAD(ret_pages); 1108 1108 LIST_HEAD(free_pages); 1109 - int pgactivate = 0; 1110 - unsigned nr_unqueued_dirty = 0; 1111 - unsigned nr_dirty = 0; 1112 - unsigned nr_congested = 0; 1113 1109 unsigned nr_reclaimed = 0; 1114 - unsigned nr_writeback = 0; 1115 - unsigned nr_immediate = 0; 1116 - unsigned nr_ref_keep = 0; 1117 - unsigned nr_unmap_fail = 0; 1118 1110 1111 + memset(stat, 0, sizeof(*stat)); 1119 1112 cond_resched(); 1120 1113 1121 1114 while (!list_empty(page_list)) { ··· 1152 1159 */ 1153 1160 page_check_dirty_writeback(page, &dirty, &writeback); 1154 1161 if (dirty || writeback) 1155 - nr_dirty++; 1162 + stat->nr_dirty++; 1156 1163 1157 1164 if (dirty && !writeback) 1158 - nr_unqueued_dirty++; 1165 + stat->nr_unqueued_dirty++; 1159 1166 1160 1167 /* 1161 1168 * Treat this page as congested if the underlying BDI is or if ··· 1167 1174 if (((dirty || writeback) && mapping && 1168 1175 inode_write_congested(mapping->host)) || 1169 1176 (writeback && PageReclaim(page))) 1170 - nr_congested++; 1177 + stat->nr_congested++; 1171 1178 1172 1179 /* 1173 1180 * If a page at the tail of the LRU is under writeback, there ··· 1216 1223 if (current_is_kswapd() && 1217 1224 PageReclaim(page) && 1218 1225 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { 1219 - nr_immediate++; 1226 + stat->nr_immediate++; 1220 1227 goto activate_locked; 1221 1228 1222 1229 /* Case 2 above */ ··· 1234 1241 * and it's also appropriate in global reclaim. 
1235 1242 */ 1236 1243 SetPageReclaim(page); 1237 - nr_writeback++; 1244 + stat->nr_writeback++; 1238 1245 goto activate_locked; 1239 1246 1240 1247 /* Case 3 above */ ··· 1254 1261 case PAGEREF_ACTIVATE: 1255 1262 goto activate_locked; 1256 1263 case PAGEREF_KEEP: 1257 - nr_ref_keep++; 1264 + stat->nr_ref_keep++; 1258 1265 goto keep_locked; 1259 1266 case PAGEREF_RECLAIM: 1260 1267 case PAGEREF_RECLAIM_CLEAN: ··· 1319 1326 if (unlikely(PageTransHuge(page))) 1320 1327 flags |= TTU_SPLIT_HUGE_PMD; 1321 1328 if (!try_to_unmap(page, flags)) { 1322 - nr_unmap_fail++; 1329 + stat->nr_unmap_fail++; 1323 1330 goto activate_locked; 1324 1331 } 1325 1332 } ··· 1467 1474 VM_BUG_ON_PAGE(PageActive(page), page); 1468 1475 if (!PageMlocked(page)) { 1469 1476 SetPageActive(page); 1470 - pgactivate++; 1477 + stat->nr_activate++; 1471 1478 count_memcg_page_event(page, PGACTIVATE); 1472 1479 } 1473 1480 keep_locked: ··· 1482 1489 free_unref_page_list(&free_pages); 1483 1490 1484 1491 list_splice(&ret_pages, page_list); 1485 - count_vm_events(PGACTIVATE, pgactivate); 1492 + count_vm_events(PGACTIVATE, stat->nr_activate); 1486 1493 1487 - if (stat) { 1488 - stat->nr_dirty = nr_dirty; 1489 - stat->nr_congested = nr_congested; 1490 - stat->nr_unqueued_dirty = nr_unqueued_dirty; 1491 - stat->nr_writeback = nr_writeback; 1492 - stat->nr_immediate = nr_immediate; 1493 - stat->nr_activate = pgactivate; 1494 - stat->nr_ref_keep = nr_ref_keep; 1495 - stat->nr_unmap_fail = nr_unmap_fail; 1496 - } 1497 1494 return nr_reclaimed; 1498 1495 } 1499 1496 ··· 1495 1512 .priority = DEF_PRIORITY, 1496 1513 .may_unmap = 1, 1497 1514 }; 1515 + struct reclaim_stat dummy_stat; 1498 1516 unsigned long ret; 1499 1517 struct page *page, *next; 1500 1518 LIST_HEAD(clean_pages); ··· 1509 1525 } 1510 1526 1511 1527 ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, 1512 - TTU_IGNORE_ACCESS, NULL, true); 1528 + TTU_IGNORE_ACCESS, &dummy_stat, true); 1513 1529 list_splice(&clean_pages, page_list); 1514 1530 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); 1515 1531 return ret; ··· 1614 1630 1615 1631 } 1616 1632 1617 - /* 1618 - * zone_lru_lock is heavily contended. Some of the functions that 1633 + /** 1634 + * pgdat->lru_lock is heavily contended. Some of the functions that 1619 1635 * shrink the lists perform better by taking out a batch of pages 1620 1636 * and working on them outside the LRU lock. 1621 1637 * ··· 1637 1653 static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1638 1654 struct lruvec *lruvec, struct list_head *dst, 1639 1655 unsigned long *nr_scanned, struct scan_control *sc, 1640 - isolate_mode_t mode, enum lru_list lru) 1656 + enum lru_list lru) 1641 1657 { 1642 1658 struct list_head *src = &lruvec->lists[lru]; 1643 1659 unsigned long nr_taken = 0; ··· 1646 1662 unsigned long skipped = 0; 1647 1663 unsigned long scan, total_scan, nr_pages; 1648 1664 LIST_HEAD(pages_skipped); 1665 + isolate_mode_t mode = (sc->may_unmap ? 
0 : ISOLATE_UNMAPPED); 1649 1666 1650 1667 scan = 0; 1651 1668 for (total_scan = 0; ··· 1750 1765 WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); 1751 1766 1752 1767 if (PageLRU(page)) { 1753 - struct zone *zone = page_zone(page); 1768 + pg_data_t *pgdat = page_pgdat(page); 1754 1769 struct lruvec *lruvec; 1755 1770 1756 - spin_lock_irq(zone_lru_lock(zone)); 1757 - lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 1771 + spin_lock_irq(&pgdat->lru_lock); 1772 + lruvec = mem_cgroup_page_lruvec(page, pgdat); 1758 1773 if (PageLRU(page)) { 1759 1774 int lru = page_lru(page); 1760 1775 get_page(page); ··· 1762 1777 del_page_from_lru_list(page, lruvec, lru); 1763 1778 ret = 0; 1764 1779 } 1765 - spin_unlock_irq(zone_lru_lock(zone)); 1780 + spin_unlock_irq(&pgdat->lru_lock); 1766 1781 } 1767 1782 return ret; 1768 1783 } ··· 1884 1899 unsigned long nr_scanned; 1885 1900 unsigned long nr_reclaimed = 0; 1886 1901 unsigned long nr_taken; 1887 - struct reclaim_stat stat = {}; 1888 - isolate_mode_t isolate_mode = 0; 1902 + struct reclaim_stat stat; 1889 1903 int file = is_file_lru(lru); 1890 1904 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 1891 1905 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; ··· 1905 1921 1906 1922 lru_add_drain(); 1907 1923 1908 - if (!sc->may_unmap) 1909 - isolate_mode |= ISOLATE_UNMAPPED; 1910 - 1911 1924 spin_lock_irq(&pgdat->lru_lock); 1912 1925 1913 1926 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, 1914 - &nr_scanned, sc, isolate_mode, lru); 1927 + &nr_scanned, sc, lru); 1915 1928 1916 1929 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 1917 1930 reclaim_stat->recent_scanned[file] += nr_taken; ··· 1990 2009 * processes, from rmap. 1991 2010 * 1992 2011 * If the pages are mostly unmapped, the processing is fast and it is 1993 - * appropriate to hold zone_lru_lock across the whole operation. But if 2012 + * appropriate to hold pgdat->lru_lock across the whole operation. But if 1994 2013 * the pages are mapped, the processing is slow (page_referenced()) so we 1995 - * should drop zone_lru_lock around each page. It's impossible to balance 2014 + * should drop pgdat->lru_lock around each page. It's impossible to balance 1996 2015 * this, so instead we remove the pages from the LRU while processing them. 1997 2016 * It is safe to rely on PG_active against the non-LRU pages in here because 1998 2017 * nobody will play with that bit on a non-LRU page. ··· 2065 2084 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 2066 2085 unsigned nr_deactivate, nr_activate; 2067 2086 unsigned nr_rotated = 0; 2068 - isolate_mode_t isolate_mode = 0; 2069 2087 int file = is_file_lru(lru); 2070 2088 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2071 2089 2072 2090 lru_add_drain(); 2073 2091 2074 - if (!sc->may_unmap) 2075 - isolate_mode |= ISOLATE_UNMAPPED; 2076 - 2077 2092 spin_lock_irq(&pgdat->lru_lock); 2078 2093 2079 2094 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, 2080 - &nr_scanned, sc, isolate_mode, lru); 2095 + &nr_scanned, sc, lru); 2081 2096 2082 2097 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 2083 2098 reclaim_stat->recent_scanned[file] += nr_taken; ··· 2731 2754 sc->nr_reclaimed - reclaimed); 2732 2755 2733 2756 /* 2734 - * Direct reclaim and kswapd have to scan all memory 2735 - * cgroups to fulfill the overall scan target for the 2736 - * node. 2757 + * Kswapd have to scan all memory cgroups to fulfill 2758 + * the overall scan target for the node. 
2737 2759 * 2738 2760 * Limit reclaim, on the other hand, only cares about 2739 2761 * nr_to_reclaim pages to be reclaimed and it will 2740 2762 * retry with decreasing priority if one round over the 2741 2763 * whole hierarchy is not sufficient. 2742 2764 */ 2743 - if (!global_reclaim(sc) && 2765 + if (!current_is_kswapd() && 2744 2766 sc->nr_reclaimed >= sc->nr_to_reclaim) { 2745 2767 mem_cgroup_iter_break(root, memcg); 2746 2768 break; ··· 3503 3527 * 3504 3528 * kswapd scans the zones in the highmem->normal->dma direction. It skips 3505 3529 * zones which have free_pages > high_wmark_pages(zone), but once a zone is 3506 - * found to have free_pages <= high_wmark_pages(zone), any page is that zone 3530 + * found to have free_pages <= high_wmark_pages(zone), any page in that zone 3507 3531 * or lower is eligible for reclaim until at least one usable zone is 3508 3532 * balanced. 3509 3533 */
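In vmscan.c, shrink_page_list() now unconditionally zeroes and fills the caller-supplied reclaim_stat instead of keeping a parallel set of local counters, so the one caller that used to pass NULL now passes a dummy_stat it simply ignores. The same pattern, reduced to a standalone C example with invented names:

    #include <stdio.h>
    #include <string.h>

    struct stats { unsigned int processed, skipped; };

    /* The callee always zeroes and fills @st, so callers may never pass NULL. */
    static unsigned int process(const int *items, unsigned int n, struct stats *st)
    {
            unsigned int i, done = 0;

            memset(st, 0, sizeof(*st));
            for (i = 0; i < n; i++) {
                    if (items[i] < 0) {
                            st->skipped++;
                            continue;
                    }
                    st->processed++;
                    done++;
            }
            return done;
    }

    int main(void)
    {
            int items[] = { 1, -2, 3 };
            struct stats dummy;             /* caller that does not care about stats */

            printf("%u done\n", process(items, 3, &dummy));
            return 0;
    }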
+4 -11
mm/vmstat.c
··· 2121 2121 struct dentry *extfrag_debug_root; 2122 2122 2123 2123 extfrag_debug_root = debugfs_create_dir("extfrag", NULL); 2124 - if (!extfrag_debug_root) 2125 - return -ENOMEM; 2126 2124 2127 - if (!debugfs_create_file("unusable_index", 0444, 2128 - extfrag_debug_root, NULL, &unusable_file_ops)) 2129 - goto fail; 2125 + debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, 2126 + &unusable_file_ops); 2130 2127 2131 - if (!debugfs_create_file("extfrag_index", 0444, 2132 - extfrag_debug_root, NULL, &extfrag_file_ops)) 2133 - goto fail; 2128 + debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, 2129 + &extfrag_file_ops); 2134 2130 2135 2131 return 0; 2136 - fail: 2137 - debugfs_remove_recursive(extfrag_debug_root); 2138 - return -ENOMEM; 2139 2132 } 2140 2133 2141 2134 module_init(extfrag_debug_init);
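
Note on the mm/vmstat.c hunk: the error checks around the debugfs calls are dropped on purpose; in this cleanup pattern the return values of debugfs_create_dir()/debugfs_create_file() are simply ignored, since a failure only costs the debug interface. A hedged sketch of the resulting shape (names and example_fops are placeholders):

static int __init example_debugfs_init(void)
{
	struct dentry *dir;

	dir = debugfs_create_dir("example", NULL);
	/* No error handling: if debugfs is unavailable the files are
	 * simply not created and the rest of the code is unaffected. */
	debugfs_create_file("stats", 0444, dir, NULL, &example_fops);
	return 0;
}
module_init(example_debugfs_init);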
+2 -3
mm/workingset.c
··· 215 215 216 216 /** 217 217 * workingset_eviction - note the eviction of a page from memory 218 - * @mapping: address space the page was backing 219 218 * @page: the page being evicted 220 219 * 221 - * Returns a shadow entry to be stored in @mapping->i_pages in place 220 + * Returns a shadow entry to be stored in @page->mapping->i_pages in place 222 221 * of the evicted @page so that a later refault can be detected. 223 222 */ 224 - void *workingset_eviction(struct address_space *mapping, struct page *page) 223 + void *workingset_eviction(struct page *page) 225 224 { 226 225 struct pglist_data *pgdat = page_pgdat(page); 227 226 struct mem_cgroup *memcg = page_memcg(page);
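
Note on the mm/workingset.c hunk: workingset_eviction() loses its @mapping argument and derives the address_space from the page itself. A simplified, hedged sketch of the reclaim-side caller after the change (the real caller is __remove_mapping() in mm/vmscan.c; here mapping == page->mapping, already locked by the caller):

	void *shadow = NULL;

	if (page_is_file_cache(page) && !mapping_exiting(mapping))
		shadow = workingset_eviction(page);	/* no mapping argument */
	__delete_from_page_cache(page, shadow);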
+2 -1
net/core/pktgen.c
··· 158 158 #include <linux/etherdevice.h> 159 159 #include <linux/kthread.h> 160 160 #include <linux/prefetch.h> 161 + #include <linux/mmzone.h> 161 162 #include <net/net_namespace.h> 162 163 #include <net/checksum.h> 163 164 #include <net/ipv6.h> ··· 3626 3625 pkt_dev->svlan_cfi = 0; 3627 3626 pkt_dev->svlan_id = 0xffff; 3628 3627 pkt_dev->burst = 1; 3629 - pkt_dev->node = -1; 3628 + pkt_dev->node = NUMA_NO_NODE; 3630 3629 3631 3630 err = pktgen_setup_dev(t->net, pkt_dev, ifname); 3632 3631 if (err)
+2 -1
net/qrtr/qrtr.c
··· 15 15 #include <linux/netlink.h> 16 16 #include <linux/qrtr.h> 17 17 #include <linux/termios.h> /* For TIOCINQ/OUTQ */ 18 + #include <linux/numa.h> 18 19 19 20 #include <net/sock.h> 20 21 ··· 102 101 return container_of(sk, struct qrtr_sock, sk); 103 102 } 104 103 105 - static unsigned int qrtr_local_nid = -1; 104 + static unsigned int qrtr_local_nid = NUMA_NO_NODE; 106 105 107 106 /* for node ids */ 108 107 static RADIX_TREE(qrtr_nodes, GFP_KERNEL);
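
Note on the net/core/pktgen.c and net/qrtr/qrtr.c hunks: both replace a bare -1 node id with the NUMA_NO_NODE sentinel from <linux/numa.h>. A small illustrative use, not taken from the patch:

#include <linux/numa.h>
#include <linux/slab.h>

static void *alloc_no_affinity(size_t size)
{
	/* NUMA_NO_NODE means "no node preference" for node-aware allocators. */
	return kmalloc_node(size, GFP_KERNEL, NUMA_NO_NODE);
}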
-5
scripts/Makefile.kasan
··· 27 27 $(call cc-param,asan-globals=1) \ 28 28 $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \ 29 29 $(call cc-param,asan-stack=$(CONFIG_KASAN_STACK)) \ 30 - $(call cc-param,asan-use-after-scope=1) \ 31 30 $(call cc-param,asan-instrument-allocas=1) 32 - endif 33 - 34 - ifdef CONFIG_KASAN_EXTRA 35 - CFLAGS_KASAN += $(call cc-option, -fsanitize-address-use-after-scope) 36 31 endif 37 32 38 33 endif # CONFIG_KASAN_GENERIC
+8 -1
scripts/decode_stacktrace.sh
··· 37 37 symbol=${symbol#\(} 38 38 symbol=${symbol%\)} 39 39 40 + # Strip segment 41 + local segment 42 + if [[ $symbol == *:* ]] ; then 43 + segment=${symbol%%:*}: 44 + symbol=${symbol#*:} 45 + fi 46 + 40 47 # Strip the symbol name so that we could look it up 41 48 local name=${symbol%+*} 42 49 ··· 91 84 code=${code//$'\n'/' '} 92 85 93 86 # Replace old address with pretty line numbers 94 - symbol="$name ($code)" 87 + symbol="$segment$name ($code)" 95 88 } 96 89 97 90 decode_code() {
-4
scripts/gcc-plugins/Kconfig
··· 68 68 69 69 config GCC_PLUGIN_STRUCTLEAK 70 70 bool "Force initialization of variables containing userspace addresses" 71 - # Currently STRUCTLEAK inserts initialization out of live scope of 72 - # variables from KASAN point of view. This leads to KASAN false 73 - # positive reports. Prohibit this combination for now. 74 - depends on !KASAN_EXTRA 75 71 help 76 72 This plugin zero-initializes any structures containing a 77 73 __user attribute. This can prevent some classes of information
+16
tools/include/linux/numa.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_NUMA_H 3 + #define _LINUX_NUMA_H 4 + 5 + 6 + #ifdef CONFIG_NODES_SHIFT 7 + #define NODES_SHIFT CONFIG_NODES_SHIFT 8 + #else 9 + #define NODES_SHIFT 0 10 + #endif 11 + 12 + #define MAX_NUMNODES (1 << NODES_SHIFT) 13 + 14 + #define NUMA_NO_NODE (-1) 15 + 16 + #endif /* _LINUX_NUMA_H */
+4 -3
tools/perf/bench/numa.c
··· 34 34 #include <sys/types.h> 35 35 #include <linux/kernel.h> 36 36 #include <linux/time64.h> 37 + #include <linux/numa.h> 37 38 38 39 #include <numa.h> 39 40 #include <numaif.h> ··· 299 298 300 299 CPU_ZERO(&mask); 301 300 302 - if (target_node == -1) { 301 + if (target_node == NUMA_NO_NODE) { 303 302 for (cpu = 0; cpu < g->p.nr_cpus; cpu++) 304 303 CPU_SET(cpu, &mask); 305 304 } else { ··· 340 339 unsigned long nodemask; 341 340 int ret; 342 341 343 - if (node == -1) 342 + if (node == NUMA_NO_NODE) 344 343 return; 345 344 346 345 BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8); ··· 1364 1363 int cpu; 1365 1364 1366 1365 /* Allow all nodes by default: */ 1367 - td->bind_node = -1; 1366 + td->bind_node = NUMA_NO_NODE; 1368 1367 1369 1368 /* Allow all CPUs by default: */ 1370 1369 CPU_ZERO(&td->bind_cpumask);
+1
tools/testing/selftests/Makefile
··· 48 48 ifneq (1, $(quicktest)) 49 49 TARGETS += timers 50 50 endif 51 + TARGETS += tmpfs 51 52 TARGETS += user 52 53 TARGETS += vm 53 54 TARGETS += x86
+74
tools/testing/selftests/memfd/memfd_test.c
··· 54 54 return fd; 55 55 } 56 56 57 + static int mfd_assert_reopen_fd(int fd_in) 58 + { 59 + int r, fd; 60 + char path[100]; 61 + 62 + sprintf(path, "/proc/self/fd/%d", fd_in); 63 + 64 + fd = open(path, O_RDWR); 65 + if (fd < 0) { 66 + printf("re-open of existing fd %d failed\n", fd_in); 67 + abort(); 68 + } 69 + 70 + return fd; 71 + } 72 + 57 73 static void mfd_fail_new(const char *name, unsigned int flags) 58 74 { 59 75 int r; ··· 262 246 mfd_def_size, 263 247 PROT_READ | PROT_WRITE, 264 248 MAP_PRIVATE, 249 + fd, 250 + 0); 251 + if (p == MAP_FAILED) { 252 + printf("mmap() failed: %m\n"); 253 + abort(); 254 + } 255 + munmap(p, mfd_def_size); 256 + } 257 + 258 + /* Test that PROT_READ + MAP_SHARED mappings work. */ 259 + static void mfd_assert_read_shared(int fd) 260 + { 261 + void *p; 262 + 263 + /* verify PROT_READ and MAP_SHARED *is* allowed */ 264 + p = mmap(NULL, 265 + mfd_def_size, 266 + PROT_READ, 267 + MAP_SHARED, 265 268 fd, 266 269 0); 267 270 if (p == MAP_FAILED) { ··· 728 693 } 729 694 730 695 /* 696 + * Test SEAL_FUTURE_WRITE 697 + * Test whether SEAL_FUTURE_WRITE actually prevents modifications. 698 + */ 699 + static void test_seal_future_write(void) 700 + { 701 + int fd, fd2; 702 + void *p; 703 + 704 + printf("%s SEAL-FUTURE-WRITE\n", memfd_str); 705 + 706 + fd = mfd_assert_new("kern_memfd_seal_future_write", 707 + mfd_def_size, 708 + MFD_CLOEXEC | MFD_ALLOW_SEALING); 709 + 710 + p = mfd_assert_mmap_shared(fd); 711 + 712 + mfd_assert_has_seals(fd, 0); 713 + 714 + mfd_assert_add_seals(fd, F_SEAL_FUTURE_WRITE); 715 + mfd_assert_has_seals(fd, F_SEAL_FUTURE_WRITE); 716 + 717 + /* read should pass, writes should fail */ 718 + mfd_assert_read(fd); 719 + mfd_assert_read_shared(fd); 720 + mfd_fail_write(fd); 721 + 722 + fd2 = mfd_assert_reopen_fd(fd); 723 + /* read should pass, writes should still fail */ 724 + mfd_assert_read(fd2); 725 + mfd_assert_read_shared(fd2); 726 + mfd_fail_write(fd2); 727 + 728 + munmap(p, mfd_def_size); 729 + close(fd2); 730 + close(fd); 731 + } 732 + 733 + /* 731 734 * Test SEAL_SHRINK 732 735 * Test whether SEAL_SHRINK actually prevents shrinking 733 736 */ ··· 1018 945 test_basic(); 1019 946 1020 947 test_seal_write(); 948 + test_seal_future_write(); 1021 949 test_seal_shrink(); 1022 950 test_seal_grow(); 1023 951 test_seal_resize();
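
Note on the memfd test additions: they cover the new F_SEAL_FUTURE_WRITE seal, which blocks new write access while existing writable mappings keep working, plus re-opening the memfd via /proc/self/fd. A hedged userspace sketch of the seal's behaviour (the fallback define is only needed on pre-5.1 uapi headers; error handling omitted):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE	0x0010
#endif

int main(void)
{
	int fd = memfd_create("demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);
	void *w;

	ftruncate(fd, 4096);
	w = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE);

	/* 'w' stays writable, but new write access is refused: */
	write(fd, "x", 1);				/* fails with EPERM */
	mmap(NULL, 4096, PROT_WRITE, MAP_SHARED, fd, 0);/* MAP_FAILED, EPERM */
	return 0;
}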
+1
tools/testing/selftests/proc/.gitignore
··· 2 2 /fd-002-posix-eq 3 3 /fd-003-kthread 4 4 /proc-loadavg-001 5 + /proc-pid-vm 5 6 /proc-self-map-files-001 6 7 /proc-self-map-files-002 7 8 /proc-self-syscall
+1
tools/testing/selftests/proc/Makefile
··· 6 6 TEST_GEN_PROGS += fd-002-posix-eq 7 7 TEST_GEN_PROGS += fd-003-kthread 8 8 TEST_GEN_PROGS += proc-loadavg-001 9 + TEST_GEN_PROGS += proc-pid-vm 9 10 TEST_GEN_PROGS += proc-self-map-files-001 10 11 TEST_GEN_PROGS += proc-self-map-files-002 11 12 TEST_GEN_PROGS += proc-self-syscall
+1 -1
tools/testing/selftests/proc/proc-loadavg-001.c
··· 30 30 31 31 if (unshare(CLONE_NEWPID) == -1) { 32 32 if (errno == ENOSYS || errno == EPERM) 33 - return 2; 33 + return 4; 34 34 return 1; 35 35 } 36 36
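
Note on the exit-code changes in the proc selftests (here and in proc-self-map-files-002.c, proc-self-syscall.c and proc-self-wchan.c below): the tests now return 4, the kselftest framework's skip code, instead of 2 when the environment cannot support the test. A minimal hedged sketch of the convention:

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>

#define KSFT_SKIP 4	/* kselftest harness reports exit code 4 as SKIP */

int main(void)
{
	if (unshare(CLONE_NEWPID) == -1) {
		if (errno == ENOSYS || errno == EPERM)
			return KSFT_SKIP;	/* environment can't run the test */
		return 1;			/* real failure */
	}
	/* ... actual test body ... */
	return 0;
}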
+406
tools/testing/selftests/proc/proc-pid-vm.c
··· 1 + /* 2 + * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com> 3 + * 4 + * Permission to use, copy, modify, and distribute this software for any 5 + * purpose with or without fee is hereby granted, provided that the above 6 + * copyright notice and this permission notice appear in all copies. 7 + * 8 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 + */ 16 + /* 17 + * Fork and exec tiny 1 page executable which precisely controls its VM. 18 + * Test /proc/$PID/maps 19 + * Test /proc/$PID/smaps 20 + * Test /proc/$PID/smaps_rollup 21 + * Test /proc/$PID/statm 22 + * 23 + * FIXME require CONFIG_TMPFS which can be disabled 24 + * FIXME test other values from "smaps" 25 + * FIXME support other archs 26 + */ 27 + #undef NDEBUG 28 + #include <assert.h> 29 + #include <errno.h> 30 + #include <sched.h> 31 + #include <signal.h> 32 + #include <stdint.h> 33 + #include <stdio.h> 34 + #include <string.h> 35 + #include <stdlib.h> 36 + #include <sys/mount.h> 37 + #include <sys/types.h> 38 + #include <sys/stat.h> 39 + #include <fcntl.h> 40 + #include <unistd.h> 41 + #include <sys/syscall.h> 42 + #include <sys/uio.h> 43 + #include <linux/kdev_t.h> 44 + 45 + static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags) 46 + { 47 + return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags); 48 + } 49 + 50 + static void make_private_tmp(void) 51 + { 52 + if (unshare(CLONE_NEWNS) == -1) { 53 + if (errno == ENOSYS || errno == EPERM) { 54 + exit(4); 55 + } 56 + exit(1); 57 + } 58 + if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { 59 + exit(1); 60 + } 61 + if (mount(NULL, "/tmp", "tmpfs", 0, NULL) == -1) { 62 + exit(1); 63 + } 64 + } 65 + 66 + static pid_t pid = -1; 67 + static void ate(void) 68 + { 69 + if (pid > 0) { 70 + kill(pid, SIGTERM); 71 + } 72 + } 73 + 74 + struct elf64_hdr { 75 + uint8_t e_ident[16]; 76 + uint16_t e_type; 77 + uint16_t e_machine; 78 + uint32_t e_version; 79 + uint64_t e_entry; 80 + uint64_t e_phoff; 81 + uint64_t e_shoff; 82 + uint32_t e_flags; 83 + uint16_t e_ehsize; 84 + uint16_t e_phentsize; 85 + uint16_t e_phnum; 86 + uint16_t e_shentsize; 87 + uint16_t e_shnum; 88 + uint16_t e_shstrndx; 89 + }; 90 + 91 + struct elf64_phdr { 92 + uint32_t p_type; 93 + uint32_t p_flags; 94 + uint64_t p_offset; 95 + uint64_t p_vaddr; 96 + uint64_t p_paddr; 97 + uint64_t p_filesz; 98 + uint64_t p_memsz; 99 + uint64_t p_align; 100 + }; 101 + 102 + #ifdef __x86_64__ 103 + #define PAGE_SIZE 4096 104 + #define VADDR (1UL << 32) 105 + #define MAPS_OFFSET 73 106 + 107 + #define syscall 0x0f, 0x05 108 + #define mov_rdi(x) \ 109 + 0x48, 0xbf, \ 110 + (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \ 111 + ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff 112 + 113 + #define mov_rsi(x) \ 114 + 0x48, 0xbe, \ 115 + (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \ 116 + ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff 117 + 118 + #define mov_eax(x) \ 119 + 0xb8, (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff 120 + 121 
+ static const uint8_t payload[] = { 122 + /* Casually unmap stack, vDSO and everything else. */ 123 + /* munmap */ 124 + mov_rdi(VADDR + 4096), 125 + mov_rsi((1ULL << 47) - 4096 - VADDR - 4096), 126 + mov_eax(11), 127 + syscall, 128 + 129 + /* Ping parent. */ 130 + /* write(0, &c, 1); */ 131 + 0x31, 0xff, /* xor edi, edi */ 132 + 0x48, 0x8d, 0x35, 0x00, 0x00, 0x00, 0x00, /* lea rsi, [rip] */ 133 + 0xba, 0x01, 0x00, 0x00, 0x00, /* mov edx, 1 */ 134 + mov_eax(1), 135 + syscall, 136 + 137 + /* 1: pause(); */ 138 + mov_eax(34), 139 + syscall, 140 + 141 + 0xeb, 0xf7, /* jmp 1b */ 142 + }; 143 + 144 + static int make_exe(const uint8_t *payload, size_t len) 145 + { 146 + struct elf64_hdr h; 147 + struct elf64_phdr ph; 148 + 149 + struct iovec iov[3] = { 150 + {&h, sizeof(struct elf64_hdr)}, 151 + {&ph, sizeof(struct elf64_phdr)}, 152 + {(void *)payload, len}, 153 + }; 154 + int fd, fd1; 155 + char buf[64]; 156 + 157 + memset(&h, 0, sizeof(h)); 158 + h.e_ident[0] = 0x7f; 159 + h.e_ident[1] = 'E'; 160 + h.e_ident[2] = 'L'; 161 + h.e_ident[3] = 'F'; 162 + h.e_ident[4] = 2; 163 + h.e_ident[5] = 1; 164 + h.e_ident[6] = 1; 165 + h.e_ident[7] = 0; 166 + h.e_type = 2; 167 + h.e_machine = 0x3e; 168 + h.e_version = 1; 169 + h.e_entry = VADDR + sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr); 170 + h.e_phoff = sizeof(struct elf64_hdr); 171 + h.e_shoff = 0; 172 + h.e_flags = 0; 173 + h.e_ehsize = sizeof(struct elf64_hdr); 174 + h.e_phentsize = sizeof(struct elf64_phdr); 175 + h.e_phnum = 1; 176 + h.e_shentsize = 0; 177 + h.e_shnum = 0; 178 + h.e_shstrndx = 0; 179 + 180 + memset(&ph, 0, sizeof(ph)); 181 + ph.p_type = 1; 182 + ph.p_flags = (1<<2)|1; 183 + ph.p_offset = 0; 184 + ph.p_vaddr = VADDR; 185 + ph.p_paddr = 0; 186 + ph.p_filesz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload); 187 + ph.p_memsz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload); 188 + ph.p_align = 4096; 189 + 190 + fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_EXCL|O_TMPFILE, 0700); 191 + if (fd == -1) { 192 + exit(1); 193 + } 194 + 195 + if (writev(fd, iov, 3) != sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len) { 196 + exit(1); 197 + } 198 + 199 + /* Avoid ETXTBSY on exec. */ 200 + snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd); 201 + fd1 = open(buf, O_RDONLY|O_CLOEXEC); 202 + close(fd); 203 + 204 + return fd1; 205 + } 206 + #endif 207 + 208 + #ifdef __x86_64__ 209 + int main(void) 210 + { 211 + int pipefd[2]; 212 + int exec_fd; 213 + 214 + atexit(ate); 215 + 216 + make_private_tmp(); 217 + 218 + /* Reserve fd 0 for 1-byte pipe ping from child. 
*/ 219 + close(0); 220 + if (open("/", O_RDONLY|O_DIRECTORY|O_PATH) != 0) { 221 + return 1; 222 + } 223 + 224 + exec_fd = make_exe(payload, sizeof(payload)); 225 + 226 + if (pipe(pipefd) == -1) { 227 + return 1; 228 + } 229 + if (dup2(pipefd[1], 0) != 0) { 230 + return 1; 231 + } 232 + 233 + pid = fork(); 234 + if (pid == -1) { 235 + return 1; 236 + } 237 + if (pid == 0) { 238 + sys_execveat(exec_fd, "", NULL, NULL, AT_EMPTY_PATH); 239 + return 1; 240 + } 241 + 242 + char _; 243 + if (read(pipefd[0], &_, 1) != 1) { 244 + return 1; 245 + } 246 + 247 + struct stat st; 248 + if (fstat(exec_fd, &st) == -1) { 249 + return 1; 250 + } 251 + 252 + /* Generate "head -n1 /proc/$PID/maps" */ 253 + char buf0[256]; 254 + memset(buf0, ' ', sizeof(buf0)); 255 + int len = snprintf(buf0, sizeof(buf0), 256 + "%08lx-%08lx r-xp 00000000 %02lx:%02lx %llu", 257 + VADDR, VADDR + PAGE_SIZE, 258 + MAJOR(st.st_dev), MINOR(st.st_dev), 259 + (unsigned long long)st.st_ino); 260 + buf0[len] = ' '; 261 + snprintf(buf0 + MAPS_OFFSET, sizeof(buf0) - MAPS_OFFSET, 262 + "/tmp/#%llu (deleted)\n", (unsigned long long)st.st_ino); 263 + 264 + 265 + /* Test /proc/$PID/maps */ 266 + { 267 + char buf[256]; 268 + ssize_t rv; 269 + int fd; 270 + 271 + snprintf(buf, sizeof(buf), "/proc/%u/maps", pid); 272 + fd = open(buf, O_RDONLY); 273 + if (fd == -1) { 274 + return 1; 275 + } 276 + rv = read(fd, buf, sizeof(buf)); 277 + assert(rv == strlen(buf0)); 278 + assert(memcmp(buf, buf0, strlen(buf0)) == 0); 279 + } 280 + 281 + /* Test /proc/$PID/smaps */ 282 + { 283 + char buf[1024]; 284 + ssize_t rv; 285 + int fd; 286 + 287 + snprintf(buf, sizeof(buf), "/proc/%u/smaps", pid); 288 + fd = open(buf, O_RDONLY); 289 + if (fd == -1) { 290 + return 1; 291 + } 292 + rv = read(fd, buf, sizeof(buf)); 293 + assert(0 <= rv && rv <= sizeof(buf)); 294 + 295 + assert(rv >= strlen(buf0)); 296 + assert(memcmp(buf, buf0, strlen(buf0)) == 0); 297 + 298 + #define RSS1 "Rss: 4 kB\n" 299 + #define RSS2 "Rss: 0 kB\n" 300 + #define PSS1 "Pss: 4 kB\n" 301 + #define PSS2 "Pss: 0 kB\n" 302 + assert(memmem(buf, rv, RSS1, strlen(RSS1)) || 303 + memmem(buf, rv, RSS2, strlen(RSS2))); 304 + assert(memmem(buf, rv, PSS1, strlen(PSS1)) || 305 + memmem(buf, rv, PSS2, strlen(PSS2))); 306 + 307 + static const char *S[] = { 308 + "Size: 4 kB\n", 309 + "KernelPageSize: 4 kB\n", 310 + "MMUPageSize: 4 kB\n", 311 + "Anonymous: 0 kB\n", 312 + "AnonHugePages: 0 kB\n", 313 + "Shared_Hugetlb: 0 kB\n", 314 + "Private_Hugetlb: 0 kB\n", 315 + "Locked: 0 kB\n", 316 + }; 317 + int i; 318 + 319 + for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) { 320 + assert(memmem(buf, rv, S[i], strlen(S[i]))); 321 + } 322 + } 323 + 324 + /* Test /proc/$PID/smaps_rollup */ 325 + { 326 + char bufr[256]; 327 + memset(bufr, ' ', sizeof(bufr)); 328 + len = snprintf(bufr, sizeof(bufr), 329 + "%08lx-%08lx ---p 00000000 00:00 0", 330 + VADDR, VADDR + PAGE_SIZE); 331 + bufr[len] = ' '; 332 + snprintf(bufr + MAPS_OFFSET, sizeof(bufr) - MAPS_OFFSET, 333 + "[rollup]\n"); 334 + 335 + char buf[1024]; 336 + ssize_t rv; 337 + int fd; 338 + 339 + snprintf(buf, sizeof(buf), "/proc/%u/smaps_rollup", pid); 340 + fd = open(buf, O_RDONLY); 341 + if (fd == -1) { 342 + return 1; 343 + } 344 + rv = read(fd, buf, sizeof(buf)); 345 + assert(0 <= rv && rv <= sizeof(buf)); 346 + 347 + assert(rv >= strlen(bufr)); 348 + assert(memcmp(buf, bufr, strlen(bufr)) == 0); 349 + 350 + assert(memmem(buf, rv, RSS1, strlen(RSS1)) || 351 + memmem(buf, rv, RSS2, strlen(RSS2))); 352 + assert(memmem(buf, rv, PSS1, strlen(PSS1)) || 353 + memmem(buf, rv, 
PSS2, strlen(PSS2))); 354 + 355 + static const char *S[] = { 356 + "Anonymous: 0 kB\n", 357 + "AnonHugePages: 0 kB\n", 358 + "Shared_Hugetlb: 0 kB\n", 359 + "Private_Hugetlb: 0 kB\n", 360 + "Locked: 0 kB\n", 361 + }; 362 + int i; 363 + 364 + for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) { 365 + assert(memmem(buf, rv, S[i], strlen(S[i]))); 366 + } 367 + } 368 + 369 + /* Test /proc/$PID/statm */ 370 + { 371 + char buf[64]; 372 + ssize_t rv; 373 + int fd; 374 + 375 + snprintf(buf, sizeof(buf), "/proc/%u/statm", pid); 376 + fd = open(buf, O_RDONLY); 377 + if (fd == -1) { 378 + return 1; 379 + } 380 + rv = read(fd, buf, sizeof(buf)); 381 + assert(rv == 7 * 2); 382 + 383 + assert(buf[0] == '1'); /* ->total_vm */ 384 + assert(buf[1] == ' '); 385 + assert(buf[2] == '0' || buf[2] == '1'); /* rss */ 386 + assert(buf[3] == ' '); 387 + assert(buf[4] == '0' || buf[2] == '1'); /* file rss */ 388 + assert(buf[5] == ' '); 389 + assert(buf[6] == '1'); /* ELF executable segments */ 390 + assert(buf[7] == ' '); 391 + assert(buf[8] == '0'); 392 + assert(buf[9] == ' '); 393 + assert(buf[10] == '0'); /* ->data_vm + ->stack_vm */ 394 + assert(buf[11] == ' '); 395 + assert(buf[12] == '0'); 396 + assert(buf[13] == '\n'); 397 + } 398 + 399 + return 0; 400 + } 401 + #else 402 + int main(void) 403 + { 404 + return 4; 405 + } 406 + #endif
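
Note on proc-pid-vm.c: the test execs a single-page executable at a fixed address and compares /proc/$PID/maps, smaps, smaps_rollup and statm against strings it builds itself from the file's device and inode. For orientation, the first maps line it expects looks roughly like this (device, inode and column padding are illustrative only):

	100000000-100001000 r-xp 00000000 00:2e 42	/tmp/#42 (deleted)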
+1 -1
tools/testing/selftests/proc/proc-self-map-files-002.c
··· 63 63 p = mmap((void *)va, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0); 64 64 if (p == MAP_FAILED) { 65 65 if (errno == EPERM) 66 - return 2; 66 + return 4; 67 67 return 1; 68 68 } 69 69
+1 -2
tools/testing/selftests/proc/proc-self-syscall.c
··· 20 20 #include <sys/stat.h> 21 21 #include <fcntl.h> 22 22 #include <errno.h> 23 - #include <unistd.h> 24 23 #include <string.h> 25 24 #include <stdio.h> 26 25 ··· 38 39 fd = open("/proc/self/syscall", O_RDONLY); 39 40 if (fd == -1) { 40 41 if (errno == ENOENT) 41 - return 2; 42 + return 4; 42 43 return 1; 43 44 } 44 45
+1 -1
tools/testing/selftests/proc/proc-self-wchan.c
··· 27 27 fd = open("/proc/self/wchan", O_RDONLY); 28 28 if (fd == -1) { 29 29 if (errno == ENOENT) 30 - return 2; 30 + return 4; 31 31 return 1; 32 32 } 33 33
+14
tools/testing/selftests/proc/read.c
··· 26 26 #include <dirent.h> 27 27 #include <stdbool.h> 28 28 #include <stdlib.h> 29 + #include <stdio.h> 29 30 #include <string.h> 30 31 #include <sys/stat.h> 32 + #include <sys/vfs.h> 31 33 #include <fcntl.h> 32 34 #include <unistd.h> 33 35 ··· 125 123 int main(void) 126 124 { 127 125 DIR *d; 126 + struct statfs sfs; 128 127 129 128 d = opendir("/proc"); 130 129 if (!d) 130 + return 4; 131 + 132 + /* Ensure /proc is proc. */ 133 + if (fstatfs(dirfd(d), &sfs) == -1) { 134 + return 1; 135 + } 136 + if (sfs.f_type != 0x9fa0) { 137 + fprintf(stderr, "error: unexpected f_type %lx\n", (long)sfs.f_type); 131 138 return 2; 139 + } 140 + 132 141 f(d, 0); 142 + 133 143 return 0; 134 144 }
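
Note on the proc/read.c change: besides returning the skip code when /proc cannot be opened, the test now verifies via fstatfs() that the directory really is procfs before walking it. A hedged standalone sketch of the same check (PROC_SUPER_MAGIC is 0x9fa0 in <linux/magic.h>):

#include <sys/vfs.h>
#include <linux/magic.h>
#include <dirent.h>
#include <stdio.h>

int main(void)
{
	DIR *d = opendir("/proc");
	struct statfs sfs;

	if (!d)
		return 4;			/* skip: no /proc here */
	if (fstatfs(dirfd(d), &sfs) == -1 || sfs.f_type != PROC_SUPER_MAGIC) {
		fprintf(stderr, "/proc is not procfs\n");
		return 2;
	}
	return 0;
}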
+1
tools/testing/selftests/tmpfs/.gitignore
··· 1 + /bug-link-o-tmpfile
+7
tools/testing/selftests/tmpfs/Makefile
··· 1 + CFLAGS += -Wall -O2 2 + CFLAGS += -D_GNU_SOURCE 3 + 4 + TEST_GEN_PROGS := 5 + TEST_GEN_PROGS += bug-link-o-tmpfile 6 + 7 + include ../lib.mk
+16
tools/testing/selftests/vm/run_vmtests
··· 211 211 echo "[PASS]" 212 212 fi 213 213 214 + echo "------------------------------------" 215 + echo "running vmalloc stability smoke test" 216 + echo "------------------------------------" 217 + ./test_vmalloc.sh smoke 218 + ret_val=$? 219 + 220 + if [ $ret_val -eq 0 ]; then 221 + echo "[PASS]" 222 + elif [ $ret_val -eq $ksft_skip ]; then 223 + echo "[SKIP]" 224 + exitcode=$ksft_skip 225 + else 226 + echo "[FAIL]" 227 + exitcode=1 228 + fi 229 + 214 230 exit $exitcode
+176
tools/testing/selftests/vm/test_vmalloc.sh
··· 1 + #!/bin/bash 2 + # SPDX-License-Identifier: GPL-2.0 3 + # 4 + # Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com> 5 + # 6 + # This is a test script for the kernel test driver to analyse vmalloc 7 + # allocator. Therefore it is just a kernel module loader. You can specify 8 + # and pass different parameters in order to: 9 + # a) analyse performance of vmalloc allocations; 10 + # b) stressing and stability check of vmalloc subsystem. 11 + 12 + TEST_NAME="vmalloc" 13 + DRIVER="test_${TEST_NAME}" 14 + 15 + # 1 if fails 16 + exitcode=1 17 + 18 + # Kselftest framework requirement - SKIP code is 4. 19 + ksft_skip=4 20 + 21 + # 22 + # Static templates for performance, stressing and smoke tests. 23 + # Also it is possible to pass any supported parameters manualy. 24 + # 25 + PERF_PARAM="single_cpu_test=1 sequential_test_order=1 test_repeat_count=3" 26 + SMOKE_PARAM="single_cpu_test=1 test_loop_count=10000 test_repeat_count=10" 27 + STRESS_PARAM="test_repeat_count=20" 28 + 29 + check_test_requirements() 30 + { 31 + uid=$(id -u) 32 + if [ $uid -ne 0 ]; then 33 + echo "$0: Must be run as root" 34 + exit $ksft_skip 35 + fi 36 + 37 + if ! which modprobe > /dev/null 2>&1; then 38 + echo "$0: You need modprobe installed" 39 + exit $ksft_skip 40 + fi 41 + 42 + if ! modinfo $DRIVER > /dev/null 2>&1; then 43 + echo "$0: You must have the following enabled in your kernel:" 44 + echo "CONFIG_TEST_VMALLOC=m" 45 + exit $ksft_skip 46 + fi 47 + } 48 + 49 + run_perfformance_check() 50 + { 51 + echo "Run performance tests to evaluate how fast vmalloc allocation is." 52 + echo "It runs all test cases on one single CPU with sequential order." 53 + 54 + modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1 55 + echo "Done." 56 + echo "Ccheck the kernel message buffer to see the summary." 57 + } 58 + 59 + run_stability_check() 60 + { 61 + echo "Run stability tests. In order to stress vmalloc subsystem we run" 62 + echo "all available test cases on all available CPUs simultaneously." 63 + echo "It will take time, so be patient." 64 + 65 + modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 66 + echo "Done." 67 + echo "Check the kernel ring buffer to see the summary." 68 + } 69 + 70 + run_smoke_check() 71 + { 72 + echo "Run smoke test. Note, this test provides basic coverage." 73 + echo "Please check $0 output how it can be used" 74 + echo "for deep performance analysis as well as stress testing." 75 + 76 + modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1 77 + echo "Done." 78 + echo "Check the kernel ring buffer to see the summary." 
79 + } 80 + 81 + usage() 82 + { 83 + echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | " 84 + echo "manual parameters" 85 + echo 86 + echo "Valid tests and parameters:" 87 + echo 88 + modinfo $DRIVER 89 + echo 90 + echo "Example usage:" 91 + echo 92 + echo "# Shows help message" 93 + echo "./${DRIVER}.sh" 94 + echo 95 + echo "# Runs 1 test(id_1), repeats it 5 times on all online CPUs" 96 + echo "./${DRIVER}.sh run_test_mask=1 test_repeat_count=5" 97 + echo 98 + echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with " 99 + echo "sequential order" 100 + echo -n "./${DRIVER}.sh single_cpu_test=1 sequential_test_order=1 " 101 + echo "run_test_mask=23" 102 + echo 103 + echo -n "# Runs all tests on all online CPUs, shuffled order, repeats " 104 + echo "20 times" 105 + echo "./${DRIVER}.sh test_repeat_count=20" 106 + echo 107 + echo "# Performance analysis" 108 + echo "./${DRIVER}.sh performance" 109 + echo 110 + echo "# Stress testing" 111 + echo "./${DRIVER}.sh stress" 112 + echo 113 + exit 0 114 + } 115 + 116 + function validate_passed_args() 117 + { 118 + VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'` 119 + 120 + # 121 + # Something has been passed, check it. 122 + # 123 + for passed_arg in $@; do 124 + key=${passed_arg//=*/} 125 + val="${passed_arg:$((${#key}+1))}" 126 + valid=0 127 + 128 + for valid_arg in $VALID_ARGS; do 129 + if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then 130 + valid=1 131 + break 132 + fi 133 + done 134 + 135 + if [[ $valid -ne 1 ]]; then 136 + echo "Error: key or value is not correct: ${key} $val" 137 + exit $exitcode 138 + fi 139 + done 140 + } 141 + 142 + function run_manual_check() 143 + { 144 + # 145 + # Validate passed parameters. If there is wrong one, 146 + # the script exists and does not execute further. 147 + # 148 + validate_passed_args $@ 149 + 150 + echo "Run the test with following parameters: $@" 151 + modprobe $DRIVER $@ > /dev/null 2>&1 152 + echo "Done." 153 + echo "Check the kernel ring buffer to see the summary." 154 + } 155 + 156 + function run_test() 157 + { 158 + if [ $# -eq 0 ]; then 159 + usage 160 + else 161 + if [[ "$1" = "performance" ]]; then 162 + run_perfformance_check 163 + elif [[ "$1" = "stress" ]]; then 164 + run_stability_check 165 + elif [[ "$1" = "smoke" ]]; then 166 + run_smoke_check 167 + else 168 + run_manual_check $@ 169 + fi 170 + fi 171 + } 172 + 173 + check_test_requirements 174 + run_test $@ 175 + 176 + exit 0
+1 -1
tools/vm/page-types.c
··· 133 133 [KPF_NOPAGE] = "n:nopage", 134 134 [KPF_KSM] = "x:ksm", 135 135 [KPF_THP] = "t:thp", 136 - [KPF_BALLOON] = "o:balloon", 136 + [KPF_OFFLINE] = "o:offline", 137 137 [KPF_PGTABLE] = "g:pgtable", 138 138 [KPF_ZERO_PAGE] = "z:zero_page", 139 139 [KPF_IDLE] = "i:idle_page",
+19 -16
tools/vm/slabinfo.c
··· 110 110 static void usage(void) 111 111 { 112 112 printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n" 113 - "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" 113 + "slabinfo [-aADefhilnosrStTvz1LXBU] [N=K] [-dafzput] [slab-regexp]\n" 114 114 "-a|--aliases Show aliases\n" 115 115 "-A|--activity Most active slabs first\n" 116 - "-d<options>|--debug=<options> Set/Clear Debug options\n" 116 + "-B|--Bytes Show size in bytes\n" 117 117 "-D|--display-active Switch line format to activity\n" 118 118 "-e|--empty Show empty slabs\n" 119 119 "-f|--first-alias Show first alias\n" 120 120 "-h|--help Show usage information\n" 121 121 "-i|--inverted Inverted list\n" 122 122 "-l|--slabs Show slabs\n" 123 + "-L|--Loss Sort by loss\n" 123 124 "-n|--numa Show NUMA information\n" 124 - "-o|--ops Show kmem_cache_ops\n" 125 + "-N|--lines=K Show the first K slabs\n" 126 + "-o|--ops Show kmem_cache_ops\n" 127 + "-r|--report Detailed report on single slabs\n" 125 128 "-s|--shrink Shrink slabs\n" 126 - "-r|--report Detailed report on single slabs\n" 127 129 "-S|--Size Sort by size\n" 128 130 "-t|--tracking Show alloc/free information\n" 129 131 "-T|--Totals Show summary information\n" 132 + "-U|--Unreclaim Show unreclaimable slabs only\n" 130 133 "-v|--validate Validate slabs\n" 131 134 "-z|--zero Include empty slabs\n" 132 135 "-1|--1ref Single reference\n" 133 - "-N|--lines=K Show the first K slabs\n" 134 - "-L|--Loss Sort by loss\n" 135 136 "-X|--Xtotals Show extended summary information\n" 136 - "-B|--Bytes Show size in bytes\n" 137 - "-U|--Unreclaim Show unreclaimable slabs only\n" 138 - "\nValid debug options (FZPUT may be combined)\n" 139 - "a / A Switch on all debug options (=FZUP)\n" 140 - "- Switch off all debug options\n" 141 - "f / F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" 142 - "z / Z Redzoning\n" 143 - "p / P Poisoning\n" 144 - "u / U Tracking\n" 145 - "t / T Tracing\n" 137 + 138 + "\n" 139 + "-d | --debug Switch off all debug options\n" 140 + "-da | --debug=a Switch on all debug options (--debug=FZPU)\n" 141 + 142 + "\n" 143 + "-d[afzput] | --debug=[afzput]\n" 144 + " f | F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" 145 + " z | Z Redzoning\n" 146 + " p | P Poisoning\n" 147 + " u | U Tracking\n" 148 + " t | T Tracing\n" 146 149 ); 147 150 } 148 151