Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'akpm' (patches from Andrew)

Merge misc updates from Andrew Morton:

- a few misc things and hotfixes

- ocfs2

- almost all of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (139 commits)
kernel/memremap.c: remove the unused device_private_entry_fault() export
mm: delete find_get_entries_tag
mm/huge_memory.c: make __thp_get_unmapped_area static
mm/mprotect.c: fix compilation warning because of unused 'mm' variable
mm/page-writeback: introduce tracepoint for wait_on_page_writeback()
mm/vmscan: simplify trace_reclaim_flags and trace_shrink_flags
mm/Kconfig: update "Memory Model" help text
mm/vmscan.c: don't disable irq again when count pgrefill for memcg
mm: memblock: make keeping memblock memory opt-in rather than opt-out
hugetlbfs: always use address space in inode for resv_map pointer
mm/z3fold.c: support page migration
mm/z3fold.c: add structure for buddy handles
mm/z3fold.c: improve compression by extending search
mm/z3fold.c: introduce helper functions
mm/page_alloc.c: remove unnecessary parameter in rmqueue_pcplist
mm/hmm: add ARCH_HAS_HMM_MIRROR ARCH_HAS_HMM_DEVICE Kconfig
mm/vmscan.c: simplify shrink_inactive_list()
fs/sync.c: sync_file_range(2) may use WB_SYNC_ALL writeback
xen/privcmd-buf.c: convert to use vm_map_pages_zero()
xen/gntdev.c: convert to use vm_map_pages()
...

+3683 -2369
+12
Documentation/sysctl/vm.txt
···
  - stat_refresh
  - numa_stat
  - swappiness
+ - unprivileged_userfaultfd
  - user_reserve_kbytes
  - vfs_cache_pressure
  - watermark_boost_factor
···
  than the high water mark in a zone.

  The default value is 60.
+
+ ==============================================================
+
+ unprivileged_userfaultfd
+
+ This flag controls whether unprivileged users can use the userfaultfd
+ system calls. Set this to 1 to allow unprivileged users to use the
+ userfaultfd system calls, or set this to 0 to restrict userfaultfd to only
+ privileged users (with SYS_CAP_PTRACE capability).
+
+ The default value is 1.

  ==============================================================
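As a quick illustration of the new knob (not part of this patch series), the following minimal userspace sketch reads vm.unprivileged_userfaultfd through procfs before an unprivileged process attempts to use userfaultfd(2); the program itself is hypothetical.

/*
 * Minimal userspace sketch: report whether unprivileged userfaultfd is
 * currently allowed on this kernel.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/unprivileged_userfaultfd", "r");
	int allowed = -1;

	if (!f) {
		/* Kernel predates the knob, or /proc is not mounted. */
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d", &allowed) != 1)
		allowed = -1;
	fclose(f);

	printf("unprivileged userfaultfd is %s\n",
	       allowed == 1 ? "allowed" : "restricted to privileged users");
	return 0;
}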
+4 -3
Documentation/trace/postprocess/trace-vmscan-postprocess.pl
···
  my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
  my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)';
  my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
- my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
+ my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate_anon=([0-9]*) nr_activate_file=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
  my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
  my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)';
···
  	"vmscan/mm_vmscan_lru_shrink_inactive",
  	$regex_lru_shrink_inactive_default,
  	"nid", "nr_scanned", "nr_reclaimed", "nr_dirty", "nr_writeback",
- 	"nr_congested", "nr_immediate", "nr_activate", "nr_ref_keep",
+ 	"nr_congested", "nr_immediate", "nr_activate_anon",
+ 	"nr_activate_file", "nr_ref_keep",
  	"nr_unmap_fail", "priority", "flags");
  $regex_lru_shrink_active = generate_traceevent_regex(
  	"vmscan/mm_vmscan_lru_shrink_active",
···
  }

  my $nr_reclaimed = $3;
- my $flags = $12;
+ my $flags = $13;
  my $file = 0;
  if ($flags =~ /RECLAIM_WB_FILE/) {
  	$file = 1;
+74 -18
Documentation/vm/hmm.rst
···
  When the device driver wants to populate a range of virtual addresses, it can
  use either::

-   int hmm_vma_get_pfns(struct vm_area_struct *vma,
-                        struct hmm_range *range,
-                        unsigned long start,
-                        unsigned long end,
-                        hmm_pfn_t *pfns);
-   int hmm_vma_fault(struct vm_area_struct *vma,
-                     struct hmm_range *range,
-                     unsigned long start,
-                     unsigned long end,
-                     hmm_pfn_t *pfns,
-                     bool write,
-                     bool block);
+   long hmm_range_snapshot(struct hmm_range *range);
+   long hmm_range_fault(struct hmm_range *range, bool block);

  The first one (hmm_range_snapshot()) will only fetch present CPU page table
  entries and will not trigger a page fault on missing or non-present entries.
  The second one does trigger a page fault on missing or read-only entry if the
  write parameter is true. Page faults use the generic mm page fault code path
···
  {
      struct hmm_range range;
      ...
+
+     range.start = ...;
+     range.end = ...;
+     range.pfns = ...;
+     range.flags = ...;
+     range.values = ...;
+     range.pfn_shift = ...;
+     hmm_range_register(&range);
+
+     /*
+      * Just wait for range to be valid, safe to ignore return value as we
+      * will use the return value of hmm_range_snapshot() below under the
+      * mmap_sem to ascertain the validity of the range.
+      */
+     hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
+
  again:
-     ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
-     if (ret)
+     down_read(&mm->mmap_sem);
+     ret = hmm_range_snapshot(&range);
+     if (ret) {
+         up_read(&mm->mmap_sem);
+         if (ret == -EAGAIN) {
+             /*
+              * No need to check hmm_range_wait_until_valid() return value
+              * on retry we will get proper error with hmm_range_snapshot()
+              */
+             hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
+             goto again;
+         }
+         hmm_mirror_unregister(&range);
          return ret;
+     }
      take_lock(driver->update);
-     if (!hmm_vma_range_done(vma, &range)) {
+     if (!range.valid) {
          release_lock(driver->update);
+         up_read(&mm->mmap_sem);
          goto again;
      }

      // Use pfns array content to update device page table

+     hmm_mirror_unregister(&range);
      release_lock(driver->update);
+     up_read(&mm->mmap_sem);
      return 0;
  }

  The driver->update lock is the same lock that the driver takes inside its
- update() callback. That lock must be held before hmm_vma_range_done() to avoid
- any race with a concurrent CPU page table update.
+ update() callback. That lock must be held before checking the range.valid
+ field to avoid any race with a concurrent CPU page table update.

  HMM implements all this on top of the mmu_notifier API because we wanted a
  simpler API and also to be able to perform optimizations latter on like doing
···
  buffer can happen concurrently for multiple devices. Waiting for each device to
  report commands as executed is serialized (there is no point in doing this
  concurrently).
+
+
+ Leverage default_flags and pfn_flags_mask
+ =========================================
+
+ The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows
+ to set fault or snapshot policy for a whole range instead of having to set them
+ for each entries in the range.
+
+ For instance if the device flags for device entries are:
+     VALID (1 << 63)
+     WRITE (1 << 62)
+
+ Now let say that device driver wants to fault with at least read a range then
+ it does set:
+     range->default_flags = (1 << 63)
+     range->pfn_flags_mask = 0;
+
+ and calls hmm_range_fault() as described above. This will fill fault all page
+ in the range with at least read permission.
+
+ Now let say driver wants to do the same except for one page in the range for
+ which its want to have write. Now driver set:
+     range->default_flags = (1 << 63);
+     range->pfn_flags_mask = (1 << 62);
+     range->pfns[index_of_write] = (1 << 62);
+
+ With this HMM will fault in all page with at least read (ie valid) and for the
+ address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
+ write permission ie if the CPU pte does not have write permission set then HMM
+ will call handle_mm_fault().
+
+ Note that HMM will populate the pfns array with write permission for any entry
+ that have write permission within the CPU pte no matter what are the values set
+ in default_flags or pfn_flags_mask.


  Represent and manage device memory from core kernel point of view
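The default_flags / pfn_flags_mask fragments above can be read as one driver-side sketch. This is illustrative only: it assumes the example flag encoding from the text (bit 63 = VALID, bit 62 = WRITE) and the hmm_range_fault() prototype introduced by this series; the helper name fault_range_mostly_read() is invented for the example.

#include <linux/hmm.h>

/*
 * Fault a registered range so every page is at least readable, and
 * additionally request write permission for one entry, following the
 * example flag layout described in the documentation above.
 */
static long fault_range_mostly_read(struct hmm_range *range,
				    unsigned long index_of_write)
{
	/* Whole-range policy: every entry must be at least valid (readable). */
	range->default_flags = 1UL << 63;
	/* Allow per-entry pfns[] bits to request extra permissions. */
	range->pfn_flags_mask = 1UL << 62;
	/* Ask for write permission on exactly one page of the range. */
	range->pfns[index_of_write] = 1UL << 62;

	/* Caller holds mmap_sem, as in the snapshot example earlier on. */
	return hmm_range_fault(range, true);
}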
+1
MAINTAINERS
··· 11746 11746 ORACLE CLUSTER FILESYSTEM 2 (OCFS2) 11747 11747 M: Mark Fasheh <mark@fasheh.com> 11748 11748 M: Joel Becker <jlbec@evilplan.org> 11749 + M: Joseph Qi <joseph.qi@linux.alibaba.com> 11749 11750 L: ocfs2-devel@oss.oracle.com (moderated for non-subscribers) 11750 11751 W: http://ocfs2.wiki.kernel.org 11751 11752 S: Supported
+7
arch/Kconfig
··· 245 245 An architecture should select this when it can successfully 246 246 build and run with CONFIG_FORTIFY_SOURCE. 247 247 248 + # 249 + # Select if the arch provides a historic keepinit alias for the retain_initrd 250 + # command line option 251 + # 252 + config ARCH_HAS_KEEPINITRD 253 + bool 254 + 248 255 # Select if arch has all set_memory_ro/rw/x/nx() functions in asm/cacheflush.h 249 256 config ARCH_HAS_SET_MEMORY 250 257 bool
-14
arch/alpha/mm/init.c
··· 285 285 memblock_free_all(); 286 286 mem_init_print_info(NULL); 287 287 } 288 - 289 - void 290 - free_initmem(void) 291 - { 292 - free_initmem_default(-1); 293 - } 294 - 295 - #ifdef CONFIG_BLK_DEV_INITRD 296 - void 297 - free_initrd_mem(unsigned long start, unsigned long end) 298 - { 299 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 300 - } 301 - #endif
-15
arch/arc/mm/init.c
··· 206 206 memblock_free_all(); 207 207 mem_init_print_info(NULL); 208 208 } 209 - 210 - /* 211 - * free_initmem: Free all the __init memory. 212 - */ 213 - void __ref free_initmem(void) 214 - { 215 - free_initmem_default(-1); 216 - } 217 - 218 - #ifdef CONFIG_BLK_DEV_INITRD 219 - void __init free_initrd_mem(unsigned long start, unsigned long end) 220 - { 221 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 222 - } 223 - #endif
+2 -1
arch/arm/Kconfig
··· 4 4 default y 5 5 select ARCH_32BIT_OFF_T 6 6 select ARCH_CLOCKSOURCE_DATA 7 - select ARCH_DISCARD_MEMBLOCK if !HAVE_ARCH_PFN_VALID && !KEXEC 8 7 select ARCH_HAS_DEBUG_VIRTUAL if MMU 9 8 select ARCH_HAS_DEVMEM_IS_ALLOWED 10 9 select ARCH_HAS_ELF_RANDOMIZE 11 10 select ARCH_HAS_FORTIFY_SOURCE 11 + select ARCH_HAS_KEEPINITRD 12 12 select ARCH_HAS_KCOV 13 13 select ARCH_HAS_MEMBARRIER_SYNC_CORE 14 14 select ARCH_HAS_PTE_SPECIAL if ARM_LPAE ··· 21 21 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST 22 22 select ARCH_HAVE_CUSTOM_GPIO_H 23 23 select ARCH_HAS_GCOV_PROFILE_ALL 24 + select ARCH_KEEP_MEMBLOCK if HAVE_ARCH_PFN_VALID || KEXEC 24 25 select ARCH_MIGHT_HAVE_PC_PARPORT 25 26 select ARCH_NO_SG_CHAIN if !ARM_HAS_SG_CHAIN 26 27 select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
+6 -16
arch/arm/mm/dma-mapping.c
··· 1577 1577 void *cpu_addr, dma_addr_t dma_addr, size_t size, 1578 1578 unsigned long attrs) 1579 1579 { 1580 - unsigned long uaddr = vma->vm_start; 1581 - unsigned long usize = vma->vm_end - vma->vm_start; 1582 1580 struct page **pages = __iommu_get_pages(cpu_addr, attrs); 1583 1581 unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; 1584 - unsigned long off = vma->vm_pgoff; 1582 + int err; 1585 1583 1586 1584 if (!pages) 1587 1585 return -ENXIO; 1588 1586 1589 - if (off >= nr_pages || (usize >> PAGE_SHIFT) > nr_pages - off) 1587 + if (vma->vm_pgoff >= nr_pages) 1590 1588 return -ENXIO; 1591 1589 1592 - pages += off; 1590 + err = vm_map_pages(vma, pages, nr_pages); 1591 + if (err) 1592 + pr_err("Remapping memory failed: %d\n", err); 1593 1593 1594 - do { 1595 - int ret = vm_insert_page(vma, uaddr, *pages++); 1596 - if (ret) { 1597 - pr_err("Remapping memory failed: %d\n", ret); 1598 - return ret; 1599 - } 1600 - uaddr += PAGE_SIZE; 1601 - usize -= PAGE_SIZE; 1602 - } while (usize > 0); 1603 - 1604 - return 0; 1594 + return err; 1605 1595 } 1606 1596 static int arm_iommu_mmap_attrs(struct device *dev, 1607 1597 struct vm_area_struct *vma, void *cpu_addr,
+6 -19
arch/arm/mm/init.c
··· 695 695 } 696 696 697 697 #ifdef CONFIG_BLK_DEV_INITRD 698 - 699 - static int keep_initrd; 700 - 701 698 void free_initrd_mem(unsigned long start, unsigned long end) 702 699 { 703 - if (!keep_initrd) { 704 - if (start == initrd_start) 705 - start = round_down(start, PAGE_SIZE); 706 - if (end == initrd_end) 707 - end = round_up(end, PAGE_SIZE); 700 + if (start == initrd_start) 701 + start = round_down(start, PAGE_SIZE); 702 + if (end == initrd_end) 703 + end = round_up(end, PAGE_SIZE); 708 704 709 - poison_init_mem((void *)start, PAGE_ALIGN(end) - start); 710 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 711 - } 705 + poison_init_mem((void *)start, PAGE_ALIGN(end) - start); 706 + free_reserved_area((void *)start, (void *)end, -1, "initrd"); 712 707 } 713 - 714 - static int __init keepinitrd_setup(char *__unused) 715 - { 716 - keep_initrd = 1; 717 - return 1; 718 - } 719 - 720 - __setup("keepinitrd", keepinitrd_setup); 721 708 #endif
+3 -1
arch/arm64/Kconfig
··· 19 19 select ARCH_HAS_FAST_MULTIPLIER 20 20 select ARCH_HAS_FORTIFY_SOURCE 21 21 select ARCH_HAS_GCOV_PROFILE_ALL 22 - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA 22 + select ARCH_HAS_GIGANTIC_PAGE 23 23 select ARCH_HAS_KCOV 24 + select ARCH_HAS_KEEPINITRD 24 25 select ARCH_HAS_MEMBARRIER_SYNC_CORE 25 26 select ARCH_HAS_PTE_SPECIAL 26 27 select ARCH_HAS_SETUP_DMA_OPS ··· 60 59 select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPT 61 60 select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPT 62 61 select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPT 62 + select ARCH_KEEP_MEMBLOCK 63 63 select ARCH_USE_CMPXCHG_LOCKREF 64 64 select ARCH_USE_QUEUED_RWLOCKS 65 65 select ARCH_USE_QUEUED_SPINLOCKS
-4
arch/arm64/include/asm/hugetlb.h
··· 70 70 71 71 #include <asm-generic/hugetlb.h> 72 72 73 - #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE 74 - static inline bool gigantic_page_supported(void) { return true; } 75 - #endif 76 - 77 73 #endif /* __ASM_HUGETLB_H */
+2 -15
arch/arm64/mm/init.c
··· 578 578 } 579 579 580 580 #ifdef CONFIG_BLK_DEV_INITRD 581 - 582 - static int keep_initrd __initdata; 583 - 584 581 void __init free_initrd_mem(unsigned long start, unsigned long end) 585 582 { 586 - if (!keep_initrd) { 587 - free_reserved_area((void *)start, (void *)end, 0, "initrd"); 588 - memblock_free(__virt_to_phys(start), end - start); 589 - } 583 + free_reserved_area((void *)start, (void *)end, 0, "initrd"); 584 + memblock_free(__virt_to_phys(start), end - start); 590 585 } 591 - 592 - static int __init keepinitrd_setup(char *__unused) 593 - { 594 - keep_initrd = 1; 595 - return 1; 596 - } 597 - 598 - __setup("keepinitrd", keepinitrd_setup); 599 586 #endif 600 587 601 588 /*
+3 -3
arch/arm64/mm/mmu.c
··· 1065 1065 } 1066 1066 1067 1067 #ifdef CONFIG_MEMORY_HOTPLUG 1068 - int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, 1069 - bool want_memblock) 1068 + int arch_add_memory(int nid, u64 start, u64 size, 1069 + struct mhp_restrictions *restrictions) 1070 1070 { 1071 1071 int flags = 0; 1072 1072 ··· 1077 1077 size, PAGE_KERNEL, __pgd_pgtable_alloc, flags); 1078 1078 1079 1079 return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, 1080 - altmap, want_memblock); 1080 + restrictions); 1081 1081 } 1082 1082 #endif
-12
arch/c6x/mm/init.c
··· 68 68 69 69 mem_init_print_info(NULL); 70 70 } 71 - 72 - #ifdef CONFIG_BLK_DEV_INITRD 73 - void __init free_initrd_mem(unsigned long start, unsigned long end) 74 - { 75 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 76 - } 77 - #endif 78 - 79 - void __init free_initmem(void) 80 - { 81 - free_initmem_default(-1); 82 - }
-14
arch/h8300/mm/init.c
··· 102 102 103 103 mem_init_print_info(NULL); 104 104 } 105 - 106 - 107 - #ifdef CONFIG_BLK_DEV_INITRD 108 - void free_initrd_mem(unsigned long start, unsigned long end) 109 - { 110 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 111 - } 112 - #endif 113 - 114 - void 115 - free_initmem(void) 116 - { 117 - free_initmem_default(-1); 118 - }
-1
arch/hexagon/Kconfig
··· 22 22 select GENERIC_IRQ_SHOW 23 23 select HAVE_ARCH_KGDB 24 24 select HAVE_ARCH_TRACEHOOK 25 - select ARCH_DISCARD_MEMBLOCK 26 25 select NEED_SG_DMA_LENGTH 27 26 select NO_IOPORT_MAP 28 27 select GENERIC_IOMAP
-10
arch/hexagon/mm/init.c
··· 85 85 } 86 86 87 87 /* 88 - * free_initmem - frees memory used by stuff declared with __init 89 - * 90 - * Todo: free pages between __init_begin and __init_end; possibly 91 - * some devtree related stuff as well. 92 - */ 93 - void __ref free_initmem(void) 94 - { 95 - } 96 - 97 - /* 98 88 * free_initrd_mem - frees... initrd memory. 99 89 * @start - start of init memory 100 90 * @end - end of init memory
-1
arch/ia64/Kconfig
··· 33 33 select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB 34 34 select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB 35 35 select VIRT_TO_BUS 36 - select ARCH_DISCARD_MEMBLOCK 37 36 select GENERIC_IRQ_PROBE 38 37 select GENERIC_PENDING_IRQ if SMP 39 38 select GENERIC_IRQ_SHOW
+6 -11
arch/ia64/mm/init.c
··· 666 666 } 667 667 668 668 #ifdef CONFIG_MEMORY_HOTPLUG 669 - int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, 670 - bool want_memblock) 669 + int arch_add_memory(int nid, u64 start, u64 size, 670 + struct mhp_restrictions *restrictions) 671 671 { 672 672 unsigned long start_pfn = start >> PAGE_SHIFT; 673 673 unsigned long nr_pages = size >> PAGE_SHIFT; 674 674 int ret; 675 675 676 - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); 676 + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); 677 677 if (ret) 678 678 printk("%s: Problem encountered in __add_pages() as ret=%d\n", 679 679 __func__, ret); ··· 682 682 } 683 683 684 684 #ifdef CONFIG_MEMORY_HOTREMOVE 685 - int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) 685 + void arch_remove_memory(int nid, u64 start, u64 size, 686 + struct vmem_altmap *altmap) 686 687 { 687 688 unsigned long start_pfn = start >> PAGE_SHIFT; 688 689 unsigned long nr_pages = size >> PAGE_SHIFT; 689 690 struct zone *zone; 690 - int ret; 691 691 692 692 zone = page_zone(pfn_to_page(start_pfn)); 693 - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); 694 - if (ret) 695 - pr_warn("%s: Problem encountered in __remove_pages() as" 696 - " ret=%d\n", __func__, ret); 697 - 698 - return ret; 693 + __remove_pages(zone, start_pfn, nr_pages, altmap); 699 694 } 700 695 #endif 701 696 #endif
-1
arch/m68k/Kconfig
··· 26 26 select MODULES_USE_ELF_RELA 27 27 select OLD_SIGSUSPEND3 28 28 select OLD_SIGACTION 29 - select ARCH_DISCARD_MEMBLOCK 30 29 select MMU_GATHER_NO_RANGE if MMU 31 30 32 31 config CPU_BIG_ENDIAN
-7
arch/m68k/mm/init.c
··· 147 147 init_pointer_tables(); 148 148 mem_init_print_info(NULL); 149 149 } 150 - 151 - #ifdef CONFIG_BLK_DEV_INITRD 152 - void free_initrd_mem(unsigned long start, unsigned long end) 153 - { 154 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 155 - } 156 - #endif
-12
arch/microblaze/mm/init.c
··· 186 186 paging_init(); 187 187 } 188 188 189 - #ifdef CONFIG_BLK_DEV_INITRD 190 - void free_initrd_mem(unsigned long start, unsigned long end) 191 - { 192 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 193 - } 194 - #endif 195 - 196 - void free_initmem(void) 197 - { 198 - free_initmem_default(-1); 199 - } 200 - 201 189 void __init mem_init(void) 202 190 { 203 191 high_memory = (void *)__va(memory_start + lowmem_size - 1);
-1
arch/mips/Kconfig
··· 5 5 select ARCH_32BIT_OFF_T if !64BIT 6 6 select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT 7 7 select ARCH_CLOCKSOURCE_DATA 8 - select ARCH_DISCARD_MEMBLOCK 9 8 select ARCH_HAS_ELF_RANDOMIZE 10 9 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST 11 10 select ARCH_HAS_UBSAN_SANITIZE_ALL
+6 -5
arch/mips/mm/gup.c
··· 235 235 * get_user_pages_fast() - pin user pages in memory 236 236 * @start: starting user address 237 237 * @nr_pages: number of pages from start to pin 238 - * @write: whether pages will be written to 238 + * @gup_flags: flags modifying pin behaviour 239 239 * @pages: array that receives pointers to the pages pinned. 240 240 * Should be at least nr_pages long. 241 241 * ··· 247 247 * requested. If nr_pages is 0 or negative, returns 0. If no pages 248 248 * were pinned, returns -errno. 249 249 */ 250 - int get_user_pages_fast(unsigned long start, int nr_pages, int write, 251 - struct page **pages) 250 + int get_user_pages_fast(unsigned long start, int nr_pages, 251 + unsigned int gup_flags, struct page **pages) 252 252 { 253 253 struct mm_struct *mm = current->mm; 254 254 unsigned long addr, len, end; ··· 273 273 next = pgd_addr_end(addr, end); 274 274 if (pgd_none(pgd)) 275 275 goto slow; 276 - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) 276 + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, 277 + pages, &nr)) 277 278 goto slow; 278 279 } while (pgdp++, addr = next, addr != end); 279 280 local_irq_enable(); ··· 290 289 pages += nr; 291 290 292 291 ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, 293 - pages, write ? FOLL_WRITE : 0); 292 + pages, gup_flags); 294 293 295 294 /* Have to be a bit careful with return values */ 296 295 if (nr > 0) {
-8
arch/mips/mm/init.c
··· 504 504 printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); 505 505 } 506 506 507 - #ifdef CONFIG_BLK_DEV_INITRD 508 - void free_initrd_mem(unsigned long start, unsigned long end) 509 - { 510 - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, 511 - "initrd"); 512 - } 513 - #endif 514 - 515 507 void (*free_init_pages_eva)(void *begin, void *end) = NULL; 516 508 517 509 void __ref free_initmem(void)
-12
arch/nds32/mm/init.c
··· 252 252 return; 253 253 } 254 254 255 - void free_initmem(void) 256 - { 257 - free_initmem_default(-1); 258 - } 259 - 260 - #ifdef CONFIG_BLK_DEV_INITRD 261 - void free_initrd_mem(unsigned long start, unsigned long end) 262 - { 263 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 264 - } 265 - #endif 266 - 267 255 void __set_fixmap(enum fixed_addresses idx, 268 256 phys_addr_t phys, pgprot_t flags) 269 257 {
-1
arch/nios2/Kconfig
··· 23 23 select SPARSE_IRQ 24 24 select USB_ARCH_HAS_HCD if USB_SUPPORT 25 25 select CPU_NO_EFFICIENT_FFS 26 - select ARCH_DISCARD_MEMBLOCK 27 26 select MMU_GATHER_NO_RANGE if MMU 28 27 29 28 config GENERIC_CSUM
-12
arch/nios2/mm/init.c
··· 82 82 flush_tlb_all(); 83 83 } 84 84 85 - #ifdef CONFIG_BLK_DEV_INITRD 86 - void __init free_initrd_mem(unsigned long start, unsigned long end) 87 - { 88 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 89 - } 90 - #endif 91 - 92 - void __ref free_initmem(void) 93 - { 94 - free_initmem_default(-1); 95 - } 96 - 97 85 #define __page_aligned(order) __aligned(PAGE_SIZE << (order)) 98 86 pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned(PGD_ORDER); 99 87 pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned(PTE_ORDER);
-12
arch/openrisc/mm/init.c
··· 223 223 mem_init_done = 1; 224 224 return; 225 225 } 226 - 227 - #ifdef CONFIG_BLK_DEV_INITRD 228 - void free_initrd_mem(unsigned long start, unsigned long end) 229 - { 230 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 231 - } 232 - #endif 233 - 234 - void free_initmem(void) 235 - { 236 - free_initmem_default(-1); 237 - }
-7
arch/parisc/mm/init.c
··· 917 917 spin_unlock(&sid_lock); 918 918 } 919 919 #endif 920 - 921 - #ifdef CONFIG_BLK_DEV_INITRD 922 - void free_initrd_mem(unsigned long start, unsigned long end) 923 - { 924 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 925 - } 926 - #endif
+1
arch/powerpc/Kconfig
··· 137 137 select ARCH_HAS_UBSAN_SANITIZE_ALL 138 138 select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64 139 139 select ARCH_HAVE_NMI_SAFE_CMPXCHG 140 + select ARCH_KEEP_MEMBLOCK 140 141 select ARCH_MIGHT_HAVE_PC_PARPORT 141 142 select ARCH_MIGHT_HAVE_PC_SERIO 142 143 select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
+2 -3
arch/powerpc/include/asm/book3s/64/hugetlb.h
··· 36 36 } 37 37 } 38 38 39 - #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE 40 - static inline bool gigantic_page_supported(void) 39 + #define __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED 40 + static inline bool gigantic_page_runtime_supported(void) 41 41 { 42 42 /* 43 43 * We used gigantic page reservation with hypervisor assist in some case. ··· 49 49 50 50 return true; 51 51 } 52 - #endif 53 52 54 53 /* hugepd entry valid bit */ 55 54 #define HUGEPD_VAL_BITS (0x8000000000000000UL)
+2 -2
arch/powerpc/kvm/book3s_64_mmu_hv.c
··· 600 600 /* If writing != 0, then the HPTE must allow writing, if we get here */ 601 601 write_ok = writing; 602 602 hva = gfn_to_hva_memslot(memslot, gfn); 603 - npages = get_user_pages_fast(hva, 1, writing, pages); 603 + npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages); 604 604 if (npages < 1) { 605 605 /* Check if it's an I/O mapping */ 606 606 down_read(&current->mm->mmap_sem); ··· 1193 1193 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 1194 1194 goto err; 1195 1195 hva = gfn_to_hva_memslot(memslot, gfn); 1196 - npages = get_user_pages_fast(hva, 1, 1, pages); 1196 + npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages); 1197 1197 if (npages < 1) 1198 1198 goto err; 1199 1199 page = pages[0];
+1 -1
arch/powerpc/kvm/e500_mmu.c
··· 783 783 if (!pages) 784 784 return -ENOMEM; 785 785 786 - ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); 786 + ret = get_user_pages_fast(cfg->array, num_pages, FOLL_WRITE, pages); 787 787 if (ret < 0) 788 788 goto free_pages; 789 789
+3 -2
arch/powerpc/mm/book3s64/iommu_api.c
··· 141 141 for (entry = 0; entry < entries; entry += chunk) { 142 142 unsigned long n = min(entries - entry, chunk); 143 143 144 - ret = get_user_pages_longterm(ua + (entry << PAGE_SHIFT), n, 145 - FOLL_WRITE, mem->hpages + entry, NULL); 144 + ret = get_user_pages(ua + (entry << PAGE_SHIFT), n, 145 + FOLL_WRITE | FOLL_LONGTERM, 146 + mem->hpages + entry, NULL); 146 147 if (ret == n) { 147 148 pinned += n; 148 149 continue;
+6 -16
arch/powerpc/mm/mem.c
··· 109 109 return -ENODEV; 110 110 } 111 111 112 - int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, 113 - bool want_memblock) 112 + int __ref arch_add_memory(int nid, u64 start, u64 size, 113 + struct mhp_restrictions *restrictions) 114 114 { 115 115 unsigned long start_pfn = start >> PAGE_SHIFT; 116 116 unsigned long nr_pages = size >> PAGE_SHIFT; ··· 127 127 } 128 128 flush_inval_dcache_range(start, start + size); 129 129 130 - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); 130 + return __add_pages(nid, start_pfn, nr_pages, restrictions); 131 131 } 132 132 133 133 #ifdef CONFIG_MEMORY_HOTREMOVE 134 - int __ref arch_remove_memory(int nid, u64 start, u64 size, 134 + void __ref arch_remove_memory(int nid, u64 start, u64 size, 135 135 struct vmem_altmap *altmap) 136 136 { 137 137 unsigned long start_pfn = start >> PAGE_SHIFT; ··· 147 147 if (altmap) 148 148 page += vmem_altmap_offset(altmap); 149 149 150 - ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); 151 - if (ret) 152 - return ret; 150 + __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); 153 151 154 152 /* Remove htab bolted mappings for this section of memory */ 155 153 start = (unsigned long)__va(start); 156 154 flush_inval_dcache_range(start, start + size); 157 155 ret = remove_section_mapping(start, start + size); 156 + WARN_ON_ONCE(ret); 158 157 159 158 /* Ensure all vmalloc mappings are flushed in case they also 160 159 * hit that section of memory ··· 162 163 163 164 if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) 164 165 pr_warn("Hash collision while resizing HPT\n"); 165 - 166 - return ret; 167 166 } 168 167 #endif 169 168 #endif /* CONFIG_MEMORY_HOTPLUG */ ··· 334 337 init_mem_is_free = true; 335 338 free_initmem_default(POISON_FREE_INITMEM); 336 339 } 337 - 338 - #ifdef CONFIG_BLK_DEV_INITRD 339 - void __init free_initrd_mem(unsigned long start, unsigned long end) 340 - { 341 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 342 - } 343 - #endif 344 340 345 341 /* 346 342 * This is called when a page has been modified by the kernel.
+1 -1
arch/powerpc/platforms/Kconfig.cputype
··· 331 331 config PPC_RADIX_MMU 332 332 bool "Radix MMU Support" 333 333 depends on PPC_BOOK3S_64 && HUGETLB_PAGE 334 - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA 334 + select ARCH_HAS_GIGANTIC_PAGE 335 335 select PPC_HAVE_KUEP 336 336 select PPC_HAVE_KUAP 337 337 default y
-5
arch/riscv/mm/init.c
··· 66 66 mem_init_print_info(NULL); 67 67 } 68 68 69 - void free_initmem(void) 70 - { 71 - free_initmem_default(0); 72 - } 73 - 74 69 #ifdef CONFIG_BLK_DEV_INITRD 75 70 static void __init setup_initrd(void) 76 71 {
+2 -1
arch/s390/Kconfig
··· 63 63 select ARCH_HAS_ELF_RANDOMIZE 64 64 select ARCH_HAS_FORTIFY_SOURCE 65 65 select ARCH_HAS_GCOV_PROFILE_ALL 66 - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA 66 + select ARCH_HAS_GIGANTIC_PAGE 67 67 select ARCH_HAS_KCOV 68 68 select ARCH_HAS_PTE_SPECIAL 69 69 select ARCH_HAS_SET_MEMORY ··· 100 100 select ARCH_INLINE_WRITE_UNLOCK_BH 101 101 select ARCH_INLINE_WRITE_UNLOCK_IRQ 102 102 select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE 103 + select ARCH_KEEP_MEMBLOCK 103 104 select ARCH_SAVE_PAGE_KEYS if HIBERNATION 104 105 select ARCH_SUPPORTS_ATOMIC_RMW 105 106 select ARCH_SUPPORTS_NUMA_BALANCING
+5 -3
arch/s390/include/asm/hugetlb.h
··· 116 116 return pte_modify(pte, newprot); 117 117 } 118 118 119 - #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE 120 - static inline bool gigantic_page_supported(void) { return true; } 121 - #endif 119 + static inline bool gigantic_page_runtime_supported(void) 120 + { 121 + return true; 122 + } 123 + 122 124 #endif /* _ASM_S390_HUGETLB_H */
+1 -1
arch/s390/kvm/interrupt.c
··· 2376 2376 ret = -EFAULT; 2377 2377 goto out; 2378 2378 } 2379 - ret = get_user_pages_fast(map->addr, 1, 1, &map->page); 2379 + ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page); 2380 2380 if (ret < 0) 2381 2381 goto out; 2382 2382 BUG_ON(ret != 1);
+6 -13
arch/s390/mm/init.c
··· 157 157 free_initmem_default(POISON_FREE_INITMEM); 158 158 } 159 159 160 - #ifdef CONFIG_BLK_DEV_INITRD 161 - void __init free_initrd_mem(unsigned long start, unsigned long end) 162 - { 163 - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, 164 - "initrd"); 165 - } 166 - #endif 167 - 168 160 unsigned long memory_block_size_bytes(void) 169 161 { 170 162 /* ··· 219 227 220 228 #endif /* CONFIG_CMA */ 221 229 222 - int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, 223 - bool want_memblock) 230 + int arch_add_memory(int nid, u64 start, u64 size, 231 + struct mhp_restrictions *restrictions) 224 232 { 225 233 unsigned long start_pfn = PFN_DOWN(start); 226 234 unsigned long size_pages = PFN_DOWN(size); ··· 230 238 if (rc) 231 239 return rc; 232 240 233 - rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock); 241 + rc = __add_pages(nid, start_pfn, size_pages, restrictions); 234 242 if (rc) 235 243 vmem_remove_mapping(start, size); 236 244 return rc; 237 245 } 238 246 239 247 #ifdef CONFIG_MEMORY_HOTREMOVE 240 - int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) 248 + void arch_remove_memory(int nid, u64 start, u64 size, 249 + struct vmem_altmap *altmap) 241 250 { 242 251 /* 243 252 * There is no hardware or firmware interface which could trigger a 244 253 * hot memory remove on s390. So there is nothing that needs to be 245 254 * implemented. 246 255 */ 247 - return -EBUSY; 256 + BUG(); 248 257 } 249 258 #endif 250 259 #endif /* CONFIG_MEMORY_HOTPLUG */
+1 -1
arch/sh/Kconfig
··· 10 10 select DMA_DECLARE_COHERENT 11 11 select HAVE_IDE if HAS_IOPORT_MAP 12 12 select HAVE_MEMBLOCK_NODE_MAP 13 - select ARCH_DISCARD_MEMBLOCK 14 13 select HAVE_OPROFILE 15 14 select HAVE_ARCH_TRACEHOOK 16 15 select HAVE_PERF_EVENTS ··· 52 53 select HAVE_FUTEX_CMPXCHG if FUTEX 53 54 select HAVE_NMI 54 55 select NEED_SG_DMA_LENGTH 56 + select ARCH_HAS_GIGANTIC_PAGE 55 57 56 58 help 57 59 The SuperH is a RISC processor targeted for use in embedded systems
-1
arch/sh/boards/mach-dreamcast/irq.c
··· 10 10 */ 11 11 #include <linux/irq.h> 12 12 #include <linux/io.h> 13 - #include <linux/irq.h> 14 13 #include <linux/export.h> 15 14 #include <linux/err.h> 16 15 #include <mach/sysasic.h>
+6 -5
arch/sh/mm/gup.c
··· 204 204 * get_user_pages_fast() - pin user pages in memory 205 205 * @start: starting user address 206 206 * @nr_pages: number of pages from start to pin 207 - * @write: whether pages will be written to 207 + * @gup_flags: flags modifying pin behaviour 208 208 * @pages: array that receives pointers to the pages pinned. 209 209 * Should be at least nr_pages long. 210 210 * ··· 216 216 * requested. If nr_pages is 0 or negative, returns 0. If no pages 217 217 * were pinned, returns -errno. 218 218 */ 219 - int get_user_pages_fast(unsigned long start, int nr_pages, int write, 220 - struct page **pages) 219 + int get_user_pages_fast(unsigned long start, int nr_pages, 220 + unsigned int gup_flags, struct page **pages) 221 221 { 222 222 struct mm_struct *mm = current->mm; 223 223 unsigned long addr, len, end; ··· 241 241 next = pgd_addr_end(addr, end); 242 242 if (pgd_none(pgd)) 243 243 goto slow; 244 - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) 244 + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, 245 + pages, &nr)) 245 246 goto slow; 246 247 } while (pgdp++, addr = next, addr != end); 247 248 local_irq_enable(); ··· 262 261 263 262 ret = get_user_pages_unlocked(start, 264 263 (end - start) >> PAGE_SHIFT, pages, 265 - write ? FOLL_WRITE : 0); 264 + gup_flags); 266 265 267 266 /* Have to be a bit careful with return values */ 268 267 if (nr > 0) {
+6 -23
arch/sh/mm/init.c
··· 403 403 mem_init_done = 1; 404 404 } 405 405 406 - void free_initmem(void) 407 - { 408 - free_initmem_default(-1); 409 - } 410 - 411 - #ifdef CONFIG_BLK_DEV_INITRD 412 - void free_initrd_mem(unsigned long start, unsigned long end) 413 - { 414 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 415 - } 416 - #endif 417 - 418 406 #ifdef CONFIG_MEMORY_HOTPLUG 419 - int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, 420 - bool want_memblock) 407 + int arch_add_memory(int nid, u64 start, u64 size, 408 + struct mhp_restrictions *restrictions) 421 409 { 422 410 unsigned long start_pfn = PFN_DOWN(start); 423 411 unsigned long nr_pages = size >> PAGE_SHIFT; 424 412 int ret; 425 413 426 414 /* We only have ZONE_NORMAL, so this is easy.. */ 427 - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); 415 + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); 428 416 if (unlikely(ret)) 429 417 printk("%s: Failed, __add_pages() == %d\n", __func__, ret); 430 418 ··· 429 441 #endif 430 442 431 443 #ifdef CONFIG_MEMORY_HOTREMOVE 432 - int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) 444 + void arch_remove_memory(int nid, u64 start, u64 size, 445 + struct vmem_altmap *altmap) 433 446 { 434 447 unsigned long start_pfn = PFN_DOWN(start); 435 448 unsigned long nr_pages = size >> PAGE_SHIFT; 436 449 struct zone *zone; 437 - int ret; 438 450 439 451 zone = page_zone(pfn_to_page(start_pfn)); 440 - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); 441 - if (unlikely(ret)) 442 - pr_warn("%s: Failed, __remove_pages() == %d\n", __func__, 443 - ret); 444 - 445 - return ret; 452 + __remove_pages(zone, start_pfn, nr_pages, altmap); 446 453 } 447 454 #endif 448 455 #endif /* CONFIG_MEMORY_HOTPLUG */
+1
arch/sparc/Kconfig
··· 92 92 select ARCH_CLOCKSOURCE_DATA 93 93 select ARCH_HAS_PTE_SPECIAL 94 94 select PCI_DOMAINS if PCI 95 + select ARCH_HAS_GIGANTIC_PAGE 95 96 96 97 config ARCH_DEFCONFIG 97 98 string
-30
arch/sparc/include/asm/pgtable_64.h
··· 231 231 extern struct page *mem_map_zero; 232 232 #define ZERO_PAGE(vaddr) (mem_map_zero) 233 233 234 - /* This macro must be updated when the size of struct page grows above 80 235 - * or reduces below 64. 236 - * The idea that compiler optimizes out switch() statement, and only 237 - * leaves clrx instructions 238 - */ 239 - #define mm_zero_struct_page(pp) do { \ 240 - unsigned long *_pp = (void *)(pp); \ 241 - \ 242 - /* Check that struct page is either 64, 72, or 80 bytes */ \ 243 - BUILD_BUG_ON(sizeof(struct page) & 7); \ 244 - BUILD_BUG_ON(sizeof(struct page) < 64); \ 245 - BUILD_BUG_ON(sizeof(struct page) > 80); \ 246 - \ 247 - switch (sizeof(struct page)) { \ 248 - case 80: \ 249 - _pp[9] = 0; /* fallthrough */ \ 250 - case 72: \ 251 - _pp[8] = 0; /* fallthrough */ \ 252 - default: \ 253 - _pp[7] = 0; \ 254 - _pp[6] = 0; \ 255 - _pp[5] = 0; \ 256 - _pp[4] = 0; \ 257 - _pp[3] = 0; \ 258 - _pp[2] = 0; \ 259 - _pp[1] = 0; \ 260 - _pp[0] = 0; \ 261 - } \ 262 - } while (0) 263 - 264 234 /* PFNs are real physical page numbers. However, mem_map only begins to record 265 235 * per-page information starting at pfn_base. This is to handle systems where 266 236 * the first physical page in the machine is at some huge physical address,
+5 -4
arch/sparc/mm/gup.c
··· 245 245 return nr; 246 246 } 247 247 248 - int get_user_pages_fast(unsigned long start, int nr_pages, int write, 249 - struct page **pages) 248 + int get_user_pages_fast(unsigned long start, int nr_pages, 249 + unsigned int gup_flags, struct page **pages) 250 250 { 251 251 struct mm_struct *mm = current->mm; 252 252 unsigned long addr, len, end; ··· 303 303 next = pgd_addr_end(addr, end); 304 304 if (pgd_none(pgd)) 305 305 goto slow; 306 - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) 306 + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, 307 + pages, &nr)) 307 308 goto slow; 308 309 } while (pgdp++, addr = next, addr != end); 309 310 ··· 325 324 326 325 ret = get_user_pages_unlocked(start, 327 326 (end - start) >> PAGE_SHIFT, pages, 328 - write ? FOLL_WRITE : 0); 327 + gup_flags); 329 328 330 329 /* Have to be a bit careful with return values */ 331 330 if (nr > 0) {
-13
arch/sparc/mm/init_32.c
··· 294 294 mem_init_print_info(NULL); 295 295 } 296 296 297 - void free_initmem (void) 298 - { 299 - free_initmem_default(POISON_FREE_INITMEM); 300 - } 301 - 302 - #ifdef CONFIG_BLK_DEV_INITRD 303 - void free_initrd_mem(unsigned long start, unsigned long end) 304 - { 305 - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, 306 - "initrd"); 307 - } 308 - #endif 309 - 310 297 void sparc_flush_page_to_ram(struct page *page) 311 298 { 312 299 unsigned long vaddr = (unsigned long)page_address(page);
-8
arch/sparc/mm/init_64.c
··· 2572 2572 } 2573 2573 } 2574 2574 2575 - #ifdef CONFIG_BLK_DEV_INITRD 2576 - void free_initrd_mem(unsigned long start, unsigned long end) 2577 - { 2578 - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, 2579 - "initrd"); 2580 - } 2581 - #endif 2582 - 2583 2575 pgprot_t PAGE_KERNEL __read_mostly; 2584 2576 EXPORT_SYMBOL(PAGE_KERNEL); 2585 2577
-7
arch/um/kernel/mem.c
··· 188 188 { 189 189 } 190 190 191 - #ifdef CONFIG_BLK_DEV_INITRD 192 - void free_initrd_mem(unsigned long start, unsigned long end) 193 - { 194 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 195 - } 196 - #endif 197 - 198 191 /* Allocate and free page tables. */ 199 192 200 193 pgd_t *pgd_alloc(struct mm_struct *mm)
+1
arch/unicore32/Kconfig
··· 3 3 def_bool y 4 4 select ARCH_32BIT_OFF_T 5 5 select ARCH_HAS_DEVMEM_IS_ALLOWED 6 + select ARCH_HAS_KEEPINITRD 6 7 select ARCH_MIGHT_HAVE_PC_PARPORT 7 8 select ARCH_MIGHT_HAVE_PC_SERIO 8 9 select HAVE_KERNEL_GZIP
-24
arch/unicore32/mm/init.c
··· 287 287 sysctl_overcommit_memory = OVERCOMMIT_ALWAYS; 288 288 } 289 289 } 290 - 291 - void free_initmem(void) 292 - { 293 - free_initmem_default(-1); 294 - } 295 - 296 - #ifdef CONFIG_BLK_DEV_INITRD 297 - 298 - static int keep_initrd; 299 - 300 - void free_initrd_mem(unsigned long start, unsigned long end) 301 - { 302 - if (!keep_initrd) 303 - free_reserved_area((void *)start, (void *)end, -1, "initrd"); 304 - } 305 - 306 - static int __init keepinitrd_setup(char *__unused) 307 - { 308 - keep_initrd = 1; 309 - return 1; 310 - } 311 - 312 - __setup("keepinitrd", keepinitrd_setup); 313 - #endif
+1 -2
arch/x86/Kconfig
··· 22 22 def_bool y 23 23 depends on 64BIT 24 24 # Options that are inherently 64-bit kernel only: 25 - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA 25 + select ARCH_HAS_GIGANTIC_PAGE 26 26 select ARCH_SUPPORTS_INT128 27 27 select ARCH_USE_CMPXCHG_LOCKREF 28 28 select HAVE_ARCH_SOFT_DIRTY ··· 47 47 select ARCH_32BIT_OFF_T if X86_32 48 48 select ARCH_CLOCKSOURCE_DATA 49 49 select ARCH_CLOCKSOURCE_INIT 50 - select ARCH_DISCARD_MEMBLOCK 51 50 select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI 52 51 select ARCH_HAS_DEBUG_VIRTUAL 53 52 select ARCH_HAS_DEVMEM_IS_ALLOWED
-4
arch/x86/include/asm/hugetlb.h
··· 17 17 { 18 18 } 19 19 20 - #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE 21 - static inline bool gigantic_page_supported(void) { return true; } 22 - #endif 23 - 24 20 #endif /* _ASM_X86_HUGETLB_H */
+1 -1
arch/x86/kvm/paging_tmpl.h
··· 140 140 pt_element_t *table; 141 141 struct page *page; 142 142 143 - npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); 143 + npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page); 144 144 /* Check if the user is doing something meaningless. */ 145 145 if (unlikely(npages != 1)) 146 146 return -EFAULT;
+1 -1
arch/x86/kvm/svm.c
··· 1805 1805 return NULL; 1806 1806 1807 1807 /* Pin the user virtual address. */ 1808 - npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); 1808 + npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages); 1809 1809 if (npinned != npages) { 1810 1810 pr_err("SEV: Failure locking %lu pages.\n", npages); 1811 1811 goto err;
+1 -1
arch/x86/mm/hugetlbpage.c
··· 203 203 } 204 204 __setup("hugepagesz=", setup_hugepagesz); 205 205 206 - #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) 206 + #ifdef CONFIG_CONTIG_ALLOC 207 207 static __init int gigantic_pages_init(void) 208 208 { 209 209 /* With compaction or CMA we can allocate gigantic pages at runtime */
+6 -5
arch/x86/mm/init_32.c
··· 850 850 } 851 851 852 852 #ifdef CONFIG_MEMORY_HOTPLUG 853 - int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, 854 - bool want_memblock) 853 + int arch_add_memory(int nid, u64 start, u64 size, 854 + struct mhp_restrictions *restrictions) 855 855 { 856 856 unsigned long start_pfn = start >> PAGE_SHIFT; 857 857 unsigned long nr_pages = size >> PAGE_SHIFT; 858 858 859 - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); 859 + return __add_pages(nid, start_pfn, nr_pages, restrictions); 860 860 } 861 861 862 862 #ifdef CONFIG_MEMORY_HOTREMOVE 863 - int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) 863 + void arch_remove_memory(int nid, u64 start, u64 size, 864 + struct vmem_altmap *altmap) 864 865 { 865 866 unsigned long start_pfn = start >> PAGE_SHIFT; 866 867 unsigned long nr_pages = size >> PAGE_SHIFT; 867 868 struct zone *zone; 868 869 869 870 zone = page_zone(pfn_to_page(start_pfn)); 870 - return __remove_pages(zone, start_pfn, nr_pages, altmap); 871 + __remove_pages(zone, start_pfn, nr_pages, altmap); 871 872 } 872 873 #endif 873 874 #endif
+8 -12
arch/x86/mm/init_64.c
··· 777 777 } 778 778 779 779 int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, 780 - struct vmem_altmap *altmap, bool want_memblock) 780 + struct mhp_restrictions *restrictions) 781 781 { 782 782 int ret; 783 783 784 - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); 784 + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); 785 785 WARN_ON_ONCE(ret); 786 786 787 787 /* update max_pfn, max_low_pfn and high_memory */ ··· 791 791 return ret; 792 792 } 793 793 794 - int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, 795 - bool want_memblock) 794 + int arch_add_memory(int nid, u64 start, u64 size, 795 + struct mhp_restrictions *restrictions) 796 796 { 797 797 unsigned long start_pfn = start >> PAGE_SHIFT; 798 798 unsigned long nr_pages = size >> PAGE_SHIFT; 799 799 800 800 init_memory_mapping(start, start + size); 801 801 802 - return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); 802 + return add_pages(nid, start_pfn, nr_pages, restrictions); 803 803 } 804 804 805 805 #define PAGE_INUSE 0xFD ··· 1141 1141 remove_pagetable(start, end, true, NULL); 1142 1142 } 1143 1143 1144 - int __ref arch_remove_memory(int nid, u64 start, u64 size, 1145 - struct vmem_altmap *altmap) 1144 + void __ref arch_remove_memory(int nid, u64 start, u64 size, 1145 + struct vmem_altmap *altmap) 1146 1146 { 1147 1147 unsigned long start_pfn = start >> PAGE_SHIFT; 1148 1148 unsigned long nr_pages = size >> PAGE_SHIFT; 1149 1149 struct page *page = pfn_to_page(start_pfn); 1150 1150 struct zone *zone; 1151 - int ret; 1152 1151 1153 1152 /* With altmap the first mapped page is offset from @start */ 1154 1153 if (altmap) 1155 1154 page += vmem_altmap_offset(altmap); 1156 1155 zone = page_zone(page); 1157 - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); 1158 - WARN_ON_ONCE(ret); 1156 + __remove_pages(zone, start_pfn, nr_pages, altmap); 1159 1157 kernel_physical_mapping_remove(start, start + size); 1160 - 1161 - return ret; 1162 1158 } 1163 1159 #endif 1164 1160 #endif /* CONFIG_MEMORY_HOTPLUG */
-5
arch/xtensa/mm/init.c
··· 216 216 } 217 217 #endif 218 218 219 - void free_initmem(void) 220 - { 221 - free_initmem_default(-1); 222 - } 223 - 224 219 static void __init parse_memmap_one(char *p) 225 220 { 226 221 char *oldp;
+9 -15
drivers/base/memory.c
··· 231 231 * OK to have direct references to sparsemem variables in here. 232 232 */ 233 233 static int 234 - memory_block_action(unsigned long phys_index, unsigned long action, int online_type) 234 + memory_block_action(unsigned long start_section_nr, unsigned long action, 235 + int online_type) 235 236 { 236 237 unsigned long start_pfn; 237 238 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; 238 239 int ret; 239 240 240 - start_pfn = section_nr_to_pfn(phys_index); 241 + start_pfn = section_nr_to_pfn(start_section_nr); 241 242 242 243 switch (action) { 243 244 case MEM_ONLINE: ··· 252 251 break; 253 252 default: 254 253 WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " 255 - "%ld\n", __func__, phys_index, action, action); 254 + "%ld\n", __func__, start_section_nr, action, action); 256 255 ret = -EINVAL; 257 256 } 258 257 ··· 734 733 { 735 734 BUG_ON(memory->dev.bus != &memory_subsys); 736 735 737 - /* drop the ref. we got in remove_memory_section() */ 736 + /* drop the ref. we got via find_memory_block() */ 738 737 put_device(&memory->dev); 739 738 device_unregister(&memory->dev); 740 739 } 741 740 742 - static int remove_memory_section(unsigned long node_id, 743 - struct mem_section *section, int phys_device) 741 + void unregister_memory_section(struct mem_section *section) 744 742 { 745 743 struct memory_block *mem; 744 + 745 + if (WARN_ON_ONCE(!present_section(section))) 746 + return; 746 747 747 748 mutex_lock(&mem_sysfs_mutex); 748 749 ··· 766 763 767 764 out_unlock: 768 765 mutex_unlock(&mem_sysfs_mutex); 769 - return 0; 770 - } 771 - 772 - int unregister_memory_section(struct mem_section *section) 773 - { 774 - if (!present_section(section)) 775 - return -EINVAL; 776 - 777 - return remove_memory_section(0, section, 0); 778 766 } 779 767 #endif /* CONFIG_MEMORY_HOTREMOVE */ 780 768
+2 -4
drivers/dax/device.c
··· 184 184 185 185 *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); 186 186 187 - return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, *pfn, 188 - vmf->flags & FAULT_FLAG_WRITE); 187 + return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); 189 188 } 190 189 191 190 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD ··· 234 235 235 236 *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); 236 237 237 - return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, *pfn, 238 - vmf->flags & FAULT_FLAG_WRITE); 238 + return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); 239 239 } 240 240 #else 241 241 static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
+2 -13
drivers/firewire/core-iso.c
··· 107 107 int fw_iso_buffer_map_vma(struct fw_iso_buffer *buffer, 108 108 struct vm_area_struct *vma) 109 109 { 110 - unsigned long uaddr; 111 - int i, err; 112 - 113 - uaddr = vma->vm_start; 114 - for (i = 0; i < buffer->page_count; i++) { 115 - err = vm_insert_page(vma, uaddr, buffer->pages[i]); 116 - if (err) 117 - return err; 118 - 119 - uaddr += PAGE_SIZE; 120 - } 121 - 122 - return 0; 110 + return vm_map_pages_zero(vma, buffer->pages, 111 + buffer->page_count); 123 112 } 124 113 125 114 void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer,
+1 -1
drivers/fpga/dfl-afu-dma-region.c
··· 102 102 goto unlock_vm; 103 103 } 104 104 105 - pinned = get_user_pages_fast(region->user_addr, npages, 1, 105 + pinned = get_user_pages_fast(region->user_addr, npages, FOLL_WRITE, 106 106 region->pages); 107 107 if (pinned < 0) { 108 108 ret = pinned;
+4 -4
drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
··· 256 256 /* TODO we should be able to split locking for interval tree and 257 257 * amdgpu_mn_invalidate_node 258 258 */ 259 - if (amdgpu_mn_read_lock(amn, range->blockable)) 259 + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) 260 260 return -EAGAIN; 261 261 262 262 it = interval_tree_iter_first(&amn->objects, range->start, end); 263 263 while (it) { 264 264 struct amdgpu_mn_node *node; 265 265 266 - if (!range->blockable) { 266 + if (!mmu_notifier_range_blockable(range)) { 267 267 amdgpu_mn_read_unlock(amn); 268 268 return -EAGAIN; 269 269 } ··· 299 299 /* notification is exclusive, but interval is inclusive */ 300 300 end = range->end - 1; 301 301 302 - if (amdgpu_mn_read_lock(amn, range->blockable)) 302 + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) 303 303 return -EAGAIN; 304 304 305 305 it = interval_tree_iter_first(&amn->objects, range->start, end); ··· 307 307 struct amdgpu_mn_node *node; 308 308 struct amdgpu_bo *bo; 309 309 310 - if (!range->blockable) { 310 + if (!mmu_notifier_range_blockable(range)) { 311 311 amdgpu_mn_read_unlock(amn); 312 312 return -EAGAIN; 313 313 }
+1 -1
drivers/gpu/drm/i915/i915_gem_userptr.c
··· 122 122 while (it) { 123 123 struct drm_i915_gem_object *obj; 124 124 125 - if (!range->blockable) { 125 + if (!mmu_notifier_range_blockable(range)) { 126 126 ret = -EAGAIN; 127 127 break; 128 128 }
+2 -2
drivers/gpu/drm/radeon/radeon_mn.c
··· 133 133 /* TODO we should be able to split locking for interval tree and 134 134 * the tear down. 135 135 */ 136 - if (range->blockable) 136 + if (mmu_notifier_range_blockable(range)) 137 137 mutex_lock(&rmn->lock); 138 138 else if (!mutex_trylock(&rmn->lock)) 139 139 return -EAGAIN; ··· 144 144 struct radeon_bo *bo; 145 145 long r; 146 146 147 - if (!range->blockable) { 147 + if (!mmu_notifier_range_blockable(range)) { 148 148 ret = -EAGAIN; 149 149 goto out_unlock; 150 150 }
+2 -15
drivers/gpu/drm/rockchip/rockchip_drm_gem.c
··· 221 221 struct vm_area_struct *vma) 222 222 { 223 223 struct rockchip_gem_object *rk_obj = to_rockchip_obj(obj); 224 - unsigned int i, count = obj->size >> PAGE_SHIFT; 224 + unsigned int count = obj->size >> PAGE_SHIFT; 225 225 unsigned long user_count = vma_pages(vma); 226 - unsigned long uaddr = vma->vm_start; 227 - unsigned long offset = vma->vm_pgoff; 228 - unsigned long end = user_count + offset; 229 - int ret; 230 226 231 227 if (user_count == 0) 232 228 return -ENXIO; 233 - if (end > count) 234 - return -ENXIO; 235 229 236 - for (i = offset; i < end; i++) { 237 - ret = vm_insert_page(vma, uaddr, rk_obj->pages[i]); 238 - if (ret) 239 - return ret; 240 - uaddr += PAGE_SIZE; 241 - } 242 - 243 - return 0; 230 + return vm_map_pages(vma, rk_obj->pages, count); 244 231 } 245 232 246 233 static int rockchip_drm_gem_object_mmap_dma(struct drm_gem_object *obj,
+2 -1
drivers/gpu/drm/via/via_dmablit.c
··· 243 243 if (NULL == vsg->pages) 244 244 return -ENOMEM; 245 245 ret = get_user_pages_fast((unsigned long)xfer->mem_addr, 246 - vsg->num_pages, vsg->direction == DMA_FROM_DEVICE, 246 + vsg->num_pages, 247 + vsg->direction == DMA_FROM_DEVICE ? FOLL_WRITE : 0, 247 248 vsg->pages); 248 249 if (ret != vsg->num_pages) { 249 250 if (ret < 0)
+5 -13
drivers/gpu/drm/xen/xen_drm_front_gem.c
··· 224 224 static int gem_mmap_obj(struct xen_gem_object *xen_obj, 225 225 struct vm_area_struct *vma) 226 226 { 227 - unsigned long addr = vma->vm_start; 228 - int i; 227 + int ret; 229 228 230 229 /* 231 230 * clear the VM_PFNMAP flag that was set by drm_gem_mmap(), and set the ··· 251 252 * FIXME: as we insert all the pages now then no .fault handler must 252 253 * be called, so don't provide one 253 254 */ 254 - for (i = 0; i < xen_obj->num_pages; i++) { 255 - int ret; 255 + ret = vm_map_pages(vma, xen_obj->pages, xen_obj->num_pages); 256 + if (ret < 0) 257 + DRM_ERROR("Failed to map pages into vma: %d\n", ret); 256 258 257 - ret = vm_insert_page(vma, addr, xen_obj->pages[i]); 258 - if (ret < 0) { 259 - DRM_ERROR("Failed to insert pages into vma: %d\n", ret); 260 - return ret; 261 - } 262 - 263 - addr += PAGE_SIZE; 264 - } 265 - return 0; 259 + return ret; 266 260 } 267 261 268 262 int xen_drm_front_gem_mmap(struct file *filp, struct vm_area_struct *vma)
+3 -2
drivers/infiniband/core/umem.c
··· 295 295 296 296 while (npages) { 297 297 down_read(&mm->mmap_sem); 298 - ret = get_user_pages_longterm(cur_base, 298 + ret = get_user_pages(cur_base, 299 299 min_t(unsigned long, npages, 300 300 PAGE_SIZE / sizeof (struct page *)), 301 - gup_flags, page_list, NULL); 301 + gup_flags | FOLL_LONGTERM, 302 + page_list, NULL); 302 303 if (ret < 0) { 303 304 up_read(&mm->mmap_sem); 304 305 goto umem_release;
+3 -2
drivers/infiniband/core/umem_odp.c
··· 152 152 struct ib_ucontext_per_mm *per_mm = 153 153 container_of(mn, struct ib_ucontext_per_mm, mn); 154 154 155 - if (range->blockable) 155 + if (mmu_notifier_range_blockable(range)) 156 156 down_read(&per_mm->umem_rwsem); 157 157 else if (!down_read_trylock(&per_mm->umem_rwsem)) 158 158 return -EAGAIN; ··· 170 170 return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, 171 171 range->end, 172 172 invalidate_range_start_trampoline, 173 - range->blockable, NULL); 173 + mmu_notifier_range_blockable(range), 174 + NULL); 174 175 } 175 176 176 177 static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
+2 -1
drivers/infiniband/hw/hfi1/user_pages.c
··· 104 104 bool writable, struct page **pages) 105 105 { 106 106 int ret; 107 + unsigned int gup_flags = FOLL_LONGTERM | (writable ? FOLL_WRITE : 0); 107 108 108 - ret = get_user_pages_fast(vaddr, npages, writable, pages); 109 + ret = get_user_pages_fast(vaddr, npages, gup_flags, pages); 109 110 if (ret < 0) 110 111 return ret; 111 112
+2 -1
drivers/infiniband/hw/mthca/mthca_memfree.c
··· 472 472 goto out; 473 473 } 474 474 475 - ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages); 475 + ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, 476 + FOLL_WRITE | FOLL_LONGTERM, pages); 476 477 if (ret < 0) 477 478 goto out; 478 479
+4 -4
drivers/infiniband/hw/qib/qib_user_pages.c
··· 114 114 115 115 down_read(&current->mm->mmap_sem); 116 116 for (got = 0; got < num_pages; got += ret) { 117 - ret = get_user_pages_longterm(start_page + got * PAGE_SIZE, 118 - num_pages - got, 119 - FOLL_WRITE | FOLL_FORCE, 120 - p + got, NULL); 117 + ret = get_user_pages(start_page + got * PAGE_SIZE, 118 + num_pages - got, 119 + FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE, 120 + p + got, NULL); 121 121 if (ret < 0) { 122 122 up_read(&current->mm->mmap_sem); 123 123 goto bail_release;
+1 -1
drivers/infiniband/hw/qib/qib_user_sdma.c
··· 670 670 else 671 671 j = npages; 672 672 673 - ret = get_user_pages_fast(addr, j, 0, pages); 673 + ret = get_user_pages_fast(addr, j, FOLL_LONGTERM, pages); 674 674 if (ret != j) { 675 675 i = 0; 676 676 j = ret;
+5 -4
drivers/infiniband/hw/usnic/usnic_uiom.c
··· 143 143 ret = 0; 144 144 145 145 while (npages) { 146 - ret = get_user_pages_longterm(cur_base, 147 - min_t(unsigned long, npages, 148 - PAGE_SIZE / sizeof(struct page *)), 149 - gup_flags, page_list, NULL); 146 + ret = get_user_pages(cur_base, 147 + min_t(unsigned long, npages, 148 + PAGE_SIZE / sizeof(struct page *)), 149 + gup_flags | FOLL_LONGTERM, 150 + page_list, NULL); 150 151 151 152 if (ret < 0) 152 153 goto out;
+1 -11
drivers/iommu/dma-iommu.c
··· 619 619 620 620 int iommu_dma_mmap(struct page **pages, size_t size, struct vm_area_struct *vma) 621 621 { 622 - unsigned long uaddr = vma->vm_start; 623 - unsigned int i, count = PAGE_ALIGN(size) >> PAGE_SHIFT; 624 - int ret = -ENXIO; 625 - 626 - for (i = vma->vm_pgoff; i < count && uaddr < vma->vm_end; i++) { 627 - ret = vm_insert_page(vma, uaddr, pages[i]); 628 - if (ret) 629 - break; 630 - uaddr += PAGE_SIZE; 631 - } 632 - return ret; 622 + return vm_map_pages(vma, pages, PAGE_ALIGN(size) >> PAGE_SHIFT); 633 623 } 634 624 635 625 static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
+7
drivers/media/common/videobuf2/videobuf2-core.c
··· 2201 2201 goto unlock; 2202 2202 } 2203 2203 2204 + /* 2205 + * vm_pgoff is treated in V4L2 API as a 'cookie' to select a buffer, 2206 + * not as a in-buffer offset. We always want to mmap a whole buffer 2207 + * from its beginning. 2208 + */ 2209 + vma->vm_pgoff = 0; 2210 + 2204 2211 ret = call_memop(vb, mmap, vb->planes[plane].mem_priv, vma); 2205 2212 2206 2213 unlock:
-6
drivers/media/common/videobuf2/videobuf2-dma-contig.c
··· 186 186 return -EINVAL; 187 187 } 188 188 189 - /* 190 - * dma_mmap_* uses vm_pgoff as in-buffer offset, but we want to 191 - * map whole buffer 192 - */ 193 - vma->vm_pgoff = 0; 194 - 195 189 ret = dma_mmap_attrs(buf->dev, vma, buf->cookie, 196 190 buf->dma_addr, buf->size, buf->attrs); 197 191
+6 -16
drivers/media/common/videobuf2/videobuf2-dma-sg.c
··· 328 328 static int vb2_dma_sg_mmap(void *buf_priv, struct vm_area_struct *vma) 329 329 { 330 330 struct vb2_dma_sg_buf *buf = buf_priv; 331 - unsigned long uaddr = vma->vm_start; 332 - unsigned long usize = vma->vm_end - vma->vm_start; 333 - int i = 0; 331 + int err; 334 332 335 333 if (!buf) { 336 334 printk(KERN_ERR "No memory to map\n"); 337 335 return -EINVAL; 338 336 } 339 337 340 - do { 341 - int ret; 342 - 343 - ret = vm_insert_page(vma, uaddr, buf->pages[i++]); 344 - if (ret) { 345 - printk(KERN_ERR "Remapping memory, error: %d\n", ret); 346 - return ret; 347 - } 348 - 349 - uaddr += PAGE_SIZE; 350 - usize -= PAGE_SIZE; 351 - } while (usize > 0); 352 - 338 + err = vm_map_pages(vma, buf->pages, buf->num_pages); 339 + if (err) { 340 + printk(KERN_ERR "Remapping memory, error: %d\n", err); 341 + return err; 342 + } 353 343 354 344 /* 355 345 * Use common vm_area operations to track buffer refcount.
+3 -3
drivers/media/v4l2-core/videobuf-dma-sg.c
··· 186 186 dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n", 187 187 data, size, dma->nr_pages); 188 188 189 - err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages, 190 - flags, dma->pages, NULL); 189 + err = get_user_pages(data & PAGE_MASK, dma->nr_pages, 190 + flags | FOLL_LONGTERM, dma->pages, NULL); 191 191 192 192 if (err != dma->nr_pages) { 193 193 dma->nr_pages = (err >= 0) ? err : 0; 194 - dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err, 194 + dprintk(1, "get_user_pages: err=%d [%d]\n", err, 195 195 dma->nr_pages); 196 196 return err < 0 ? err : -EINVAL; 197 197 }
+1 -1
drivers/misc/genwqe/card_utils.c
··· 603 603 /* pin user pages in memory */ 604 604 rc = get_user_pages_fast(data & PAGE_MASK, /* page aligned addr */ 605 605 m->nr_pages, 606 - m->write, /* readable/writable */ 606 + m->write ? FOLL_WRITE : 0, /* readable/writable */ 607 607 m->page_list); /* ptrs to pages */ 608 608 if (rc < 0) 609 609 goto fail_get_user_pages;
+1 -1
drivers/misc/vmw_vmci/vmci_host.c
··· 242 242 /* 243 243 * Lock physical page backing a given user VA. 244 244 */ 245 - retval = get_user_pages_fast(uva, 1, 1, &context->notify_page); 245 + retval = get_user_pages_fast(uva, 1, FOLL_WRITE, &context->notify_page); 246 246 if (retval != 1) { 247 247 context->notify_page = NULL; 248 248 return VMCI_ERROR_GENERIC;
+4 -2
drivers/misc/vmw_vmci/vmci_queue_pair.c
··· 659 659 int err = VMCI_SUCCESS; 660 660 661 661 retval = get_user_pages_fast((uintptr_t) produce_uva, 662 - produce_q->kernel_if->num_pages, 1, 662 + produce_q->kernel_if->num_pages, 663 + FOLL_WRITE, 663 664 produce_q->kernel_if->u.h.header_page); 664 665 if (retval < (int)produce_q->kernel_if->num_pages) { 665 666 pr_debug("get_user_pages_fast(produce) failed (retval=%d)", ··· 672 671 } 673 672 674 673 retval = get_user_pages_fast((uintptr_t) consume_uva, 675 - consume_q->kernel_if->num_pages, 1, 674 + consume_q->kernel_if->num_pages, 675 + FOLL_WRITE, 676 676 consume_q->kernel_if->u.h.header_page); 677 677 if (retval < (int)consume_q->kernel_if->num_pages) { 678 678 pr_debug("get_user_pages_fast(consume) failed (retval=%d)",
+2 -1
drivers/platform/goldfish/goldfish_pipe.c
··· 274 274 *iter_last_page_size = last_page_size; 275 275 } 276 276 277 - ret = get_user_pages_fast(first_page, requested_pages, !is_write, 277 + ret = get_user_pages_fast(first_page, requested_pages, 278 + !is_write ? FOLL_WRITE : 0, 278 279 pages); 279 280 if (ret <= 0) 280 281 return -EFAULT;
+3 -1
drivers/rapidio/devices/rio_mport_cdev.c
··· 868 868 869 869 pinned = get_user_pages_fast( 870 870 (unsigned long)xfer->loc_addr & PAGE_MASK, 871 - nr_pages, dir == DMA_FROM_DEVICE, page_list); 871 + nr_pages, 872 + dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0, 873 + page_list); 872 874 873 875 if (pinned != nr_pages) { 874 876 if (pinned < 0) {
+1 -1
drivers/sbus/char/oradax.c
··· 437 437 438 438 dax_dbg("uva %p", va); 439 439 440 - ret = get_user_pages_fast((unsigned long)va, 1, 1, p); 440 + ret = get_user_pages_fast((unsigned long)va, 1, FOLL_WRITE, p); 441 441 if (ret == 1) { 442 442 dax_dbg("locked page %p, for VA %p", *p, va); 443 443 return 0;
+2 -1
drivers/scsi/st.c
··· 4922 4922 4923 4923 /* Try to fault in all of the necessary pages */ 4924 4924 /* rw==READ means read from drive, write into memory area */ 4925 - res = get_user_pages_fast(uaddr, nr_pages, rw == READ, pages); 4925 + res = get_user_pages_fast(uaddr, nr_pages, rw == READ ? FOLL_WRITE : 0, 4926 + pages); 4926 4927 4927 4928 /* Errors and no page mapped should return here */ 4928 4929 if (res < nr_pages)
+2 -2
drivers/staging/gasket/gasket_page_table.c
··· 486 486 ptes[i].dma_addr = pg_tbl->coherent_pages[0].paddr + 487 487 off + i * PAGE_SIZE; 488 488 } else { 489 - ret = get_user_pages_fast(page_addr - offset, 1, 1, 490 - &page); 489 + ret = get_user_pages_fast(page_addr - offset, 1, 490 + FOLL_WRITE, &page); 491 491 492 492 if (ret <= 0) { 493 493 dev_err(pg_tbl->device,
+1 -1
drivers/tee/tee_shm.c
··· 273 273 goto err; 274 274 } 275 275 276 - rc = get_user_pages_fast(start, num_pages, 1, shm->pages); 276 + rc = get_user_pages_fast(start, num_pages, FOLL_WRITE, shm->pages); 277 277 if (rc > 0) 278 278 shm->num_pages = rc; 279 279 if (rc != num_pages) {
+2 -1
drivers/vfio/vfio_iommu_spapr_tce.c
··· 532 532 enum dma_data_direction direction = iommu_tce_direction(tce); 533 533 534 534 if (get_user_pages_fast(tce & PAGE_MASK, 1, 535 - direction != DMA_TO_DEVICE, &page) != 1) 535 + direction != DMA_TO_DEVICE ? FOLL_WRITE : 0, 536 + &page) != 1) 536 537 return -EFAULT; 537 538 538 539 *hpa = __pa((unsigned long) page_address(page));
+2 -1
drivers/vfio/vfio_iommu_type1.c
··· 358 358 359 359 down_read(&mm->mmap_sem); 360 360 if (mm == current->mm) { 361 - ret = get_user_pages_longterm(vaddr, 1, flags, page, vmas); 361 + ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page, 362 + vmas); 362 363 } else { 363 364 ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, 364 365 vmas, NULL);
+1 -1
drivers/vhost/vhost.c
··· 1704 1704 int bit = nr + (log % PAGE_SIZE) * 8; 1705 1705 int r; 1706 1706 1707 - r = get_user_pages_fast(log, 1, 1, &page); 1707 + r = get_user_pages_fast(log, 1, FOLL_WRITE, &page); 1708 1708 if (r < 0) 1709 1709 return r; 1710 1710 BUG_ON(r != 1);
+1 -1
drivers/video/fbdev/pvr2fb.c
··· 686 686 if (!pages) 687 687 return -ENOMEM; 688 688 689 - ret = get_user_pages_fast((unsigned long)buf, nr_pages, true, pages); 689 + ret = get_user_pages_fast((unsigned long)buf, nr_pages, FOLL_WRITE, pages); 690 690 if (ret < nr_pages) { 691 691 nr_pages = ret; 692 692 ret = -EINVAL;
+1 -1
drivers/virt/fsl_hypervisor.c
··· 244 244 245 245 /* Get the physical addresses of the source buffer */ 246 246 num_pinned = get_user_pages_fast(param.local_vaddr - lb_offset, 247 - num_pages, param.source != -1, pages); 247 + num_pages, param.source != -1 ? FOLL_WRITE : 0, pages); 248 248 249 249 if (num_pinned != num_pages) { 250 250 /* get_user_pages() failed */
+8 -11
drivers/xen/gntdev.c
··· 526 526 struct gntdev_grant_map *map; 527 527 int ret = 0; 528 528 529 - if (range->blockable) 529 + if (mmu_notifier_range_blockable(range)) 530 530 mutex_lock(&priv->lock); 531 531 else if (!mutex_trylock(&priv->lock)) 532 532 return -EAGAIN; 533 533 534 534 list_for_each_entry(map, &priv->maps, next) { 535 535 ret = unmap_if_in_range(map, range->start, range->end, 536 - range->blockable); 536 + mmu_notifier_range_blockable(range)); 537 537 if (ret) 538 538 goto out_unlock; 539 539 } 540 540 list_for_each_entry(map, &priv->freeable_maps, next) { 541 541 ret = unmap_if_in_range(map, range->start, range->end, 542 - range->blockable); 542 + mmu_notifier_range_blockable(range)); 543 543 if (ret) 544 544 goto out_unlock; 545 545 } ··· 852 852 unsigned long xen_pfn; 853 853 int ret; 854 854 855 - ret = get_user_pages_fast(addr, 1, writeable, &page); 855 + ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page); 856 856 if (ret < 0) 857 857 return ret; 858 858 ··· 1084 1084 int index = vma->vm_pgoff; 1085 1085 int count = vma_pages(vma); 1086 1086 struct gntdev_grant_map *map; 1087 - int i, err = -EINVAL; 1087 + int err = -EINVAL; 1088 1088 1089 1089 if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) 1090 1090 return -EINVAL; ··· 1145 1145 goto out_put_map; 1146 1146 1147 1147 if (!use_ptemod) { 1148 - for (i = 0; i < count; i++) { 1149 - err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE, 1150 - map->pages[i]); 1151 - if (err) 1152 - goto out_put_map; 1153 - } 1148 + err = vm_map_pages(vma, map->pages, map->count); 1149 + if (err) 1150 + goto out_put_map; 1154 1151 } else { 1155 1152 #ifdef CONFIG_X86 1156 1153 /*
+2 -6
drivers/xen/privcmd-buf.c
··· 165 165 if (vma_priv->n_pages != count) 166 166 ret = -ENOMEM; 167 167 else 168 - for (i = 0; i < vma_priv->n_pages; i++) { 169 - ret = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE, 170 - vma_priv->pages[i]); 171 - if (ret) 172 - break; 173 - } 168 + ret = vm_map_pages_zero(vma, vma_priv->pages, 169 + vma_priv->n_pages); 174 170 175 171 if (ret) 176 172 privcmd_buf_vmapriv_free(vma_priv);
+3 -5
fs/dax.c
··· 814 814 goto unlock_pmd; 815 815 816 816 flush_cache_page(vma, address, pfn); 817 - pmd = pmdp_huge_clear_flush(vma, address, pmdp); 817 + pmd = pmdp_invalidate(vma, address, pmdp); 818 818 pmd = pmd_wrprotect(pmd); 819 819 pmd = pmd_mkclean(pmd); 820 820 set_pmd_at(vma->vm_mm, address, pmdp, pmd); ··· 1575 1575 } 1576 1576 1577 1577 trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); 1578 - result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, 1579 - write); 1578 + result = vmf_insert_pfn_pmd(vmf, pfn, write); 1580 1579 break; 1581 1580 case IOMAP_UNWRITTEN: 1582 1581 case IOMAP_HOLE: ··· 1685 1686 ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); 1686 1687 #ifdef CONFIG_FS_DAX_PMD 1687 1688 else if (order == PMD_ORDER) 1688 - ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, 1689 - pfn, true); 1689 + ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE); 1690 1690 #endif 1691 1691 else 1692 1692 ret = VM_FAULT_FALLBACK;
+11 -7
fs/hugetlbfs/inode.c
··· 440 440 u32 hash; 441 441 442 442 index = page->index; 443 - hash = hugetlb_fault_mutex_hash(h, current->mm, 444 - &pseudo_vma, 445 - mapping, index, 0); 443 + hash = hugetlb_fault_mutex_hash(h, mapping, index, 0); 446 444 mutex_lock(&hugetlb_fault_mutex_table[hash]); 447 445 448 446 /* ··· 497 499 struct resv_map *resv_map; 498 500 499 501 remove_inode_hugepages(inode, 0, LLONG_MAX); 500 - resv_map = (struct resv_map *)inode->i_mapping->private_data; 501 - /* root inode doesn't have the resv_map, so we should check it */ 502 + 503 + /* 504 + * Get the resv_map from the address space embedded in the inode. 505 + * This is the address space which points to any resv_map allocated 506 + * at inode creation time. If this is a device special inode, 507 + * i_mapping may not point to the original address space. 508 + */ 509 + resv_map = (struct resv_map *)(&inode->i_data)->private_data; 510 + /* Only regular and link inodes have associated reserve maps */ 502 511 if (resv_map) 503 512 resv_map_release(&resv_map->refs); 504 513 clear_inode(inode); ··· 644 639 addr = index * hpage_size; 645 640 646 641 /* mutex taken here, fault path and hole punch */ 647 - hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, 648 - index, addr); 642 + hash = hugetlb_fault_mutex_hash(h, mapping, index, addr); 649 643 mutex_lock(&hugetlb_fault_mutex_table[hash]); 650 644 651 645 /* See if already present in mapping to avoid alloc/free */
+3 -2
fs/io_uring.c
··· 2697 2697 2698 2698 ret = 0; 2699 2699 down_read(&current->mm->mmap_sem); 2700 - pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE, 2701 - pages, vmas); 2700 + pret = get_user_pages(ubuf, nr_pages, 2701 + FOLL_WRITE | FOLL_LONGTERM, 2702 + pages, vmas); 2702 2703 if (pret == nr_pages) { 2703 2704 /* don't support file backed memory */ 2704 2705 for (j = 0; j < nr_pages; j++) {
+4 -16
fs/ocfs2/dir.c
··· 69 69 #define NAMEI_RA_BLOCKS 4 70 70 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) 71 71 72 - static unsigned char ocfs2_filetype_table[] = { 73 - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 74 - }; 75 - 76 72 static int ocfs2_do_extend_dir(struct super_block *sb, 77 73 handle_t *handle, 78 74 struct inode *dir, ··· 1714 1718 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); 1715 1719 de = de1; 1716 1720 } 1717 - de->file_type = OCFS2_FT_UNKNOWN; 1721 + de->file_type = FT_UNKNOWN; 1718 1722 if (blkno) { 1719 1723 de->inode = cpu_to_le64(blkno); 1720 1724 ocfs2_set_de_type(de, inode->i_mode); ··· 1799 1803 } 1800 1804 offset += le16_to_cpu(de->rec_len); 1801 1805 if (le64_to_cpu(de->inode)) { 1802 - unsigned char d_type = DT_UNKNOWN; 1803 - 1804 - if (de->file_type < OCFS2_FT_MAX) 1805 - d_type = ocfs2_filetype_table[de->file_type]; 1806 - 1807 1806 if (!dir_emit(ctx, de->name, de->name_len, 1808 - le64_to_cpu(de->inode), d_type)) 1807 + le64_to_cpu(de->inode), 1808 + fs_ftype_to_dtype(de->file_type))) 1809 1809 goto out; 1810 1810 } 1811 1811 ctx->pos += le16_to_cpu(de->rec_len); ··· 1892 1900 break; 1893 1901 } 1894 1902 if (le64_to_cpu(de->inode)) { 1895 - unsigned char d_type = DT_UNKNOWN; 1896 - 1897 - if (de->file_type < OCFS2_FT_MAX) 1898 - d_type = ocfs2_filetype_table[de->file_type]; 1899 1903 if (!dir_emit(ctx, de->name, 1900 1904 de->name_len, 1901 1905 le64_to_cpu(de->inode), 1902 - d_type)) { 1906 + fs_ftype_to_dtype(de->file_type))) { 1903 1907 brelse(bh); 1904 1908 return 0; 1905 1909 }
+29 -1
fs/ocfs2/export.c
··· 148 148 u64 blkno; 149 149 struct dentry *parent; 150 150 struct inode *dir = d_inode(child); 151 + int set; 151 152 152 153 trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name, 153 154 (unsigned long long)OCFS2_I(dir)->ip_blkno); 155 + 156 + status = ocfs2_nfs_sync_lock(OCFS2_SB(dir->i_sb), 1); 157 + if (status < 0) { 158 + mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status); 159 + parent = ERR_PTR(status); 160 + goto bail; 161 + } 154 162 155 163 status = ocfs2_inode_lock(dir, NULL, 0); 156 164 if (status < 0) { 157 165 if (status != -ENOENT) 158 166 mlog_errno(status); 159 167 parent = ERR_PTR(status); 160 - goto bail; 168 + goto unlock_nfs_sync; 161 169 } 162 170 163 171 status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno); ··· 174 166 goto bail_unlock; 175 167 } 176 168 169 + status = ocfs2_test_inode_bit(OCFS2_SB(dir->i_sb), blkno, &set); 170 + if (status < 0) { 171 + if (status == -EINVAL) { 172 + status = -ESTALE; 173 + } else 174 + mlog(ML_ERROR, "test inode bit failed %d\n", status); 175 + parent = ERR_PTR(status); 176 + goto bail_unlock; 177 + } 178 + 179 + trace_ocfs2_get_dentry_test_bit(status, set); 180 + if (!set) { 181 + status = -ESTALE; 182 + parent = ERR_PTR(status); 183 + goto bail_unlock; 184 + } 185 + 177 186 parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); 178 187 179 188 bail_unlock: 180 189 ocfs2_inode_unlock(dir, 0); 190 + 191 + unlock_nfs_sync: 192 + ocfs2_nfs_sync_unlock(OCFS2_SB(dir->i_sb), 1); 181 193 182 194 bail: 183 195 trace_ocfs2_get_parent_end(parent);
+1 -27
fs/ocfs2/ocfs2_fs.h
··· 392 392 #define OCFS2_HB_GLOBAL "heartbeat=global" 393 393 394 394 /* 395 - * OCFS2 directory file types. Only the low 3 bits are used. The 396 - * other bits are reserved for now. 397 - */ 398 - #define OCFS2_FT_UNKNOWN 0 399 - #define OCFS2_FT_REG_FILE 1 400 - #define OCFS2_FT_DIR 2 401 - #define OCFS2_FT_CHRDEV 3 402 - #define OCFS2_FT_BLKDEV 4 403 - #define OCFS2_FT_FIFO 5 404 - #define OCFS2_FT_SOCK 6 405 - #define OCFS2_FT_SYMLINK 7 406 - 407 - #define OCFS2_FT_MAX 8 408 - 409 - /* 410 395 * OCFS2_DIR_PAD defines the directory entries boundaries 411 396 * 412 397 * NOTE: It must be a multiple of 4 ··· 408 423 #define OCFS2_DX_LINK_MAX ((1U << 31) - 1U) 409 424 #define OCFS2_LINKS_HI_SHIFT 16 410 425 #define OCFS2_DX_ENTRIES_MAX (0xffffffffU) 411 - 412 - #define S_SHIFT 12 413 - static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { 414 - [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE, 415 - [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR, 416 - [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV, 417 - [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV, 418 - [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO, 419 - [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK, 420 - [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK, 421 - }; 422 426 423 427 424 428 /* ··· 1603 1629 static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, 1604 1630 umode_t mode) 1605 1631 { 1606 - de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 1632 + de->file_type = fs_umode_to_ftype(mode); 1607 1633 } 1608 1634 1609 1635 static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
+1 -1
fs/orangefs/orangefs-bufmap.c
··· 269 269 270 270 /* map the pages */ 271 271 ret = get_user_pages_fast((unsigned long)user_desc->ptr, 272 - bufmap->page_count, 1, bufmap->page_array); 272 + bufmap->page_count, FOLL_WRITE, bufmap->page_array); 273 273 274 274 if (ret < 0) 275 275 return ret;
+2 -1
fs/proc/task_mmu.c
··· 1169 1169 break; 1170 1170 } 1171 1171 1172 - mmu_notifier_range_init(&range, mm, 0, -1UL); 1172 + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 1173 + 0, NULL, mm, 0, -1UL); 1173 1174 mmu_notifier_invalidate_range_start(&range); 1174 1175 } 1175 1176 walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
+15 -6
fs/sync.c
··· 292 292 } 293 293 294 294 if (flags & SYNC_FILE_RANGE_WRITE) { 295 + int sync_mode = WB_SYNC_NONE; 296 + 297 + if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) == 298 + SYNC_FILE_RANGE_WRITE_AND_WAIT) 299 + sync_mode = WB_SYNC_ALL; 300 + 295 301 ret = __filemap_fdatawrite_range(mapping, offset, endbyte, 296 - WB_SYNC_NONE); 302 + sync_mode); 297 303 if (ret < 0) 298 304 goto out; 299 305 } ··· 312 306 } 313 307 314 308 /* 315 - * sys_sync_file_range() permits finely controlled syncing over a segment of 309 + * ksys_sync_file_range() permits finely controlled syncing over a segment of 316 310 * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is 317 - * zero then sys_sync_file_range() will operate from offset out to EOF. 311 + * zero then ksys_sync_file_range() will operate from offset out to EOF. 318 312 * 319 313 * The flag bits are: 320 314 * ··· 331 325 * Useful combinations of the flag bits are: 332 326 * 333 327 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages 334 - * in the range which were dirty on entry to sys_sync_file_range() are placed 328 + * in the range which were dirty on entry to ksys_sync_file_range() are placed 335 329 * under writeout. This is a start-write-for-data-integrity operation. 336 330 * 337 331 * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which ··· 343 337 * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait 344 338 * for that operation to complete and to return the result. 345 339 * 346 - * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: 340 + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER 341 + * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT): 347 342 * a traditional sync() operation. This is a write-for-data-integrity operation 348 343 * which will ensure that all pages in the range which were dirty on entry to 349 - * sys_sync_file_range() are committed to disk. 344 + * ksys_sync_file_range() are written to disk. It should be noted that disk 345 + * caches are not flushed by this call, so there are no guarantees here that the 346 + * data will be available on disk after a crash. 350 347 * 351 348 * 352 349 * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
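With the write phase upgraded to WB_SYNC_ALL when both wait flags are present, the combined flag set behaves like a ranged fdatasync, except that the disk cache is still not flushed (as the updated comment spells out). A small userspace sketch; it spells the three flags out individually because the SYNC_FILE_RANGE_WRITE_AND_WAIT alias may not be present in older uapi headers:

    #define _GNU_SOURCE
    #include <fcntl.h>

    /* Write out dirty pages of [off, off + len) and wait for completion. */
    static int flush_range(int fd, off_t off, off_t len)
    {
            return sync_file_range(fd, off, len,
                                   SYNC_FILE_RANGE_WAIT_BEFORE |
                                   SYNC_FILE_RANGE_WRITE |
                                   SYNC_FILE_RANGE_WAIT_AFTER);
    }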
+5
fs/userfaultfd.c
··· 30 30 #include <linux/security.h> 31 31 #include <linux/hugetlb.h> 32 32 33 + int sysctl_unprivileged_userfaultfd __read_mostly = 1; 34 + 33 35 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; 34 36 35 37 enum userfaultfd_state { ··· 1931 1929 { 1932 1930 struct userfaultfd_ctx *ctx; 1933 1931 int fd; 1932 + 1933 + if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE)) 1934 + return -EPERM; 1934 1935 1935 1936 BUG_ON(!current->mm); 1936 1937
+7
include/asm-generic/hugetlb.h
··· 126 126 } 127 127 #endif 128 128 129 + #ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED 130 + static inline bool gigantic_page_runtime_supported(void) 131 + { 132 + return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE); 133 + } 134 + #endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */ 135 + 129 136 #endif /* _ASM_GENERIC_HUGETLB_H */
-15
include/linux/balloon_compaction.h
··· 151 151 list_del(&page->lru); 152 152 } 153 153 154 - static inline bool __is_movable_balloon_page(struct page *page) 155 - { 156 - return false; 157 - } 158 - 159 - static inline bool balloon_page_movable(struct page *page) 160 - { 161 - return false; 162 - } 163 - 164 - static inline bool isolated_balloon_page(struct page *page) 165 - { 166 - return false; 167 - } 168 - 169 154 static inline bool balloon_page_isolate(struct page *page) 170 155 { 171 156 return false;
+2 -2
include/linux/gfp.h
··· 585 585 } 586 586 #endif /* CONFIG_PM_SLEEP */ 587 587 588 - #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) 588 + #ifdef CONFIG_CONTIG_ALLOC 589 589 /* The below functions must be run on a range from a single zone. */ 590 590 extern int alloc_contig_range(unsigned long start, unsigned long end, 591 591 unsigned migratetype, gfp_t gfp_mask); 592 - extern void free_contig_range(unsigned long pfn, unsigned nr_pages); 593 592 #endif 593 + void free_contig_range(unsigned long pfn, unsigned int nr_pages); 594 594 595 595 #ifdef CONFIG_CMA 596 596 /* CMA stuff */
+255 -65
include/linux/hmm.h
··· 77 77 #include <linux/migrate.h> 78 78 #include <linux/memremap.h> 79 79 #include <linux/completion.h> 80 + #include <linux/mmu_notifier.h> 80 81 81 - struct hmm; 82 + 83 + /* 84 + * struct hmm - HMM per mm struct 85 + * 86 + * @mm: mm struct this HMM struct is bound to 87 + * @lock: lock protecting ranges list 88 + * @ranges: list of range being snapshotted 89 + * @mirrors: list of mirrors for this mm 90 + * @mmu_notifier: mmu notifier to track updates to CPU page table 91 + * @mirrors_sem: read/write semaphore protecting the mirrors list 92 + * @wq: wait queue for user waiting on a range invalidation 93 + * @notifiers: count of active mmu notifiers 94 + * @dead: is the mm dead ? 95 + */ 96 + struct hmm { 97 + struct mm_struct *mm; 98 + struct kref kref; 99 + struct mutex lock; 100 + struct list_head ranges; 101 + struct list_head mirrors; 102 + struct mmu_notifier mmu_notifier; 103 + struct rw_semaphore mirrors_sem; 104 + wait_queue_head_t wq; 105 + long notifiers; 106 + bool dead; 107 + }; 82 108 83 109 /* 84 110 * hmm_pfn_flag_e - HMM flag enums ··· 157 131 /* 158 132 * struct hmm_range - track invalidation lock on virtual address range 159 133 * 134 + * @hmm: the core HMM structure this range is active against 160 135 * @vma: the vm area struct for the range 161 136 * @list: all range lock are on a list 162 137 * @start: range virtual start address (inclusive) ··· 165 138 * @pfns: array of pfns (big enough for the range) 166 139 * @flags: pfn flags to match device driver page table 167 140 * @values: pfn value for some special case (none, special, error, ...) 141 + * @default_flags: default flags for the range (write, read, ... see hmm doc) 142 + * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter 168 143 * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) 169 144 * @valid: pfns array did not change since it has been fill by an HMM function 170 145 */ 171 146 struct hmm_range { 147 + struct hmm *hmm; 172 148 struct vm_area_struct *vma; 173 149 struct list_head list; 174 150 unsigned long start; ··· 179 149 uint64_t *pfns; 180 150 const uint64_t *flags; 181 151 const uint64_t *values; 152 + uint64_t default_flags; 153 + uint64_t pfn_flags_mask; 154 + uint8_t page_shift; 182 155 uint8_t pfn_shift; 183 156 bool valid; 184 157 }; 185 158 186 159 /* 187 - * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn 188 - * @range: range use to decode HMM pfn value 189 - * @pfn: HMM pfn value to get corresponding struct page from 190 - * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise 191 - * 192 - * If the HMM pfn is valid (ie valid flag set) then return the struct page 193 - * matching the pfn value stored in the HMM pfn. Otherwise return NULL. 
160 + * hmm_range_page_shift() - return the page shift for the range 161 + * @range: range being queried 162 + * Returns: page shift (page size = 1 << page shift) for the range 194 163 */ 195 - static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, 196 - uint64_t pfn) 164 + static inline unsigned hmm_range_page_shift(const struct hmm_range *range) 197 165 { 198 - if (pfn == range->values[HMM_PFN_NONE]) 199 - return NULL; 200 - if (pfn == range->values[HMM_PFN_ERROR]) 201 - return NULL; 202 - if (pfn == range->values[HMM_PFN_SPECIAL]) 203 - return NULL; 204 - if (!(pfn & range->flags[HMM_PFN_VALID])) 205 - return NULL; 206 - return pfn_to_page(pfn >> range->pfn_shift); 166 + return range->page_shift; 207 167 } 208 168 209 169 /* 210 - * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn 211 - * @range: range use to decode HMM pfn value 212 - * @pfn: HMM pfn value to extract pfn from 213 - * Returns: pfn value if HMM pfn is valid, -1UL otherwise 170 + * hmm_range_page_size() - return the page size for the range 171 + * @range: range being queried 172 + * Returns: page size for the range in bytes 214 173 */ 215 - static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, 216 - uint64_t pfn) 174 + static inline unsigned long hmm_range_page_size(const struct hmm_range *range) 175 + { 176 + return 1UL << hmm_range_page_shift(range); 177 + } 178 + 179 + /* 180 + * hmm_range_wait_until_valid() - wait for range to be valid 181 + * @range: range affected by invalidation to wait on 182 + * @timeout: time out for wait in ms (ie abort wait after that period of time) 183 + * Returns: true if the range is valid, false otherwise. 184 + */ 185 + static inline bool hmm_range_wait_until_valid(struct hmm_range *range, 186 + unsigned long timeout) 187 + { 188 + /* Check if mm is dead ? */ 189 + if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) { 190 + range->valid = false; 191 + return false; 192 + } 193 + if (range->valid) 194 + return true; 195 + wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead, 196 + msecs_to_jiffies(timeout)); 197 + /* Return current valid status just in case we get lucky */ 198 + return range->valid; 199 + } 200 + 201 + /* 202 + * hmm_range_valid() - test if a range is valid or not 203 + * @range: range 204 + * Returns: true if the range is valid, false otherwise. 205 + */ 206 + static inline bool hmm_range_valid(struct hmm_range *range) 207 + { 208 + return range->valid; 209 + } 210 + 211 + /* 212 + * hmm_device_entry_to_page() - return struct page pointed to by a device entry 213 + * @range: range use to decode device entry value 214 + * @entry: device entry value to get corresponding struct page from 215 + * Returns: struct page pointer if entry is a valid, NULL otherwise 216 + * 217 + * If the device entry is valid (ie valid flag set) then return the struct page 218 + * matching the entry value. Otherwise return NULL. 
219 + */ 220 + static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range, 221 + uint64_t entry) 222 + { 223 + if (entry == range->values[HMM_PFN_NONE]) 224 + return NULL; 225 + if (entry == range->values[HMM_PFN_ERROR]) 226 + return NULL; 227 + if (entry == range->values[HMM_PFN_SPECIAL]) 228 + return NULL; 229 + if (!(entry & range->flags[HMM_PFN_VALID])) 230 + return NULL; 231 + return pfn_to_page(entry >> range->pfn_shift); 232 + } 233 + 234 + /* 235 + * hmm_device_entry_to_pfn() - return pfn value store in a device entry 236 + * @range: range use to decode device entry value 237 + * @entry: device entry to extract pfn from 238 + * Returns: pfn value if device entry is valid, -1UL otherwise 239 + */ 240 + static inline unsigned long 241 + hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn) 217 242 { 218 243 if (pfn == range->values[HMM_PFN_NONE]) 219 244 return -1UL; ··· 282 197 } 283 198 284 199 /* 285 - * hmm_pfn_from_page() - create a valid HMM pfn value from struct page 200 + * hmm_device_entry_from_page() - create a valid device entry for a page 286 201 * @range: range use to encode HMM pfn value 287 - * @page: struct page pointer for which to create the HMM pfn 288 - * Returns: valid HMM pfn for the page 202 + * @page: page for which to create the device entry 203 + * Returns: valid device entry for the page 289 204 */ 290 - static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, 291 - struct page *page) 205 + static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range, 206 + struct page *page) 292 207 { 293 208 return (page_to_pfn(page) << range->pfn_shift) | 294 209 range->flags[HMM_PFN_VALID]; 295 210 } 296 211 297 212 /* 298 - * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn 213 + * hmm_device_entry_from_pfn() - create a valid device entry value from pfn 299 214 * @range: range use to encode HMM pfn value 300 - * @pfn: pfn value for which to create the HMM pfn 301 - * Returns: valid HMM pfn for the pfn 215 + * @pfn: pfn value for which to create the device entry 216 + * Returns: valid device entry for the pfn 302 217 */ 303 - static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, 304 - unsigned long pfn) 218 + static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, 219 + unsigned long pfn) 305 220 { 306 221 return (pfn << range->pfn_shift) | 307 222 range->flags[HMM_PFN_VALID]; 308 223 } 224 + 225 + /* 226 + * Old API: 227 + * hmm_pfn_to_page() 228 + * hmm_pfn_to_pfn() 229 + * hmm_pfn_from_page() 230 + * hmm_pfn_from_pfn() 231 + * 232 + * This are the OLD API please use new API, it is here to avoid cross-tree 233 + * merge painfullness ie we convert things to new API in stages. 
234 + */ 235 + static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, 236 + uint64_t pfn) 237 + { 238 + return hmm_device_entry_to_page(range, pfn); 239 + } 240 + 241 + static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, 242 + uint64_t pfn) 243 + { 244 + return hmm_device_entry_to_pfn(range, pfn); 245 + } 246 + 247 + static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, 248 + struct page *page) 249 + { 250 + return hmm_device_entry_from_page(range, page); 251 + } 252 + 253 + static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, 254 + unsigned long pfn) 255 + { 256 + return hmm_device_entry_from_pfn(range, pfn); 257 + } 258 + 309 259 310 260 311 261 #if IS_ENABLED(CONFIG_HMM_MIRROR) ··· 473 353 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); 474 354 void hmm_mirror_unregister(struct hmm_mirror *mirror); 475 355 356 + /* 357 + * hmm_mirror_mm_is_alive() - test if mm is still alive 358 + * @mirror: the HMM mm mirror for which we want to lock the mmap_sem 359 + * Returns: false if the mm is dead, true otherwise 360 + * 361 + * This is an optimization it will not accurately always return -EINVAL if the 362 + * mm is dead ie there can be false negative (process is being kill but HMM is 363 + * not yet inform of that). It is only intented to be use to optimize out case 364 + * where driver is about to do something time consuming and it would be better 365 + * to skip it if the mm is dead. 366 + */ 367 + static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) 368 + { 369 + struct mm_struct *mm; 370 + 371 + if (!mirror || !mirror->hmm) 372 + return false; 373 + mm = READ_ONCE(mirror->hmm->mm); 374 + if (mirror->hmm->dead || !mm) 375 + return false; 376 + 377 + return true; 378 + } 379 + 476 380 477 381 /* 478 - * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device 479 - * driver lock that serializes device page table updates, then call 480 - * hmm_vma_range_done(), to check if the snapshot is still valid. The same 481 - * device driver page table update lock must also be used in the 482 - * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page 483 - * table invalidation serializes on it. 484 - * 485 - * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL 486 - * hmm_vma_get_pfns() WITHOUT ERROR ! 487 - * 488 - * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! 382 + * Please see Documentation/vm/hmm.rst for how to use the range API. 489 383 */ 490 - int hmm_vma_get_pfns(struct hmm_range *range); 491 - bool hmm_vma_range_done(struct hmm_range *range); 492 - 384 + int hmm_range_register(struct hmm_range *range, 385 + struct mm_struct *mm, 386 + unsigned long start, 387 + unsigned long end, 388 + unsigned page_shift); 389 + void hmm_range_unregister(struct hmm_range *range); 390 + long hmm_range_snapshot(struct hmm_range *range); 391 + long hmm_range_fault(struct hmm_range *range, bool block); 392 + long hmm_range_dma_map(struct hmm_range *range, 393 + struct device *device, 394 + dma_addr_t *daddrs, 395 + bool block); 396 + long hmm_range_dma_unmap(struct hmm_range *range, 397 + struct vm_area_struct *vma, 398 + struct device *device, 399 + dma_addr_t *daddrs, 400 + bool dirty); 493 401 494 402 /* 495 - * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will 496 - * not migrate any device memory back to system memory. 
The HMM pfn array will 497 - * be updated with the fault result and current snapshot of the CPU page table 498 - * for the range. 403 + * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range 499 404 * 500 - * The mmap_sem must be taken in read mode before entering and it might be 501 - * dropped by the function if the block argument is false. In that case, the 502 - * function returns -EAGAIN. 503 - * 504 - * Return value does not reflect if the fault was successful for every single 505 - * address or not. Therefore, the caller must to inspect the HMM pfn array to 506 - * determine fault status for each address. 507 - * 508 - * Trying to fault inside an invalid vma will result in -EINVAL. 509 - * 510 - * See the function description in mm/hmm.c for further documentation. 405 + * When waiting for mmu notifiers we need some kind of time out otherwise we 406 + * could potentialy wait for ever, 1000ms ie 1s sounds like a long time to 407 + * wait already. 511 408 */ 512 - int hmm_vma_fault(struct hmm_range *range, bool block); 409 + #define HMM_RANGE_DEFAULT_TIMEOUT 1000 410 + 411 + /* This is a temporary helper to avoid merge conflict between trees. */ 412 + static inline bool hmm_vma_range_done(struct hmm_range *range) 413 + { 414 + bool ret = hmm_range_valid(range); 415 + 416 + hmm_range_unregister(range); 417 + return ret; 418 + } 419 + 420 + /* This is a temporary helper to avoid merge conflict between trees. */ 421 + static inline int hmm_vma_fault(struct hmm_range *range, bool block) 422 + { 423 + long ret; 424 + 425 + /* 426 + * With the old API the driver must set each individual entries with 427 + * the requested flags (valid, write, ...). So here we set the mask to 428 + * keep intact the entries provided by the driver and zero out the 429 + * default_flags. 430 + */ 431 + range->default_flags = 0; 432 + range->pfn_flags_mask = -1UL; 433 + 434 + ret = hmm_range_register(range, range->vma->vm_mm, 435 + range->start, range->end, 436 + PAGE_SHIFT); 437 + if (ret) 438 + return (int)ret; 439 + 440 + if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { 441 + /* 442 + * The mmap_sem was taken by driver we release it here and 443 + * returns -EAGAIN which correspond to mmap_sem have been 444 + * drop in the old API. 445 + */ 446 + up_read(&range->vma->vm_mm->mmap_sem); 447 + return -EAGAIN; 448 + } 449 + 450 + ret = hmm_range_fault(range, block); 451 + if (ret <= 0) { 452 + if (ret == -EBUSY || !ret) { 453 + /* Same as above drop mmap_sem to match old API. */ 454 + up_read(&range->vma->vm_mm->mmap_sem); 455 + ret = -EBUSY; 456 + } else if (ret == -EAGAIN) 457 + ret = -EBUSY; 458 + hmm_range_unregister(range); 459 + return ret; 460 + } 461 + return 0; 462 + } 513 463 514 464 /* Below are for HMM internal use only! Not to be used by device driver! */ 515 465 void hmm_mm_destroy(struct mm_struct *mm);
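The driver-facing flow changes from hmm_vma_get_pfns()/hmm_vma_fault() to register, wait for the range to become valid, fault or snapshot, then unregister, as the temporary compatibility helpers above illustrate. A condensed sketch of the new flow, modelled on those helpers and on the usage described in Documentation/vm/hmm.rst; mirror setup, pfns allocation and the device page table update are elided, and the flag choice (fault everything with at least read permission) is only one possible policy:

    #include <linux/hmm.h>

    /* Caller holds mmap_sem for read and has filled range->start/end/pfns/flags/values. */
    static long mirror_fault_range(struct hmm_range *range, struct mm_struct *mm)
    {
            long ret;

            range->default_flags = range->flags[HMM_PFN_VALID];
            range->pfn_flags_mask = 0;      /* only default_flags matter */

            ret = hmm_range_register(range, mm, range->start, range->end, PAGE_SHIFT);
            if (ret)
                    return ret;

            if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
                    hmm_range_unregister(range);
                    return -EBUSY;
            }

            ret = hmm_range_fault(range, true);
            /* On success, decode entries with hmm_device_entry_to_page()/..._to_pfn(). */
            hmm_range_unregister(range);
            return ret;
    }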
+2 -4
include/linux/huge_mm.h
··· 47 47 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 48 48 unsigned long addr, pgprot_t newprot, 49 49 int prot_numa); 50 - vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 51 - pmd_t *pmd, pfn_t pfn, bool write); 52 - vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, 53 - pud_t *pud, pfn_t pfn, bool write); 50 + vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); 51 + vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); 54 52 enum transparent_hugepage_flag { 55 53 TRANSPARENT_HUGEPAGE_FLAG, 56 54 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
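vmf_insert_pfn_pmd() and vmf_insert_pfn_pud() now take the struct vm_fault instead of separate vma/address/pmd arguments, all of which are already carried in vmf. A hedged sketch of a huge fault handler using the new form; the pfn is assumed to have been resolved elsewhere:

    #include <linux/huge_mm.h>
    #include <linux/mm.h>

    static vm_fault_t my_huge_fault(struct vm_fault *vmf, pfn_t pfn)
    {
            bool write = vmf->flags & FAULT_FLAG_WRITE;

            /* Previously: vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, write) */
            return vmf_insert_pfn_pmd(vmf, pfn, write);
    }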
+1 -3
include/linux/hugetlb.h
··· 123 123 void free_huge_page(struct page *page); 124 124 void hugetlb_fix_reserve_counts(struct inode *inode); 125 125 extern struct mutex *hugetlb_fault_mutex_table; 126 - u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 127 - struct vm_area_struct *vma, 128 - struct address_space *mapping, 126 + u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, 129 127 pgoff_t idx, unsigned long address); 130 128 131 129 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
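hugetlb_fault_mutex_hash() drops the mm and pseudo-vma arguments; the key is now just the mapping, index and address, so the fault path and the hole-punch/fallocate paths in hugetlbfs above hash to the same mutex. Converted call pattern, as a sketch with an invented wrapper name:

    #include <linux/hugetlb.h>

    /* Serialise against concurrent faults / truncation on the same index. */
    static void with_hugetlb_fault_mutex(struct hstate *h,
                                         struct address_space *mapping,
                                         pgoff_t idx, unsigned long addr)
    {
            u32 hash = hugetlb_fault_mutex_hash(h, mapping, idx, addr);

            mutex_lock(&hugetlb_fault_mutex_table[hash]);
            /* ... fault or remove_inode_hugepages() style work ... */
            mutex_unlock(&hugetlb_fault_mutex_table[hash]);
    }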
+18
include/linux/list.h
··· 271 271 } 272 272 273 273 /** 274 + * list_rotate_to_front() - Rotate list to specific item. 275 + * @list: The desired new front of the list. 276 + * @head: The head of the list. 277 + * 278 + * Rotates list so that @list becomes the new front of the list. 279 + */ 280 + static inline void list_rotate_to_front(struct list_head *list, 281 + struct list_head *head) 282 + { 283 + /* 284 + * Deletes the list head from the list denoted by @head and 285 + * places it as the tail of @list, this effectively rotates the 286 + * list so that @list is at the front. 287 + */ 288 + list_move_tail(head, list); 289 + } 290 + 291 + /** 274 292 * list_is_singular - tests whether a list has just one entry. 275 293 * @head: the list to test. 276 294 */
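list_rotate_to_front() is a thin wrapper around list_move_tail() that rotates a list so a chosen entry becomes the new first element, without walking the list. A toy usage sketch with invented types:

    #include <linux/list.h>

    struct job {
            struct list_head link;
            int id;
    };

    /* Rotate @queue so that @j is the first job iterated. */
    static void bump_job(struct list_head *queue, struct job *j)
    {
            list_rotate_to_front(&j->link, queue);
    }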
+43 -1
include/linux/memblock.h
··· 96 96 extern struct memblock memblock; 97 97 extern int memblock_debug; 98 98 99 - #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK 99 + #ifndef CONFIG_ARCH_KEEP_MEMBLOCK 100 100 #define __init_memblock __meminit 101 101 #define __initdata_memblock __meminitdata 102 102 void memblock_discard(void); 103 103 #else 104 104 #define __init_memblock 105 105 #define __initdata_memblock 106 + static inline void memblock_discard(void) {} 106 107 #endif 107 108 108 109 #define memblock_dbg(fmt, ...) \ ··· 240 239 for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ 241 240 i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) 242 241 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 242 + 243 + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 244 + void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, 245 + unsigned long *out_spfn, 246 + unsigned long *out_epfn); 247 + /** 248 + * for_each_free_mem_range_in_zone - iterate through zone specific free 249 + * memblock areas 250 + * @i: u64 used as loop variable 251 + * @zone: zone in which all of the memory blocks reside 252 + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 253 + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 254 + * 255 + * Walks over free (memory && !reserved) areas of memblock in a specific 256 + * zone. Available once memblock and an empty zone is initialized. The main 257 + * assumption is that the zone start, end, and pgdat have been associated. 258 + * This way we can use the zone to determine NUMA node, and if a given part 259 + * of the memblock is valid for the zone. 260 + */ 261 + #define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end) \ 262 + for (i = 0, \ 263 + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \ 264 + i != U64_MAX; \ 265 + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) 266 + 267 + /** 268 + * for_each_free_mem_range_in_zone_from - iterate through zone specific 269 + * free memblock areas from a given point 270 + * @i: u64 used as loop variable 271 + * @zone: zone in which all of the memory blocks reside 272 + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 273 + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 274 + * 275 + * Walks over free (memory && !reserved) areas of memblock in a specific 276 + * zone, continuing from current position. Available as soon as memblock is 277 + * initialized. 278 + */ 279 + #define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \ 280 + for (; i != U64_MAX; \ 281 + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) 282 + #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 243 283 244 284 /** 245 285 * for_each_free_mem_range - iterate through free memblock areas
+6 -28
include/linux/memcontrol.h
··· 501 501 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 502 502 int zid, int nr_pages); 503 503 504 - unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 505 - int nid, unsigned int lru_mask); 506 - 507 - static inline 508 - unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 509 - { 510 - struct mem_cgroup_per_node *mz; 511 - unsigned long nr_pages = 0; 512 - int zid; 513 - 514 - mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); 515 - for (zid = 0; zid < MAX_NR_ZONES; zid++) 516 - nr_pages += mz->lru_zone_size[zid][lru]; 517 - return nr_pages; 518 - } 519 - 520 504 static inline 521 505 unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, 522 506 enum lru_list lru, int zone_idx) ··· 944 960 return true; 945 961 } 946 962 947 - static inline unsigned long 948 - mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 949 - { 950 - return 0; 951 - } 952 963 static inline 953 964 unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, 954 965 enum lru_list lru, int zone_idx) 955 - { 956 - return 0; 957 - } 958 - 959 - static inline unsigned long 960 - mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 961 - int nid, unsigned int lru_mask) 962 966 { 963 967 return 0; 964 968 } ··· 1086 1114 static inline void count_memcg_events(struct mem_cgroup *memcg, 1087 1115 enum vm_event_item idx, 1088 1116 unsigned long count) 1117 + { 1118 + } 1119 + 1120 + static inline void __count_memcg_events(struct mem_cgroup *memcg, 1121 + enum vm_event_item idx, 1122 + unsigned long count) 1089 1123 { 1090 1124 } 1091 1125
+1 -1
include/linux/memory.h
··· 113 113 extern void unregister_memory_isolate_notifier(struct notifier_block *nb); 114 114 int hotplug_memory_register(int nid, struct mem_section *section); 115 115 #ifdef CONFIG_MEMORY_HOTREMOVE 116 - extern int unregister_memory_section(struct mem_section *); 116 + extern void unregister_memory_section(struct mem_section *); 117 117 #endif 118 118 extern int memory_dev_init(void); 119 119 extern int memory_notify(unsigned long val, void *v);
+30 -12
include/linux/memory_hotplug.h
··· 54 54 }; 55 55 56 56 /* 57 + * Restrictions for the memory hotplug: 58 + * flags: MHP_ flags 59 + * altmap: alternative allocator for memmap array 60 + */ 61 + struct mhp_restrictions { 62 + unsigned long flags; 63 + struct vmem_altmap *altmap; 64 + }; 65 + 66 + /* 57 67 * Zone resizing functions 58 68 * 59 69 * Note: any attempt to resize a zone should has pgdat_resize_lock() ··· 97 87 extern int online_pages(unsigned long, unsigned long, int); 98 88 extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, 99 89 unsigned long *valid_start, unsigned long *valid_end); 100 - extern void __offline_isolated_pages(unsigned long, unsigned long); 90 + extern unsigned long __offline_isolated_pages(unsigned long start_pfn, 91 + unsigned long end_pfn); 101 92 102 93 typedef void (*online_page_callback_t)(struct page *page, unsigned int order); 103 94 ··· 111 100 112 101 extern int try_online_node(int nid); 113 102 103 + extern int arch_add_memory(int nid, u64 start, u64 size, 104 + struct mhp_restrictions *restrictions); 114 105 extern u64 max_mem_size; 115 106 116 107 extern bool memhp_auto_online; ··· 124 111 } 125 112 126 113 #ifdef CONFIG_MEMORY_HOTREMOVE 127 - extern int arch_remove_memory(int nid, u64 start, u64 size, 128 - struct vmem_altmap *altmap); 129 - extern int __remove_pages(struct zone *zone, unsigned long start_pfn, 130 - unsigned long nr_pages, struct vmem_altmap *altmap); 114 + extern void arch_remove_memory(int nid, u64 start, u64 size, 115 + struct vmem_altmap *altmap); 116 + extern void __remove_pages(struct zone *zone, unsigned long start_pfn, 117 + unsigned long nr_pages, struct vmem_altmap *altmap); 131 118 #endif /* CONFIG_MEMORY_HOTREMOVE */ 119 + 120 + /* 121 + * Do we want sysfs memblock files created. This will allow userspace to online 122 + * and offline memory explicitly. Lack of this bit means that the caller has to 123 + * call move_pfn_range_to_zone to finish the initialization. 124 + */ 125 + 126 + #define MHP_MEMBLOCK_API (1<<0) 132 127 133 128 /* reasonably generic interface to expand the physical pages */ 134 129 extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, 135 - struct vmem_altmap *altmap, bool want_memblock); 130 + struct mhp_restrictions *restrictions); 136 131 137 132 #ifndef CONFIG_ARCH_HAS_ADD_PAGES 138 133 static inline int add_pages(int nid, unsigned long start_pfn, 139 - unsigned long nr_pages, struct vmem_altmap *altmap, 140 - bool want_memblock) 134 + unsigned long nr_pages, struct mhp_restrictions *restrictions) 141 135 { 142 - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); 136 + return __add_pages(nid, start_pfn, nr_pages, restrictions); 143 137 } 144 138 #else /* ARCH_HAS_ADD_PAGES */ 145 139 int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, 146 - struct vmem_altmap *altmap, bool want_memblock); 140 + struct mhp_restrictions *restrictions); 147 141 #endif /* ARCH_HAS_ADD_PAGES */ 148 142 149 143 #ifdef CONFIG_NUMA ··· 351 331 extern int __add_memory(int nid, u64 start, u64 size); 352 332 extern int add_memory(int nid, u64 start, u64 size); 353 333 extern int add_memory_resource(int nid, struct resource *resource); 354 - extern int arch_add_memory(int nid, u64 start, u64 size, 355 - struct vmem_altmap *altmap, bool want_memblock); 356 334 extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, 357 335 unsigned long nr_pages, struct vmem_altmap *altmap); 358 336 extern bool is_memblock_offlined(struct memory_block *mem);
+96 -18
include/linux/mm.h
··· 124 124 125 125 /* 126 126 * On some architectures it is expensive to call memset() for small sizes. 127 - * Those architectures should provide their own implementation of "struct page" 128 - * zeroing by defining this macro in <asm/pgtable.h>. 127 + * If an architecture decides to implement their own version of 128 + * mm_zero_struct_page they should wrap the defines below in a #ifndef and 129 + * define their own version of this macro in <asm/pgtable.h> 129 130 */ 130 - #ifndef mm_zero_struct_page 131 + #if BITS_PER_LONG == 64 132 + /* This function must be updated when the size of struct page grows above 80 133 + * or reduces below 56. The idea that compiler optimizes out switch() 134 + * statement, and only leaves move/store instructions. Also the compiler can 135 + * combine write statments if they are both assignments and can be reordered, 136 + * this can result in several of the writes here being dropped. 137 + */ 138 + #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) 139 + static inline void __mm_zero_struct_page(struct page *page) 140 + { 141 + unsigned long *_pp = (void *)page; 142 + 143 + /* Check that struct page is either 56, 64, 72, or 80 bytes */ 144 + BUILD_BUG_ON(sizeof(struct page) & 7); 145 + BUILD_BUG_ON(sizeof(struct page) < 56); 146 + BUILD_BUG_ON(sizeof(struct page) > 80); 147 + 148 + switch (sizeof(struct page)) { 149 + case 80: 150 + _pp[9] = 0; /* fallthrough */ 151 + case 72: 152 + _pp[8] = 0; /* fallthrough */ 153 + case 64: 154 + _pp[7] = 0; /* fallthrough */ 155 + case 56: 156 + _pp[6] = 0; 157 + _pp[5] = 0; 158 + _pp[4] = 0; 159 + _pp[3] = 0; 160 + _pp[2] = 0; 161 + _pp[1] = 0; 162 + _pp[0] = 0; 163 + } 164 + } 165 + #else 131 166 #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) 132 167 #endif 133 168 ··· 1042 1007 __put_page(page); 1043 1008 } 1044 1009 1010 + /** 1011 + * put_user_page() - release a gup-pinned page 1012 + * @page: pointer to page to be released 1013 + * 1014 + * Pages that were pinned via get_user_pages*() must be released via 1015 + * either put_user_page(), or one of the put_user_pages*() routines 1016 + * below. This is so that eventually, pages that are pinned via 1017 + * get_user_pages*() can be separately tracked and uniquely handled. In 1018 + * particular, interactions with RDMA and filesystems need special 1019 + * handling. 1020 + * 1021 + * put_user_page() and put_page() are not interchangeable, despite this early 1022 + * implementation that makes them look the same. put_user_page() calls must 1023 + * be perfectly matched up with get_user_page() calls. 
1024 + */ 1025 + static inline void put_user_page(struct page *page) 1026 + { 1027 + put_page(page); 1028 + } 1029 + 1030 + void put_user_pages_dirty(struct page **pages, unsigned long npages); 1031 + void put_user_pages_dirty_lock(struct page **pages, unsigned long npages); 1032 + void put_user_pages(struct page **pages, unsigned long npages); 1033 + 1045 1034 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 1046 1035 #define SECTION_IN_PAGE_FLAGS 1047 1036 #endif ··· 1564 1505 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, 1565 1506 struct page **pages, unsigned int gup_flags); 1566 1507 1567 - #if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA) 1568 - long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, 1569 - unsigned int gup_flags, struct page **pages, 1570 - struct vm_area_struct **vmas); 1571 - #else 1572 - static inline long get_user_pages_longterm(unsigned long start, 1573 - unsigned long nr_pages, unsigned int gup_flags, 1574 - struct page **pages, struct vm_area_struct **vmas) 1575 - { 1576 - return get_user_pages(start, nr_pages, gup_flags, pages, vmas); 1577 - } 1578 - #endif /* CONFIG_FS_DAX */ 1579 - 1580 - int get_user_pages_fast(unsigned long start, int nr_pages, int write, 1581 - struct page **pages); 1508 + int get_user_pages_fast(unsigned long start, int nr_pages, 1509 + unsigned int gup_flags, struct page **pages); 1582 1510 1583 1511 /* Container for pinned pfns / pages */ 1584 1512 struct frame_vector { ··· 2579 2533 int remap_pfn_range(struct vm_area_struct *, unsigned long addr, 2580 2534 unsigned long pfn, unsigned long size, pgprot_t); 2581 2535 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); 2536 + int vm_map_pages(struct vm_area_struct *vma, struct page **pages, 2537 + unsigned long num); 2538 + int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, 2539 + unsigned long num); 2582 2540 vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 2583 2541 unsigned long pfn); 2584 2542 vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, ··· 2633 2583 #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ 2634 2584 #define FOLL_COW 0x4000 /* internal GUP flag */ 2635 2585 #define FOLL_ANON 0x8000 /* don't do file mappings */ 2586 + #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ 2587 + 2588 + /* 2589 + * NOTE on FOLL_LONGTERM: 2590 + * 2591 + * FOLL_LONGTERM indicates that the page will be held for an indefinite time 2592 + * period _often_ under userspace control. This is contrasted with 2593 + * iov_iter_get_pages() where usages which are transient. 2594 + * 2595 + * FIXME: For pages which are part of a filesystem, mappings are subject to the 2596 + * lifetime enforced by the filesystem and we need guarantees that longterm 2597 + * users like RDMA and V4L2 only establish mappings which coordinate usage with 2598 + * the filesystem. Ideas for this coordination include revoking the longterm 2599 + * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was 2600 + * added after the problem with filesystems was found FS DAX VMAs are 2601 + * specifically failed. Filesystem pages are still subject to bugs and use of 2602 + * FOLL_LONGTERM should be avoided on those pages. 2603 + * 2604 + * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. 
2605 + * Currently only get_user_pages() and get_user_pages_fast() support this flag 2606 + * and calls to get_user_pages_[un]locked are specifically not allowed. This 2607 + * is due to an incompatibility with the FS DAX check and 2608 + * FAULT_FLAG_ALLOW_RETRY 2609 + * 2610 + * In the CMA case: longterm pins in a CMA region would unnecessarily fragment 2611 + * that region. And so CMA attempts to migrate the page before pinning when 2612 + * FOLL_LONGTERM is specified. 2613 + */ 2636 2614 2637 2615 static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) 2638 2616 {
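The put_user_page*() helpers give gup-pinned pages a dedicated release path so they can eventually be tracked separately from ordinary put_page() references; for now they are wrappers around put_page(), as the kernel-doc above says. A sketch of the intended pairing with get_user_pages*(), names invented:

    #include <linux/mm.h>

    /* Release pages obtained from get_user_pages*(). */
    static void release_pinned(struct page **pages, unsigned long npages, bool dirtied)
    {
            if (dirtied)
                    put_user_pages_dirty_lock(pages, npages);
            else
                    put_user_pages(pages, npages);
    }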
+1 -1
include/linux/mm_inline.h
··· 29 29 { 30 30 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 31 31 32 - __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages); 32 + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); 33 33 __mod_zone_page_state(&pgdat->node_zones[zid], 34 34 NR_ZONE_LRU_BASE + lru, nr_pages); 35 35 }
+1 -1
include/linux/mm_types.h
··· 103 103 }; 104 104 struct { /* slab, slob and slub */ 105 105 union { 106 - struct list_head slab_list; /* uses lru */ 106 + struct list_head slab_list; 107 107 struct { /* Partial pages */ 108 108 struct page *next; 109 109 #ifdef CONFIG_64BIT
+59 -4
include/linux/mmu_notifier.h
··· 10 10 struct mmu_notifier; 11 11 struct mmu_notifier_ops; 12 12 13 + /** 14 + * enum mmu_notifier_event - reason for the mmu notifier callback 15 + * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that 16 + * move the range 17 + * 18 + * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like 19 + * madvise() or replacing a page by another one, ...). 20 + * 21 + * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range 22 + * ie using the vma access permission (vm_page_prot) to update the whole range 23 + * is enough no need to inspect changes to the CPU page table (mprotect() 24 + * syscall) 25 + * 26 + * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for 27 + * pages in the range so to mirror those changes the user must inspect the CPU 28 + * page table (from the end callback). 29 + * 30 + * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same 31 + * access flags). User should soft dirty the page in the end callback to make 32 + * sure that anyone relying on soft dirtyness catch pages that might be written 33 + * through non CPU mappings. 34 + */ 35 + enum mmu_notifier_event { 36 + MMU_NOTIFY_UNMAP = 0, 37 + MMU_NOTIFY_CLEAR, 38 + MMU_NOTIFY_PROTECTION_VMA, 39 + MMU_NOTIFY_PROTECTION_PAGE, 40 + MMU_NOTIFY_SOFT_DIRTY, 41 + }; 42 + 13 43 #ifdef CONFIG_MMU_NOTIFIER 14 44 15 45 /* ··· 55 25 spinlock_t lock; 56 26 }; 57 27 28 + #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) 29 + 58 30 struct mmu_notifier_range { 31 + struct vm_area_struct *vma; 59 32 struct mm_struct *mm; 60 33 unsigned long start; 61 34 unsigned long end; 62 - bool blockable; 35 + unsigned flags; 36 + enum mmu_notifier_event event; 63 37 }; 64 38 65 39 struct mmu_notifier_ops { ··· 259 225 bool only_end); 260 226 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, 261 227 unsigned long start, unsigned long end); 228 + extern bool 229 + mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); 230 + 231 + static inline bool 232 + mmu_notifier_range_blockable(const struct mmu_notifier_range *range) 233 + { 234 + return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); 235 + } 262 236 263 237 static inline void mmu_notifier_release(struct mm_struct *mm) 264 238 { ··· 311 269 mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) 312 270 { 313 271 if (mm_has_notifiers(range->mm)) { 314 - range->blockable = true; 272 + range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; 315 273 __mmu_notifier_invalidate_range_start(range); 316 274 } 317 275 } ··· 320 278 mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) 321 279 { 322 280 if (mm_has_notifiers(range->mm)) { 323 - range->blockable = false; 281 + range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; 324 282 return __mmu_notifier_invalidate_range_start(range); 325 283 } 326 284 return 0; ··· 360 318 361 319 362 320 static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, 321 + enum mmu_notifier_event event, 322 + unsigned flags, 323 + struct vm_area_struct *vma, 363 324 struct mm_struct *mm, 364 325 unsigned long start, 365 326 unsigned long end) 366 327 { 328 + range->vma = vma; 329 + range->event = event; 367 330 range->mm = mm; 368 331 range->start = start; 369 332 range->end = end; 333 + range->flags = flags; 370 334 } 371 335 372 336 #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ··· 500 452 range->end = end; 501 453 } 502 454 503 - #define mmu_notifier_range_init(range, mm, 
start, end) \ 455 + #define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \ 504 456 _mmu_notifier_range_init(range, start, end) 505 457 458 + static inline bool 459 + mmu_notifier_range_blockable(const struct mmu_notifier_range *range) 460 + { 461 + return true; 462 + } 506 463 507 464 static inline int mm_has_notifiers(struct mm_struct *mm) 508 465 { ··· 569 516 static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) 570 517 { 571 518 } 519 + 520 + #define mmu_notifier_range_update_to_read_only(r) false 572 521 573 522 #define ptep_clear_flush_young_notify ptep_clear_flush_young 574 523 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
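The mmu_notifier_range now carries the triggering event, a flags word and the vma, and blockability is expressed through MMU_NOTIFIER_RANGE_BLOCKABLE in range->flags rather than a bool, to be tested with mmu_notifier_range_blockable(). A minimal sketch of a caller updated to the new mmu_notifier_range_init() signature, modelled loosely on the uprobes conversion further down (the function name is illustrative only):

    #include <linux/mm.h>
    #include <linux/mmu_notifier.h>

    /* Sketch only: clear the PTE backing one page of a VMA. */
    static void example_clear_one_page(struct vm_area_struct *vma,
                                       struct mm_struct *mm,
                                       unsigned long addr)
    {
            struct mmu_notifier_range range;

            /* New argument order: event, flags, vma, mm, start, end. */
            mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                    addr, addr + PAGE_SIZE);

            mmu_notifier_invalidate_range_start(&range);
            /* ... tear down the page table entries for this range ... */
            mmu_notifier_invalidate_range_end(&range);
    }

Notifier implementations that used to look at range->blockable would switch to mmu_notifier_range_blockable(range), which also evaluates to true in the !CONFIG_MMU_NOTIFIER stub above.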
-5
include/linux/mmzone.h
··· 247 247 #endif 248 248 }; 249 249 250 - /* Mask used at gathering information at once (see memcontrol.c) */ 251 - #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 252 - #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 253 - #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 254 - 255 250 /* Isolate unmapped file */ 256 251 #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) 257 252 /* Isolate for asynchronous migration */
+14 -12
include/linux/pagemap.h
··· 333 333 mapping_gfp_mask(mapping)); 334 334 } 335 335 336 + static inline struct page *find_subpage(struct page *page, pgoff_t offset) 337 + { 338 + unsigned long mask; 339 + 340 + if (PageHuge(page)) 341 + return page; 342 + 343 + VM_BUG_ON_PAGE(PageTail(page), page); 344 + 345 + mask = (1UL << compound_order(page)) - 1; 346 + return page + (offset & mask); 347 + } 348 + 336 349 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); 337 350 struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); 338 351 unsigned find_get_entries(struct address_space *mapping, pgoff_t start, ··· 373 360 return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, 374 361 nr_pages, pages); 375 362 } 376 - unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 377 - xa_mark_t tag, unsigned int nr_entries, 378 - struct page **entries, pgoff_t *indices); 379 363 380 364 struct page *grab_cache_page_write_begin(struct address_space *mapping, 381 365 pgoff_t index, unsigned flags); ··· 537 527 538 528 extern void put_and_wait_on_page_locked(struct page *page); 539 529 540 - /* 541 - * Wait for a page to complete writeback 542 - */ 543 - static inline void wait_on_page_writeback(struct page *page) 544 - { 545 - if (PageWriteback(page)) 546 - wait_on_page_bit(page, PG_writeback); 547 - } 548 - 530 + void wait_on_page_writeback(struct page *page); 549 531 extern void end_page_writeback(struct page *page); 550 532 void wait_for_stable_page(struct page *page); 551 533
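find_subpage() turns a head page plus a page-cache offset into the sub-page that actually backs that offset; hugetlbfs pages are returned unchanged because they are indexed by the head page. A stand-alone illustration of the index arithmetic, using plain numbers in place of struct page pointers (the values are made up for the example):

    #include <stdio.h>

    int main(void)
    {
            unsigned int order = 2;                 /* order-2 compound page: 4 sub-pages */
            unsigned long mask = (1UL << order) - 1;
            unsigned long offset = 66;              /* page-cache index being looked up */

            /* find_subpage() would return head + (offset & mask), i.e. head + 2 here. */
            printf("sub-page offset within the compound page: %lu\n", offset & mask);
            return 0;
    }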
+2
include/linux/userfaultfd_k.h
··· 28 28 #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) 29 29 #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) 30 30 31 + extern int sysctl_unprivileged_userfaultfd; 32 + 31 33 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); 32 34 33 35 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+1 -1
include/linux/vmstat.h
··· 26 26 unsigned nr_congested; 27 27 unsigned nr_writeback; 28 28 unsigned nr_immediate; 29 - unsigned nr_activate; 29 + unsigned nr_activate[2]; 30 30 unsigned nr_ref_keep; 31 31 unsigned nr_unmap_fail; 32 32 };
+4 -6
include/trace/events/compaction.h
··· 64 64 TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) 65 65 ); 66 66 67 + #ifdef CONFIG_COMPACTION 67 68 TRACE_EVENT(mm_compaction_migratepages, 68 69 69 70 TP_PROTO(unsigned long nr_all, ··· 133 132 __entry->sync ? "sync" : "async") 134 133 ); 135 134 136 - #ifdef CONFIG_COMPACTION 137 135 TRACE_EVENT(mm_compaction_end, 138 136 TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, 139 137 unsigned long free_pfn, unsigned long zone_end, bool sync, ··· 166 166 __entry->sync ? "sync" : "async", 167 167 __print_symbolic(__entry->status, COMPACTION_STATUS)) 168 168 ); 169 - #endif 170 169 171 170 TRACE_EVENT(mm_compaction_try_to_compact_pages, 172 171 ··· 188 189 __entry->prio = prio; 189 190 ), 190 191 191 - TP_printk("order=%d gfp_mask=0x%x priority=%d", 192 + TP_printk("order=%d gfp_mask=%s priority=%d", 192 193 __entry->order, 193 - __entry->gfp_mask, 194 + show_gfp_flags(__entry->gfp_mask), 194 195 __entry->prio) 195 196 ); 196 197 197 - #ifdef CONFIG_COMPACTION 198 198 DECLARE_EVENT_CLASS(mm_compaction_suitable_template, 199 199 200 200 TP_PROTO(struct zone *zone, ··· 294 296 295 297 TP_ARGS(zone, order) 296 298 ); 297 - #endif 298 299 299 300 TRACE_EVENT(mm_compaction_kcompactd_sleep, 300 301 ··· 349 352 350 353 TP_ARGS(nid, order, classzone_idx) 351 354 ); 355 + #endif 352 356 353 357 #endif /* _TRACE_COMPACTION_H */ 354 358
+62 -36
include/trace/events/vmscan.h
··· 27 27 {RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \ 28 28 ) : "RECLAIM_WB_NONE" 29 29 30 - #define trace_reclaim_flags(page) ( \ 31 - (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ 30 + #define trace_reclaim_flags(file) ( \ 31 + (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ 32 32 (RECLAIM_WB_ASYNC) \ 33 - ) 34 - 35 - #define trace_shrink_flags(file) \ 36 - ( \ 37 - (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ 38 - (RECLAIM_WB_ASYNC) \ 39 33 ) 40 34 41 35 TRACE_EVENT(mm_vmscan_kswapd_sleep, ··· 67 73 __entry->order = order; 68 74 ), 69 75 70 - TP_printk("nid=%d zid=%d order=%d", __entry->nid, __entry->zid, __entry->order) 76 + TP_printk("nid=%d order=%d", 77 + __entry->nid, 78 + __entry->order) 71 79 ); 72 80 73 81 TRACE_EVENT(mm_vmscan_wakeup_kswapd, ··· 92 96 __entry->gfp_flags = gfp_flags; 93 97 ), 94 98 95 - TP_printk("nid=%d zid=%d order=%d gfp_flags=%s", 99 + TP_printk("nid=%d order=%d gfp_flags=%s", 96 100 __entry->nid, 97 - __entry->zid, 98 101 __entry->order, 99 102 show_gfp_flags(__entry->gfp_flags)) 100 103 ); 101 104 102 105 DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, 103 106 104 - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), 107 + TP_PROTO(int order, gfp_t gfp_flags), 105 108 106 - TP_ARGS(order, may_writepage, gfp_flags, classzone_idx), 109 + TP_ARGS(order, gfp_flags), 107 110 108 111 TP_STRUCT__entry( 109 112 __field( int, order ) 110 - __field( int, may_writepage ) 111 113 __field( gfp_t, gfp_flags ) 112 - __field( int, classzone_idx ) 113 114 ), 114 115 115 116 TP_fast_assign( 116 117 __entry->order = order; 117 - __entry->may_writepage = may_writepage; 118 118 __entry->gfp_flags = gfp_flags; 119 - __entry->classzone_idx = classzone_idx; 120 119 ), 121 120 122 - TP_printk("order=%d may_writepage=%d gfp_flags=%s classzone_idx=%d", 121 + TP_printk("order=%d gfp_flags=%s", 123 122 __entry->order, 124 - __entry->may_writepage, 125 - show_gfp_flags(__entry->gfp_flags), 126 - __entry->classzone_idx) 123 + show_gfp_flags(__entry->gfp_flags)) 127 124 ); 128 125 129 126 DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, 130 127 131 - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), 128 + TP_PROTO(int order, gfp_t gfp_flags), 132 129 133 - TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) 130 + TP_ARGS(order, gfp_flags) 134 131 ); 135 132 136 133 #ifdef CONFIG_MEMCG 137 134 DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, 138 135 139 - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), 136 + TP_PROTO(int order, gfp_t gfp_flags), 140 137 141 - TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) 138 + TP_ARGS(order, gfp_flags) 142 139 ); 143 140 144 141 DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, 145 142 146 - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), 143 + TP_PROTO(int order, gfp_t gfp_flags), 147 144 148 - TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) 145 + TP_ARGS(order, gfp_flags) 149 146 ); 150 147 #endif /* CONFIG_MEMCG */ 151 148 ··· 322 333 323 334 TP_fast_assign( 324 335 __entry->pfn = page_to_pfn(page); 325 - __entry->reclaim_flags = trace_reclaim_flags(page); 336 + __entry->reclaim_flags = trace_reclaim_flags( 337 + page_is_file_cache(page)); 326 338 ), 327 339 328 340 TP_printk("page=%p pfn=%lu flags=%s", ··· 348 358 __field(unsigned long, nr_writeback) 349 359 __field(unsigned 
long, nr_congested) 350 360 __field(unsigned long, nr_immediate) 351 - __field(unsigned long, nr_activate) 361 + __field(unsigned int, nr_activate0) 362 + __field(unsigned int, nr_activate1) 352 363 __field(unsigned long, nr_ref_keep) 353 364 __field(unsigned long, nr_unmap_fail) 354 365 __field(int, priority) ··· 364 373 __entry->nr_writeback = stat->nr_writeback; 365 374 __entry->nr_congested = stat->nr_congested; 366 375 __entry->nr_immediate = stat->nr_immediate; 367 - __entry->nr_activate = stat->nr_activate; 376 + __entry->nr_activate0 = stat->nr_activate[0]; 377 + __entry->nr_activate1 = stat->nr_activate[1]; 368 378 __entry->nr_ref_keep = stat->nr_ref_keep; 369 379 __entry->nr_unmap_fail = stat->nr_unmap_fail; 370 380 __entry->priority = priority; 371 - __entry->reclaim_flags = trace_shrink_flags(file); 381 + __entry->reclaim_flags = trace_reclaim_flags(file); 372 382 ), 373 383 374 - TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate=%ld nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", 384 + TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", 375 385 __entry->nid, 376 386 __entry->nr_scanned, __entry->nr_reclaimed, 377 387 __entry->nr_dirty, __entry->nr_writeback, 378 388 __entry->nr_congested, __entry->nr_immediate, 379 - __entry->nr_activate, __entry->nr_ref_keep, 380 - __entry->nr_unmap_fail, __entry->priority, 389 + __entry->nr_activate0, __entry->nr_activate1, 390 + __entry->nr_ref_keep, __entry->nr_unmap_fail, 391 + __entry->priority, 381 392 show_reclaim_flags(__entry->reclaim_flags)) 382 393 ); 383 394 ··· 408 415 __entry->nr_deactivated = nr_deactivated; 409 416 __entry->nr_referenced = nr_referenced; 410 417 __entry->priority = priority; 411 - __entry->reclaim_flags = trace_shrink_flags(file); 418 + __entry->reclaim_flags = trace_reclaim_flags(file); 412 419 ), 413 420 414 421 TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s", ··· 447 454 __entry->total_active = total_active; 448 455 __entry->active = active; 449 456 __entry->ratio = ratio; 450 - __entry->reclaim_flags = trace_shrink_flags(file) & RECLAIM_WB_LRU; 457 + __entry->reclaim_flags = trace_reclaim_flags(file) & 458 + RECLAIM_WB_LRU; 451 459 ), 452 460 453 461 TP_printk("nid=%d reclaim_idx=%d total_inactive=%ld inactive=%ld total_active=%ld active=%ld ratio=%ld flags=%s", ··· 459 465 __entry->ratio, 460 466 show_reclaim_flags(__entry->reclaim_flags)) 461 467 ); 468 + 469 + TRACE_EVENT(mm_vmscan_node_reclaim_begin, 470 + 471 + TP_PROTO(int nid, int order, gfp_t gfp_flags), 472 + 473 + TP_ARGS(nid, order, gfp_flags), 474 + 475 + TP_STRUCT__entry( 476 + __field(int, nid) 477 + __field(int, order) 478 + __field(gfp_t, gfp_flags) 479 + ), 480 + 481 + TP_fast_assign( 482 + __entry->nid = nid; 483 + __entry->order = order; 484 + __entry->gfp_flags = gfp_flags; 485 + ), 486 + 487 + TP_printk("nid=%d order=%d gfp_flags=%s", 488 + __entry->nid, 489 + __entry->order, 490 + show_gfp_flags(__entry->gfp_flags)) 491 + ); 492 + 493 + DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end, 494 + 495 + TP_PROTO(unsigned long nr_reclaimed), 496 + 497 + TP_ARGS(nr_reclaimed) 498 + ); 499 + 462 500 #endif /* _TRACE_VMSCAN_H */ 463 501 464 502 /* This part must be outside protection */
+15 -1
include/trace/events/writeback.h
··· 53 53 54 54 struct wb_writeback_work; 55 55 56 - TRACE_EVENT(writeback_dirty_page, 56 + DECLARE_EVENT_CLASS(writeback_page_template, 57 57 58 58 TP_PROTO(struct page *page, struct address_space *mapping), 59 59 ··· 77 77 __entry->ino, 78 78 __entry->index 79 79 ) 80 + ); 81 + 82 + DEFINE_EVENT(writeback_page_template, writeback_dirty_page, 83 + 84 + TP_PROTO(struct page *page, struct address_space *mapping), 85 + 86 + TP_ARGS(page, mapping) 87 + ); 88 + 89 + DEFINE_EVENT(writeback_page_template, wait_on_page_writeback, 90 + 91 + TP_PROTO(struct page *page, struct address_space *mapping), 92 + 93 + TP_ARGS(page, mapping) 80 94 ); 81 95 82 96 DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
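With writeback_dirty_page generalised into the writeback_page_template class, the new wait_on_page_writeback event gives the now out-of-line wait_on_page_writeback() (declared in pagemap.h above) something to emit before sleeping. The definition itself is outside this hunk; a plausible sketch of it, assuming it lands in mm/page-writeback.c:

    /* Sketch only: the likely shape of the out-of-line helper. */
    void wait_on_page_writeback(struct page *page)
    {
            if (PageWriteback(page)) {
                    trace_wait_on_page_writeback(page, page_mapping(page));
                    wait_on_page_bit(page, PG_writeback);
            }
    }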
+3
include/uapi/linux/fs.h
··· 320 320 #define SYNC_FILE_RANGE_WAIT_BEFORE 1 321 321 #define SYNC_FILE_RANGE_WRITE 2 322 322 #define SYNC_FILE_RANGE_WAIT_AFTER 4 323 + #define SYNC_FILE_RANGE_WRITE_AND_WAIT (SYNC_FILE_RANGE_WRITE | \ 324 + SYNC_FILE_RANGE_WAIT_BEFORE | \ 325 + SYNC_FILE_RANGE_WAIT_AFTER) 323 326 324 327 /* 325 328 * Flags for preadv2/pwritev2:
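SYNC_FILE_RANGE_WRITE_AND_WAIT simply bundles the three existing flags, so a single sync_file_range() call waits for in-flight writeback, starts write-out and waits for it to complete. A small userspace sketch (the 1 MiB range is arbitrary, and the #ifndef covers toolchains whose headers predate the new flag):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    #ifndef SYNC_FILE_RANGE_WRITE_AND_WAIT
    #define SYNC_FILE_RANGE_WRITE_AND_WAIT  (SYNC_FILE_RANGE_WAIT_BEFORE | \
                                             SYNC_FILE_RANGE_WRITE | \
                                             SYNC_FILE_RANGE_WAIT_AFTER)
    #endif

    int main(int argc, char **argv)
    {
            int fd;

            if (argc < 2) {
                    fprintf(stderr, "usage: %s <file>\n", argv[0]);
                    return 1;
            }
            fd = open(argv[1], O_WRONLY);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* Write back and wait on the first 1 MiB of the file. */
            if (sync_file_range(fd, 0, 1 << 20, SYNC_FILE_RANGE_WRITE_AND_WAIT))
                    perror("sync_file_range");
            close(fd);
            return 0;
    }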
+90 -61
init/initramfs.c
··· 513 513 } 514 514 __setup("retain_initrd", retain_initrd_param); 515 515 516 + #ifdef CONFIG_ARCH_HAS_KEEPINITRD 517 + static int __init keepinitrd_setup(char *__unused) 518 + { 519 + do_retain_initrd = 1; 520 + return 1; 521 + } 522 + __setup("keepinitrd", keepinitrd_setup); 523 + #endif 524 + 516 525 extern char __initramfs_start[]; 517 526 extern unsigned long __initramfs_size; 518 527 #include <linux/initrd.h> 519 528 #include <linux/kexec.h> 520 529 521 - static void __init free_initrd(void) 530 + void __weak free_initrd_mem(unsigned long start, unsigned long end) 522 531 { 523 - #ifdef CONFIG_KEXEC_CORE 524 - unsigned long crashk_start = (unsigned long)__va(crashk_res.start); 525 - unsigned long crashk_end = (unsigned long)__va(crashk_res.end); 526 - #endif 527 - if (do_retain_initrd) 528 - goto skip; 532 + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, 533 + "initrd"); 534 + } 529 535 530 536 #ifdef CONFIG_KEXEC_CORE 537 + static bool kexec_free_initrd(void) 538 + { 539 + unsigned long crashk_start = (unsigned long)__va(crashk_res.start); 540 + unsigned long crashk_end = (unsigned long)__va(crashk_res.end); 541 + 531 542 /* 532 543 * If the initrd region is overlapped with crashkernel reserved region, 533 544 * free only memory that is not part of crashkernel region. 534 545 */ 535 - if (initrd_start < crashk_end && initrd_end > crashk_start) { 536 - /* 537 - * Initialize initrd memory region since the kexec boot does 538 - * not do. 539 - */ 540 - memset((void *)initrd_start, 0, initrd_end - initrd_start); 541 - if (initrd_start < crashk_start) 542 - free_initrd_mem(initrd_start, crashk_start); 543 - if (initrd_end > crashk_end) 544 - free_initrd_mem(crashk_end, initrd_end); 545 - } else 546 - #endif 547 - free_initrd_mem(initrd_start, initrd_end); 548 - skip: 549 - initrd_start = 0; 550 - initrd_end = 0; 546 + if (initrd_start >= crashk_end || initrd_end <= crashk_start) 547 + return false; 548 + 549 + /* 550 + * Initialize initrd memory region since the kexec boot does not do. 
551 + */ 552 + memset((void *)initrd_start, 0, initrd_end - initrd_start); 553 + if (initrd_start < crashk_start) 554 + free_initrd_mem(initrd_start, crashk_start); 555 + if (initrd_end > crashk_end) 556 + free_initrd_mem(crashk_end, initrd_end); 557 + return true; 551 558 } 559 + #else 560 + static inline bool kexec_free_initrd(void) 561 + { 562 + return false; 563 + } 564 + #endif /* CONFIG_KEXEC_CORE */ 552 565 553 566 #ifdef CONFIG_BLK_DEV_RAM 554 567 #define BUF_SIZE 1024 ··· 610 597 ksys_close(fd); 611 598 kfree(buf); 612 599 } 613 - #endif 600 + #else 601 + static inline void clean_rootfs(void) 602 + { 603 + } 604 + #endif /* CONFIG_BLK_DEV_RAM */ 605 + 606 + #ifdef CONFIG_BLK_DEV_RAM 607 + static void populate_initrd_image(char *err) 608 + { 609 + ssize_t written; 610 + int fd; 611 + 612 + unpack_to_rootfs(__initramfs_start, __initramfs_size); 613 + 614 + printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", 615 + err); 616 + fd = ksys_open("/initrd.image", O_WRONLY | O_CREAT, 0700); 617 + if (fd < 0) 618 + return; 619 + 620 + written = xwrite(fd, (char *)initrd_start, initrd_end - initrd_start); 621 + if (written != initrd_end - initrd_start) 622 + pr_err("/initrd.image: incomplete write (%zd != %ld)\n", 623 + written, initrd_end - initrd_start); 624 + ksys_close(fd); 625 + } 626 + #else 627 + static void populate_initrd_image(char *err) 628 + { 629 + printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); 630 + } 631 + #endif /* CONFIG_BLK_DEV_RAM */ 614 632 615 633 static int __init populate_rootfs(void) 616 634 { ··· 649 605 char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); 650 606 if (err) 651 607 panic("%s", err); /* Failed to decompress INTERNAL initramfs */ 652 - /* If available load the bootloader supplied initrd */ 653 - if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) { 654 - #ifdef CONFIG_BLK_DEV_RAM 655 - int fd; 608 + 609 + if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) 610 + goto done; 611 + 612 + if (IS_ENABLED(CONFIG_BLK_DEV_RAM)) 656 613 printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); 657 - err = unpack_to_rootfs((char *)initrd_start, 658 - initrd_end - initrd_start); 659 - if (!err) { 660 - free_initrd(); 661 - goto done; 662 - } else { 663 - clean_rootfs(); 664 - unpack_to_rootfs(__initramfs_start, __initramfs_size); 665 - } 666 - printk(KERN_INFO "rootfs image is not initramfs (%s)" 667 - "; looks like an initrd\n", err); 668 - fd = ksys_open("/initrd.image", 669 - O_WRONLY|O_CREAT, 0700); 670 - if (fd >= 0) { 671 - ssize_t written = xwrite(fd, (char *)initrd_start, 672 - initrd_end - initrd_start); 673 - 674 - if (written != initrd_end - initrd_start) 675 - pr_err("/initrd.image: incomplete write (%zd != %ld)\n", 676 - written, initrd_end - initrd_start); 677 - 678 - ksys_close(fd); 679 - free_initrd(); 680 - } 681 - done: 682 - /* empty statement */; 683 - #else 614 + else 684 615 printk(KERN_INFO "Unpacking initramfs...\n"); 685 - err = unpack_to_rootfs((char *)initrd_start, 686 - initrd_end - initrd_start); 687 - if (err) 688 - printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); 689 - free_initrd(); 690 - #endif 616 + 617 + err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); 618 + if (err) { 619 + clean_rootfs(); 620 + populate_initrd_image(err); 691 621 } 622 + 623 + done: 624 + /* 625 + * If the initrd region is overlapped with crashkernel reserved region, 626 + * free only memory that is not part of crashkernel region. 
627 + */ 628 + if (!do_retain_initrd && !kexec_free_initrd()) 629 + free_initrd_mem(initrd_start, initrd_end); 630 + initrd_start = 0; 631 + initrd_end = 0; 632 + 692 633 flush_delayed_fput(); 693 634 return 0; 694 635 }
+5
init/main.c
··· 1074 1074 } 1075 1075 #endif 1076 1076 1077 + void __weak free_initmem(void) 1078 + { 1079 + free_initmem_default(POISON_FREE_INITMEM); 1080 + } 1081 + 1077 1082 static int __ref kernel_init(void *unused) 1078 1083 { 1079 1084 int ret;
+2 -1
kernel/events/uprobes.c
··· 161 161 struct mmu_notifier_range range; 162 162 struct mem_cgroup *memcg; 163 163 164 - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); 164 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, 165 + addr + PAGE_SIZE); 165 166 166 167 VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); 167 168
+1 -1
kernel/futex.c
··· 543 543 if (unlikely(should_fail_futex(fshared))) 544 544 return -EFAULT; 545 545 546 - err = get_user_pages_fast(address, 1, 1, &page); 546 + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); 547 547 /* 548 548 * If write access is not required (eg. FUTEX_WAIT), try 549 549 * and get read-only access.
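get_user_pages_fast() now takes a gup_flags argument in place of the old int write, so callers that passed 1 pass FOLL_WRITE and callers that passed 0 pass 0 (the iov_iter conversion below follows the same pattern). A hedged sketch of an in-kernel caller using the new convention (the function name is illustrative):

    #include <linux/errno.h>
    #include <linux/mm.h>

    /* Sketch only: pin a single user page for writing. */
    static int example_pin_one_page(unsigned long address, struct page **page)
    {
            int ret;

            /* Previously: get_user_pages_fast(address, 1, 1, page); */
            ret = get_user_pages_fast(address, 1, FOLL_WRITE, page);
            if (ret < 0)
                    return ret;
            if (ret != 1)
                    return -EFAULT;
            return 0;
    }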
+8 -8
kernel/kexec_file.c
··· 500 500 return locate_mem_hole_bottom_up(start, end, kbuf); 501 501 } 502 502 503 - #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK 504 - static int kexec_walk_memblock(struct kexec_buf *kbuf, 505 - int (*func)(struct resource *, void *)) 506 - { 507 - return 0; 508 - } 509 - #else 503 + #ifdef CONFIG_ARCH_KEEP_MEMBLOCK 510 504 static int kexec_walk_memblock(struct kexec_buf *kbuf, 511 505 int (*func)(struct resource *, void *)) 512 506 { ··· 544 550 545 551 return ret; 546 552 } 553 + #else 554 + static int kexec_walk_memblock(struct kexec_buf *kbuf, 555 + int (*func)(struct resource *, void *)) 556 + { 557 + return 0; 558 + } 547 559 #endif 548 560 549 561 /** ··· 589 589 if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) 590 590 return 0; 591 591 592 - if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) 592 + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) 593 593 ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); 594 594 else 595 595 ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback);
+9 -4
kernel/memremap.c
··· 45 45 */ 46 46 return devmem->page_fault(vma, addr, page, flags, pmdp); 47 47 } 48 - EXPORT_SYMBOL(device_private_entry_fault); 49 48 #endif /* CONFIG_DEVICE_PRIVATE */ 50 49 51 50 static void pgmap_array_delete(struct resource *res) ··· 147 148 &pgmap->altmap : NULL; 148 149 struct resource *res = &pgmap->res; 149 150 struct dev_pagemap *conflict_pgmap; 151 + struct mhp_restrictions restrictions = { 152 + /* 153 + * We do not want any optional features only our own memmap 154 + */ 155 + .altmap = altmap, 156 + }; 150 157 pgprot_t pgprot = PAGE_KERNEL; 151 158 int error, nid, is_ram; 152 159 ··· 219 214 */ 220 215 if (pgmap->type == MEMORY_DEVICE_PRIVATE) { 221 216 error = add_pages(nid, align_start >> PAGE_SHIFT, 222 - align_size >> PAGE_SHIFT, NULL, false); 217 + align_size >> PAGE_SHIFT, &restrictions); 223 218 } else { 224 219 error = kasan_add_zero_shadow(__va(align_start), align_size); 225 220 if (error) { ··· 227 222 goto err_kasan; 228 223 } 229 224 230 - error = arch_add_memory(nid, align_start, align_size, altmap, 231 - false); 225 + error = arch_add_memory(nid, align_start, align_size, 226 + &restrictions); 232 227 } 233 228 234 229 if (!error) {
+1 -1
kernel/sys.c
··· 1924 1924 ((unsigned long)prctl_map->__m1 __op \ 1925 1925 (unsigned long)prctl_map->__m2) ? 0 : -EINVAL 1926 1926 error = __prctl_check_order(start_code, <, end_code); 1927 - error |= __prctl_check_order(start_data, <, end_data); 1927 + error |= __prctl_check_order(start_data,<=, end_data); 1928 1928 error |= __prctl_check_order(start_brk, <=, brk); 1929 1929 error |= __prctl_check_order(arg_start, <=, arg_end); 1930 1930 error |= __prctl_check_order(env_start, <=, env_end);
+12
kernel/sysctl.c
··· 66 66 #include <linux/kexec.h> 67 67 #include <linux/bpf.h> 68 68 #include <linux/mount.h> 69 + #include <linux/userfaultfd_k.h> 69 70 70 71 #include "../lib/kstrtox.h" 71 72 ··· 1719 1718 .proc_handler = proc_dointvec_minmax, 1720 1719 .extra1 = (void *)&mmap_rnd_compat_bits_min, 1721 1720 .extra2 = (void *)&mmap_rnd_compat_bits_max, 1721 + }, 1722 + #endif 1723 + #ifdef CONFIG_USERFAULTFD 1724 + { 1725 + .procname = "unprivileged_userfaultfd", 1726 + .data = &sysctl_unprivileged_userfaultfd, 1727 + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), 1728 + .mode = 0644, 1729 + .proc_handler = proc_dointvec_minmax, 1730 + .extra1 = &zero, 1731 + .extra2 = &one, 1722 1732 }, 1723 1733 #endif 1724 1734 { }
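The table entry exposes the knob as /proc/sys/vm/unprivileged_userfaultfd (mode 0644, values clamped to 0 or 1), so an administrator can restrict the syscall with "sysctl vm.unprivileged_userfaultfd=0". A small userspace sketch of what an unprivileged caller then sees, assuming the task lacks CAP_SYS_PTRACE (EPERM is the expected failure):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(void)
    {
            /* userfaultfd(2) only accepts O_CLOEXEC and O_NONBLOCK. */
            int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

            if (uffd < 0 && errno == EPERM)
                    fprintf(stderr, "userfaultfd: permission denied "
                            "(vm.unprivileged_userfaultfd is likely 0)\n");
            else if (uffd < 0)
                    perror("userfaultfd");
            else
                    printf("userfaultfd fd = %d\n", uffd);
            return 0;
    }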
+5 -2
lib/iov_iter.c
··· 1293 1293 len = maxpages * PAGE_SIZE; 1294 1294 addr &= ~(PAGE_SIZE - 1); 1295 1295 n = DIV_ROUND_UP(len, PAGE_SIZE); 1296 - res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, pages); 1296 + res = get_user_pages_fast(addr, n, 1297 + iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, 1298 + pages); 1297 1299 if (unlikely(res < 0)) 1298 1300 return res; 1299 1301 return (res == n ? len : res * PAGE_SIZE) - *start; ··· 1376 1374 p = get_pages_array(n); 1377 1375 if (!p) 1378 1376 return -ENOMEM; 1379 - res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, p); 1377 + res = get_user_pages_fast(addr, n, 1378 + iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p); 1380 1379 if (unlikely(res < 0)) { 1381 1380 kvfree(p); 1382 1381 return res;
+43 -37
mm/Kconfig
··· 11 11 default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT 12 12 default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT 13 13 default FLATMEM_MANUAL 14 + help 15 + This option allows you to change some of the ways that 16 + Linux manages its memory internally. Most users will 17 + only have one option here selected by the architecture 18 + configuration. This is normal. 14 19 15 20 config FLATMEM_MANUAL 16 21 bool "Flat Memory" 17 22 depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE 18 23 help 19 - This option allows you to change some of the ways that 20 - Linux manages its memory internally. Most users will 21 - only have one option here: FLATMEM. This is normal 22 - and a correct option. 24 + This option is best suited for non-NUMA systems with 25 + flat address space. The FLATMEM is the most efficient 26 + system in terms of performance and resource consumption 27 + and it is the best option for smaller systems. 23 28 24 - Some users of more advanced features like NUMA and 25 - memory hotplug may have different options here. 26 - DISCONTIGMEM is a more mature, better tested system, 27 - but is incompatible with memory hotplug and may suffer 28 - decreased performance over SPARSEMEM. If unsure between 29 - "Sparse Memory" and "Discontiguous Memory", choose 30 - "Discontiguous Memory". 29 + For systems that have holes in their physical address 30 + spaces and for features like NUMA and memory hotplug, 31 + choose "Sparse Memory" 31 32 32 33 If unsure, choose this option (Flat Memory) over any other. 33 34 ··· 39 38 This option provides enhanced support for discontiguous 40 39 memory systems, over FLATMEM. These systems have holes 41 40 in their physical address spaces, and this option provides 42 - more efficient handling of these holes. However, the vast 43 - majority of hardware has quite flat address spaces, and 44 - can have degraded performance from the extra overhead that 45 - this option imposes. 41 + more efficient handling of these holes. 46 42 47 - Many NUMA configurations will have this as the only option. 43 + Although "Discontiguous Memory" is still used by several 44 + architectures, it is considered deprecated in favor of 45 + "Sparse Memory". 48 46 49 - If unsure, choose "Flat Memory" over this option. 47 + If unsure, choose "Sparse Memory" over this option. 50 48 51 49 config SPARSEMEM_MANUAL 52 50 bool "Sparse Memory" 53 51 depends on ARCH_SPARSEMEM_ENABLE 54 52 help 55 53 This will be the only option for some systems, including 56 - memory hotplug systems. This is normal. 54 + memory hot-plug systems. This is normal. 57 55 58 - For many other systems, this will be an alternative to 59 - "Discontiguous Memory". This option provides some potential 60 - performance benefits, along with decreased code complexity, 61 - but it is newer, and more experimental. 56 + This option provides efficient support for systems with 57 + holes is their physical address space and allows memory 58 + hot-plug and hot-remove. 62 59 63 - If unsure, choose "Discontiguous Memory" or "Flat Memory" 64 - over this option. 60 + If unsure, choose "Flat Memory" over this option. 
65 61 66 62 endchoice 67 63 ··· 134 136 config HAVE_GENERIC_GUP 135 137 bool 136 138 137 - config ARCH_DISCARD_MEMBLOCK 139 + config ARCH_KEEP_MEMBLOCK 138 140 bool 139 141 140 142 config MEMORY_ISOLATION ··· 159 161 160 162 config MEMORY_HOTPLUG_DEFAULT_ONLINE 161 163 bool "Online the newly added memory blocks by default" 162 - default n 163 164 depends on MEMORY_HOTPLUG 164 165 help 165 166 This option sets the default policy setting for memory hotplug ··· 254 257 255 258 config ARCH_ENABLE_THP_MIGRATION 256 259 bool 260 + 261 + config CONTIG_ALLOC 262 + def_bool (MEMORY_ISOLATION && COMPACTION) || CMA 257 263 258 264 config PHYS_ADDR_T_64BIT 259 265 def_bool 64BIT ··· 436 436 437 437 config CLEANCACHE 438 438 bool "Enable cleancache driver to cache clean pages if tmem is present" 439 - default n 440 439 help 441 440 Cleancache can be thought of as a page-granularity victim cache 442 441 for clean pages that the kernel's pageframe replacement algorithm ··· 459 460 config FRONTSWAP 460 461 bool "Enable frontswap to cache swap pages if tmem is present" 461 462 depends on SWAP 462 - default n 463 463 help 464 464 Frontswap is so named because it can be thought of as the opposite 465 465 of a "backing" store for a swap device. The data is stored into ··· 530 532 depends on FRONTSWAP && CRYPTO=y 531 533 select CRYPTO_LZO 532 534 select ZPOOL 533 - default n 534 535 help 535 536 A lightweight compressed cache for swap pages. It takes 536 537 pages that are in the process of being swapped out and attempts to ··· 546 549 547 550 config ZPOOL 548 551 tristate "Common API for compressed memory storage" 549 - default n 550 552 help 551 553 Compressed memory storage API. This allows using either zbud or 552 554 zsmalloc. 553 555 554 556 config ZBUD 555 557 tristate "Low (Up to 2x) density storage for compressed pages" 556 - default n 557 558 help 558 559 A special purpose allocator for storing compressed pages. 559 560 It is designed to store up to two compressed pages per physical ··· 562 567 config Z3FOLD 563 568 tristate "Up to 3x density storage for compressed pages" 564 569 depends on ZPOOL 565 - default n 566 570 help 567 571 A special purpose allocator for storing compressed pages. 568 572 It is designed to store up to three compressed pages per physical ··· 571 577 config ZSMALLOC 572 578 tristate "Memory allocator for compressed pages" 573 579 depends on MMU 574 - default n 575 580 help 576 581 zsmalloc is a slab-based memory allocator designed to store 577 582 compressed RAM pages. zsmalloc uses virtual memory mapping ··· 621 628 622 629 config DEFERRED_STRUCT_PAGE_INIT 623 630 bool "Defer initialisation of struct pages to kthreads" 624 - default n 625 631 depends on SPARSEMEM 626 632 depends on !NEED_PER_CPU_KM 627 633 depends on 64BIT ··· 668 676 669 677 If FS_DAX is enabled, then say Y. 
670 678 679 + config ARCH_HAS_HMM_MIRROR 680 + bool 681 + default y 682 + depends on (X86_64 || PPC64) 683 + depends on MMU && 64BIT 684 + 685 + config ARCH_HAS_HMM_DEVICE 686 + bool 687 + default y 688 + depends on (X86_64 || PPC64) 689 + depends on MEMORY_HOTPLUG 690 + depends on MEMORY_HOTREMOVE 691 + depends on SPARSEMEM_VMEMMAP 692 + depends on ARCH_HAS_ZONE_DEVICE 693 + select XARRAY_MULTI 694 + 671 695 config ARCH_HAS_HMM 672 696 bool 673 697 default y ··· 702 694 703 695 config HMM 704 696 bool 697 + select MMU_NOTIFIER 705 698 select MIGRATE_VMA_HELPER 706 699 707 700 config HMM_MIRROR 708 701 bool "HMM mirror CPU page table into a device page table" 709 702 depends on ARCH_HAS_HMM 710 - select MMU_NOTIFIER 711 703 select HMM 712 704 help 713 705 Select HMM_MIRROR if you want to mirror range of the CPU page table of a ··· 748 740 749 741 config PERCPU_STATS 750 742 bool "Collect percpu memory statistics" 751 - default n 752 743 help 753 744 This feature collects and exposes statistics via debugfs. The 754 745 information includes global and per chunk statistics, which can ··· 755 748 756 749 config GUP_BENCHMARK 757 750 bool "Enable infrastructure for get_user_pages_fast() benchmarking" 758 - default n 759 751 help 760 752 Provides /sys/kernel/debug/gup_benchmark that helps with testing 761 753 performance of get_user_pages_fast().
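ARCH_KEEP_MEMBLOCK inverts the old ARCH_DISCARD_MEMBLOCK: memblock data is now discarded after boot by default and an architecture has to opt in to keep it. Because the symbol has no prompt, the opt-in happens via select in the architecture's own Kconfig; a purely hypothetical fragment (the per-arch changes are not part of this hunk):

    # Hypothetical fragment of an architecture Kconfig.
    config EXAMPLE_ARCH
            bool "Example architecture"
            select ARCH_KEEP_MEMBLOCK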
-1
mm/Kconfig.debug
··· 33 33 34 34 config DEBUG_PAGEALLOC_ENABLE_DEFAULT 35 35 bool "Enable debug page memory allocations by default?" 36 - default n 37 36 depends on DEBUG_PAGEALLOC 38 37 ---help--- 39 38 Enable debug page memory allocations by default? This value
+14 -9
mm/cma.c
··· 106 106 107 107 cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); 108 108 109 - if (!cma->bitmap) 109 + if (!cma->bitmap) { 110 + cma->count = 0; 110 111 return -ENOMEM; 112 + } 111 113 112 114 WARN_ON_ONCE(!pfn_valid(pfn)); 113 115 zone = page_zone(pfn_to_page(pfn)); ··· 369 367 #ifdef CONFIG_CMA_DEBUG 370 368 static void cma_debug_show_areas(struct cma *cma) 371 369 { 372 - unsigned long next_zero_bit, next_set_bit; 370 + unsigned long next_zero_bit, next_set_bit, nr_zero; 373 371 unsigned long start = 0; 374 - unsigned int nr_zero, nr_total = 0; 372 + unsigned long nr_part, nr_total = 0; 373 + unsigned long nbits = cma_bitmap_maxno(cma); 375 374 376 375 mutex_lock(&cma->lock); 377 376 pr_info("number of available pages: "); 378 377 for (;;) { 379 - next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start); 380 - if (next_zero_bit >= cma->count) 378 + next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start); 379 + if (next_zero_bit >= nbits) 381 380 break; 382 - next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit); 381 + next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit); 383 382 nr_zero = next_set_bit - next_zero_bit; 384 - pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit); 385 - nr_total += nr_zero; 383 + nr_part = nr_zero << cma->order_per_bit; 384 + pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part, 385 + next_zero_bit); 386 + nr_total += nr_part; 386 387 start = next_zero_bit + nr_zero; 387 388 } 388 - pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count); 389 + pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); 389 390 mutex_unlock(&cma->lock); 390 391 } 391 392 #else
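cma_debug_show_areas() now iterates over cma_bitmap_maxno(cma) bits, where each bit stands for 1 << cma->order_per_bit pages, and scales the free runs accordingly, so the "=> N free of M total pages" summary is in consistent units. For example, with order_per_bit = 2 a run of 3 free bits is reported as 12 free pages rather than 3.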
+1 -1
mm/cma_debug.c
··· 56 56 mutex_lock(&cma->lock); 57 57 for (;;) { 58 58 start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); 59 - if (start >= cma->count) 59 + if (start >= bitmap_maxno) 60 60 break; 61 61 end = find_next_bit(cma->bitmap, bitmap_maxno, start); 62 62 maxchunk = max(end - start, maxchunk);
+3 -1
mm/compaction.c
··· 1164 1164 static inline unsigned int 1165 1165 freelist_scan_limit(struct compact_control *cc) 1166 1166 { 1167 - return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1; 1167 + unsigned short shift = BITS_PER_LONG - 1; 1168 + 1169 + return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1; 1168 1170 } 1169 1171 1170 1172 /*
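Clamping the shift keeps freelist_scan_limit() well defined once fast_search_fail exceeds the word size: shifting by BITS_PER_LONG or more is undefined behaviour in C. On a 64-bit build a fast_search_fail of, say, 70 is now capped at a shift of 63, so the limit degrades cleanly to (COMPACT_CLUSTER_MAX >> 63) + 1 == 1 instead of relying on an undefined shift.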
+62 -156
mm/filemap.c
··· 24 24 #include <linux/pagemap.h> 25 25 #include <linux/file.h> 26 26 #include <linux/uio.h> 27 + #include <linux/error-injection.h> 27 28 #include <linux/hash.h> 28 29 #include <linux/writeback.h> 29 30 #include <linux/backing-dev.h> ··· 280 279 * @pvec: pagevec with pages to delete 281 280 * 282 281 * The function walks over mapping->i_pages and removes pages passed in @pvec 283 - * from the mapping. The function expects @pvec to be sorted by page index. 282 + * from the mapping. The function expects @pvec to be sorted by page index 283 + * and is optimised for it to be dense. 284 284 * It tolerates holes in @pvec (mapping entries at those indices are not 285 285 * modified). The function expects only THP head pages to be present in the 286 - * @pvec and takes care to delete all corresponding tail pages from the 287 - * mapping as well. 286 + * @pvec. 288 287 * 289 288 * The function expects the i_pages lock to be held. 290 289 */ ··· 293 292 { 294 293 XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); 295 294 int total_pages = 0; 296 - int i = 0, tail_pages = 0; 295 + int i = 0; 297 296 struct page *page; 298 297 299 298 mapping_set_update(&xas, mapping); 300 299 xas_for_each(&xas, page, ULONG_MAX) { 301 - if (i >= pagevec_count(pvec) && !tail_pages) 300 + if (i >= pagevec_count(pvec)) 302 301 break; 302 + 303 + /* A swap/dax/shadow entry got inserted? Skip it. */ 303 304 if (xa_is_value(page)) 304 305 continue; 305 - if (!tail_pages) { 306 - /* 307 - * Some page got inserted in our range? Skip it. We 308 - * have our pages locked so they are protected from 309 - * being removed. 310 - */ 311 - if (page != pvec->pages[i]) { 312 - VM_BUG_ON_PAGE(page->index > 313 - pvec->pages[i]->index, page); 314 - continue; 315 - } 316 - WARN_ON_ONCE(!PageLocked(page)); 317 - if (PageTransHuge(page) && !PageHuge(page)) 318 - tail_pages = HPAGE_PMD_NR - 1; 319 - page->mapping = NULL; 320 - /* 321 - * Leave page->index set: truncation lookup relies 322 - * upon it 323 - */ 324 - i++; 325 - } else { 326 - VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages 327 - != pvec->pages[i]->index, page); 328 - tail_pages--; 306 + /* 307 + * A page got inserted in our range? Skip it. We have our 308 + * pages locked so they are protected from being removed. 309 + * If we see a page whose index is higher than ours, it 310 + * means our page has been removed, which shouldn't be 311 + * possible because we're holding the PageLock. 312 + */ 313 + if (page != pvec->pages[i]) { 314 + VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, 315 + page); 316 + continue; 329 317 } 318 + 319 + WARN_ON_ONCE(!PageLocked(page)); 320 + 321 + if (page->index == xas.xa_index) 322 + page->mapping = NULL; 323 + /* Leave page->index set: truncation lookup relies on it */ 324 + 325 + /* 326 + * Move to the next page in the vector if this is a regular 327 + * page or the index is of the last sub-page of this compound 328 + * page. 329 + */ 330 + if (page->index + (1UL << compound_order(page)) - 1 == 331 + xas.xa_index) 332 + i++; 330 333 xas_store(&xas, NULL); 331 334 total_pages++; 332 335 } ··· 883 878 put_page(page); 884 879 return xas_error(&xas); 885 880 } 881 + ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); 886 882 887 883 /** 888 884 * add_to_page_cache_locked - add a locked page to the pagecache ··· 1446 1440 EXPORT_SYMBOL(page_cache_next_miss); 1447 1441 1448 1442 /** 1449 - * page_cache_prev_miss() - Find the next gap in the page cache. 
1443 + * page_cache_prev_miss() - Find the previous gap in the page cache. 1450 1444 * @mapping: Mapping. 1451 1445 * @index: Index. 1452 1446 * @max_scan: Maximum range to search. ··· 1497 1491 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) 1498 1492 { 1499 1493 XA_STATE(xas, &mapping->i_pages, offset); 1500 - struct page *head, *page; 1494 + struct page *page; 1501 1495 1502 1496 rcu_read_lock(); 1503 1497 repeat: ··· 1512 1506 if (!page || xa_is_value(page)) 1513 1507 goto out; 1514 1508 1515 - head = compound_head(page); 1516 - if (!page_cache_get_speculative(head)) 1509 + if (!page_cache_get_speculative(page)) 1517 1510 goto repeat; 1518 - 1519 - /* The page was split under us? */ 1520 - if (compound_head(page) != head) { 1521 - put_page(head); 1522 - goto repeat; 1523 - } 1524 1511 1525 1512 /* 1526 - * Has the page moved? 1513 + * Has the page moved or been split? 1527 1514 * This is part of the lockless pagecache protocol. See 1528 1515 * include/linux/pagemap.h for details. 1529 1516 */ 1530 1517 if (unlikely(page != xas_reload(&xas))) { 1531 - put_page(head); 1518 + put_page(page); 1532 1519 goto repeat; 1533 1520 } 1521 + page = find_subpage(page, offset); 1534 1522 out: 1535 1523 rcu_read_unlock(); 1536 1524 ··· 1706 1706 1707 1707 rcu_read_lock(); 1708 1708 xas_for_each(&xas, page, ULONG_MAX) { 1709 - struct page *head; 1710 1709 if (xas_retry(&xas, page)) 1711 1710 continue; 1712 1711 /* ··· 1716 1717 if (xa_is_value(page)) 1717 1718 goto export; 1718 1719 1719 - head = compound_head(page); 1720 - if (!page_cache_get_speculative(head)) 1720 + if (!page_cache_get_speculative(page)) 1721 1721 goto retry; 1722 1722 1723 - /* The page was split under us? */ 1724 - if (compound_head(page) != head) 1725 - goto put_page; 1726 - 1727 - /* Has the page moved? */ 1723 + /* Has the page moved or been split? */ 1728 1724 if (unlikely(page != xas_reload(&xas))) 1729 1725 goto put_page; 1726 + page = find_subpage(page, xas.xa_index); 1730 1727 1731 1728 export: 1732 1729 indices[ret] = xas.xa_index; ··· 1731 1736 break; 1732 1737 continue; 1733 1738 put_page: 1734 - put_page(head); 1739 + put_page(page); 1735 1740 retry: 1736 1741 xas_reset(&xas); 1737 1742 } ··· 1773 1778 1774 1779 rcu_read_lock(); 1775 1780 xas_for_each(&xas, page, end) { 1776 - struct page *head; 1777 1781 if (xas_retry(&xas, page)) 1778 1782 continue; 1779 1783 /* Skip over shadow, swap and DAX entries */ 1780 1784 if (xa_is_value(page)) 1781 1785 continue; 1782 1786 1783 - head = compound_head(page); 1784 - if (!page_cache_get_speculative(head)) 1787 + if (!page_cache_get_speculative(page)) 1785 1788 goto retry; 1786 1789 1787 - /* The page was split under us? */ 1788 - if (compound_head(page) != head) 1789 - goto put_page; 1790 - 1791 - /* Has the page moved? */ 1790 + /* Has the page moved or been split? 
*/ 1792 1791 if (unlikely(page != xas_reload(&xas))) 1793 1792 goto put_page; 1794 1793 1795 - pages[ret] = page; 1794 + pages[ret] = find_subpage(page, xas.xa_index); 1796 1795 if (++ret == nr_pages) { 1797 1796 *start = xas.xa_index + 1; 1798 1797 goto out; 1799 1798 } 1800 1799 continue; 1801 1800 put_page: 1802 - put_page(head); 1801 + put_page(page); 1803 1802 retry: 1804 1803 xas_reset(&xas); 1805 1804 } ··· 1838 1849 1839 1850 rcu_read_lock(); 1840 1851 for (page = xas_load(&xas); page; page = xas_next(&xas)) { 1841 - struct page *head; 1842 1852 if (xas_retry(&xas, page)) 1843 1853 continue; 1844 1854 /* ··· 1847 1859 if (xa_is_value(page)) 1848 1860 break; 1849 1861 1850 - head = compound_head(page); 1851 - if (!page_cache_get_speculative(head)) 1862 + if (!page_cache_get_speculative(page)) 1852 1863 goto retry; 1853 1864 1854 - /* The page was split under us? */ 1855 - if (compound_head(page) != head) 1856 - goto put_page; 1857 - 1858 - /* Has the page moved? */ 1865 + /* Has the page moved or been split? */ 1859 1866 if (unlikely(page != xas_reload(&xas))) 1860 1867 goto put_page; 1861 1868 1862 - pages[ret] = page; 1869 + pages[ret] = find_subpage(page, xas.xa_index); 1863 1870 if (++ret == nr_pages) 1864 1871 break; 1865 1872 continue; 1866 1873 put_page: 1867 - put_page(head); 1874 + put_page(page); 1868 1875 retry: 1869 1876 xas_reset(&xas); 1870 1877 } ··· 1895 1912 1896 1913 rcu_read_lock(); 1897 1914 xas_for_each_marked(&xas, page, end, tag) { 1898 - struct page *head; 1899 1915 if (xas_retry(&xas, page)) 1900 1916 continue; 1901 1917 /* ··· 1905 1923 if (xa_is_value(page)) 1906 1924 continue; 1907 1925 1908 - head = compound_head(page); 1909 - if (!page_cache_get_speculative(head)) 1926 + if (!page_cache_get_speculative(page)) 1910 1927 goto retry; 1911 1928 1912 - /* The page was split under us? */ 1913 - if (compound_head(page) != head) 1914 - goto put_page; 1915 - 1916 - /* Has the page moved? */ 1929 + /* Has the page moved or been split? */ 1917 1930 if (unlikely(page != xas_reload(&xas))) 1918 1931 goto put_page; 1919 1932 1920 - pages[ret] = page; 1933 + pages[ret] = find_subpage(page, xas.xa_index); 1921 1934 if (++ret == nr_pages) { 1922 1935 *index = xas.xa_index + 1; 1923 1936 goto out; 1924 1937 } 1925 1938 continue; 1926 1939 put_page: 1927 - put_page(head); 1940 + put_page(page); 1928 1941 retry: 1929 1942 xas_reset(&xas); 1930 1943 } ··· 1940 1963 return ret; 1941 1964 } 1942 1965 EXPORT_SYMBOL(find_get_pages_range_tag); 1943 - 1944 - /** 1945 - * find_get_entries_tag - find and return entries that match @tag 1946 - * @mapping: the address_space to search 1947 - * @start: the starting page cache index 1948 - * @tag: the tag index 1949 - * @nr_entries: the maximum number of entries 1950 - * @entries: where the resulting entries are placed 1951 - * @indices: the cache indices corresponding to the entries in @entries 1952 - * 1953 - * Like find_get_entries, except we only return entries which are tagged with 1954 - * @tag. 1955 - * 1956 - * Return: the number of entries which were found. 
1957 - */ 1958 - unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 1959 - xa_mark_t tag, unsigned int nr_entries, 1960 - struct page **entries, pgoff_t *indices) 1961 - { 1962 - XA_STATE(xas, &mapping->i_pages, start); 1963 - struct page *page; 1964 - unsigned int ret = 0; 1965 - 1966 - if (!nr_entries) 1967 - return 0; 1968 - 1969 - rcu_read_lock(); 1970 - xas_for_each_marked(&xas, page, ULONG_MAX, tag) { 1971 - struct page *head; 1972 - if (xas_retry(&xas, page)) 1973 - continue; 1974 - /* 1975 - * A shadow entry of a recently evicted page, a swap 1976 - * entry from shmem/tmpfs or a DAX entry. Return it 1977 - * without attempting to raise page count. 1978 - */ 1979 - if (xa_is_value(page)) 1980 - goto export; 1981 - 1982 - head = compound_head(page); 1983 - if (!page_cache_get_speculative(head)) 1984 - goto retry; 1985 - 1986 - /* The page was split under us? */ 1987 - if (compound_head(page) != head) 1988 - goto put_page; 1989 - 1990 - /* Has the page moved? */ 1991 - if (unlikely(page != xas_reload(&xas))) 1992 - goto put_page; 1993 - 1994 - export: 1995 - indices[ret] = xas.xa_index; 1996 - entries[ret] = page; 1997 - if (++ret == nr_entries) 1998 - break; 1999 - continue; 2000 - put_page: 2001 - put_page(head); 2002 - retry: 2003 - xas_reset(&xas); 2004 - } 2005 - rcu_read_unlock(); 2006 - return ret; 2007 - } 2008 - EXPORT_SYMBOL(find_get_entries_tag); 2009 1966 2010 1967 /* 2011 1968 * CD/DVDs are error prone. When a medium error occurs, the driver may fail ··· 2602 2691 pgoff_t last_pgoff = start_pgoff; 2603 2692 unsigned long max_idx; 2604 2693 XA_STATE(xas, &mapping->i_pages, start_pgoff); 2605 - struct page *head, *page; 2694 + struct page *page; 2606 2695 2607 2696 rcu_read_lock(); 2608 2697 xas_for_each(&xas, page, end_pgoff) { ··· 2611 2700 if (xa_is_value(page)) 2612 2701 goto next; 2613 2702 2614 - head = compound_head(page); 2615 - 2616 2703 /* 2617 2704 * Check for a locked page first, as a speculative 2618 2705 * reference may adversely influence page migration. 2619 2706 */ 2620 - if (PageLocked(head)) 2707 + if (PageLocked(page)) 2621 2708 goto next; 2622 - if (!page_cache_get_speculative(head)) 2709 + if (!page_cache_get_speculative(page)) 2623 2710 goto next; 2624 2711 2625 - /* The page was split under us? */ 2626 - if (compound_head(page) != head) 2627 - goto skip; 2628 - 2629 - /* Has the page moved? */ 2712 + /* Has the page moved or been split? */ 2630 2713 if (unlikely(page != xas_reload(&xas))) 2631 2714 goto skip; 2715 + page = find_subpage(page, xas.xa_index); 2632 2716 2633 2717 if (!PageUptodate(page) || 2634 2718 PageReadahead(page) ||
+286 -105
mm/gup.c
··· 28 28 unsigned int page_mask; 29 29 }; 30 30 31 + typedef int (*set_dirty_func_t)(struct page *page); 32 + 33 + static void __put_user_pages_dirty(struct page **pages, 34 + unsigned long npages, 35 + set_dirty_func_t sdf) 36 + { 37 + unsigned long index; 38 + 39 + for (index = 0; index < npages; index++) { 40 + struct page *page = compound_head(pages[index]); 41 + 42 + /* 43 + * Checking PageDirty at this point may race with 44 + * clear_page_dirty_for_io(), but that's OK. Two key cases: 45 + * 46 + * 1) This code sees the page as already dirty, so it skips 47 + * the call to sdf(). That could happen because 48 + * clear_page_dirty_for_io() called page_mkclean(), 49 + * followed by set_page_dirty(). However, now the page is 50 + * going to get written back, which meets the original 51 + * intention of setting it dirty, so all is well: 52 + * clear_page_dirty_for_io() goes on to call 53 + * TestClearPageDirty(), and write the page back. 54 + * 55 + * 2) This code sees the page as clean, so it calls sdf(). 56 + * The page stays dirty, despite being written back, so it 57 + * gets written back again in the next writeback cycle. 58 + * This is harmless. 59 + */ 60 + if (!PageDirty(page)) 61 + sdf(page); 62 + 63 + put_user_page(page); 64 + } 65 + } 66 + 67 + /** 68 + * put_user_pages_dirty() - release and dirty an array of gup-pinned pages 69 + * @pages: array of pages to be marked dirty and released. 70 + * @npages: number of pages in the @pages array. 71 + * 72 + * "gup-pinned page" refers to a page that has had one of the get_user_pages() 73 + * variants called on that page. 74 + * 75 + * For each page in the @pages array, make that page (or its head page, if a 76 + * compound page) dirty, if it was previously listed as clean. Then, release 77 + * the page using put_user_page(). 78 + * 79 + * Please see the put_user_page() documentation for details. 80 + * 81 + * set_page_dirty(), which does not lock the page, is used here. 82 + * Therefore, it is the caller's responsibility to ensure that this is 83 + * safe. If not, then put_user_pages_dirty_lock() should be called instead. 84 + * 85 + */ 86 + void put_user_pages_dirty(struct page **pages, unsigned long npages) 87 + { 88 + __put_user_pages_dirty(pages, npages, set_page_dirty); 89 + } 90 + EXPORT_SYMBOL(put_user_pages_dirty); 91 + 92 + /** 93 + * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages 94 + * @pages: array of pages to be marked dirty and released. 95 + * @npages: number of pages in the @pages array. 96 + * 97 + * For each page in the @pages array, make that page (or its head page, if a 98 + * compound page) dirty, if it was previously listed as clean. Then, release 99 + * the page using put_user_page(). 100 + * 101 + * Please see the put_user_page() documentation for details. 102 + * 103 + * This is just like put_user_pages_dirty(), except that it invokes 104 + * set_page_dirty_lock(), instead of set_page_dirty(). 105 + * 106 + */ 107 + void put_user_pages_dirty_lock(struct page **pages, unsigned long npages) 108 + { 109 + __put_user_pages_dirty(pages, npages, set_page_dirty_lock); 110 + } 111 + EXPORT_SYMBOL(put_user_pages_dirty_lock); 112 + 113 + /** 114 + * put_user_pages() - release an array of gup-pinned pages. 115 + * @pages: array of pages to be marked dirty and released. 116 + * @npages: number of pages in the @pages array. 117 + * 118 + * For each page in the @pages array, release the page using put_user_page(). 119 + * 120 + * Please see the put_user_page() documentation for details. 
121 + */ 122 + void put_user_pages(struct page **pages, unsigned long npages) 123 + { 124 + unsigned long index; 125 + 126 + /* 127 + * TODO: this can be optimized for huge pages: if a series of pages is 128 + * physically contiguous and part of the same compound page, then a 129 + * single operation to the head page should suffice. 130 + */ 131 + for (index = 0; index < npages; index++) 132 + put_user_page(pages[index]); 133 + } 134 + EXPORT_SYMBOL(put_user_pages); 135 + 31 136 static struct page *no_page_table(struct vm_area_struct *vma, 32 137 unsigned int flags) 33 138 { ··· 1123 1018 unsigned int gup_flags, struct page **pages, 1124 1019 int *locked) 1125 1020 { 1021 + /* 1022 + * FIXME: Current FOLL_LONGTERM behavior is incompatible with 1023 + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on 1024 + * vmas. As there are no users of this flag in this call we simply 1025 + * disallow this option for now. 1026 + */ 1027 + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) 1028 + return -EINVAL; 1029 + 1126 1030 return __get_user_pages_locked(current, current->mm, start, nr_pages, 1127 1031 pages, NULL, locked, 1128 1032 gup_flags | FOLL_TOUCH); ··· 1159 1045 struct mm_struct *mm = current->mm; 1160 1046 int locked = 1; 1161 1047 long ret; 1048 + 1049 + /* 1050 + * FIXME: Current FOLL_LONGTERM behavior is incompatible with 1051 + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on 1052 + * vmas. As there are no users of this flag in this call we simply 1053 + * disallow this option for now. 1054 + */ 1055 + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) 1056 + return -EINVAL; 1162 1057 1163 1058 down_read(&mm->mmap_sem); 1164 1059 ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, ··· 1239 1116 unsigned int gup_flags, struct page **pages, 1240 1117 struct vm_area_struct **vmas, int *locked) 1241 1118 { 1119 + /* 1120 + * FIXME: Current FOLL_LONGTERM behavior is incompatible with 1121 + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on 1122 + * vmas. As there are no users of this flag in this call we simply 1123 + * disallow this option for now. 1124 + */ 1125 + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) 1126 + return -EINVAL; 1127 + 1242 1128 return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, 1243 1129 locked, 1244 1130 gup_flags | FOLL_TOUCH | FOLL_REMOTE); 1245 1131 } 1246 1132 EXPORT_SYMBOL(get_user_pages_remote); 1247 1133 1248 - /* 1249 - * This is the same as get_user_pages_remote(), just with a 1250 - * less-flexible calling convention where we assume that the task 1251 - * and mm being operated on are the current task's and don't allow 1252 - * passing of a locked parameter. We also obviously don't pass 1253 - * FOLL_REMOTE in here. 
1254 - */ 1255 - long get_user_pages(unsigned long start, unsigned long nr_pages, 1256 - unsigned int gup_flags, struct page **pages, 1257 - struct vm_area_struct **vmas) 1258 - { 1259 - return __get_user_pages_locked(current, current->mm, start, nr_pages, 1260 - pages, vmas, NULL, 1261 - gup_flags | FOLL_TOUCH); 1262 - } 1263 - EXPORT_SYMBOL(get_user_pages); 1264 - 1265 1134 #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) 1266 - 1267 - #ifdef CONFIG_FS_DAX 1268 1135 static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) 1269 1136 { 1270 1137 long i; ··· 1273 1160 } 1274 1161 return false; 1275 1162 } 1276 - #else 1277 - static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) 1278 - { 1279 - return false; 1280 - } 1281 - #endif 1282 1163 1283 1164 #ifdef CONFIG_CMA 1284 1165 static struct page *new_non_cma_page(struct page *page, unsigned long private) ··· 1326 1219 return __alloc_pages_node(nid, gfp_mask, 0); 1327 1220 } 1328 1221 1329 - static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, 1330 - unsigned int gup_flags, 1222 + static long check_and_migrate_cma_pages(struct task_struct *tsk, 1223 + struct mm_struct *mm, 1224 + unsigned long start, 1225 + unsigned long nr_pages, 1331 1226 struct page **pages, 1332 - struct vm_area_struct **vmas) 1227 + struct vm_area_struct **vmas, 1228 + unsigned int gup_flags) 1333 1229 { 1334 1230 long i; 1335 1231 bool drain_allow = true; ··· 1388 1278 putback_movable_pages(&cma_page_list); 1389 1279 } 1390 1280 /* 1391 - * We did migrate all the pages, Try to get the page references again 1392 - * migrating any new CMA pages which we failed to isolate earlier. 1281 + * We did migrate all the pages, Try to get the page references 1282 + * again migrating any new CMA pages which we failed to isolate 1283 + * earlier. 1393 1284 */ 1394 - nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); 1285 + nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, 1286 + pages, vmas, NULL, 1287 + gup_flags); 1288 + 1395 1289 if ((nr_pages > 0) && migrate_allow) { 1396 1290 drain_allow = true; 1397 1291 goto check_again; ··· 1405 1291 return nr_pages; 1406 1292 } 1407 1293 #else 1408 - static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, 1409 - unsigned int gup_flags, 1410 - struct page **pages, 1411 - struct vm_area_struct **vmas) 1294 + static long check_and_migrate_cma_pages(struct task_struct *tsk, 1295 + struct mm_struct *mm, 1296 + unsigned long start, 1297 + unsigned long nr_pages, 1298 + struct page **pages, 1299 + struct vm_area_struct **vmas, 1300 + unsigned int gup_flags) 1412 1301 { 1413 1302 return nr_pages; 1414 1303 } 1415 1304 #endif 1416 1305 1417 1306 /* 1418 - * This is the same as get_user_pages() in that it assumes we are 1419 - * operating on the current task's mm, but it goes further to validate 1420 - * that the vmas associated with the address range are suitable for 1421 - * longterm elevated page reference counts. For example, filesystem-dax 1422 - * mappings are subject to the lifetime enforced by the filesystem and 1423 - * we need guarantees that longterm users like RDMA and V4L2 only 1424 - * establish mappings that have a kernel enforced revocation mechanism. 1425 - * 1426 - * "longterm" == userspace controlled elevated page count lifetime. 1427 - * Contrast this to iov_iter_get_pages() usages which are transient. 
1307 + * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which 1308 + * allows us to process the FOLL_LONGTERM flag. 1428 1309 */ 1429 - long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, 1430 - unsigned int gup_flags, struct page **pages, 1431 - struct vm_area_struct **vmas_arg) 1310 + static long __gup_longterm_locked(struct task_struct *tsk, 1311 + struct mm_struct *mm, 1312 + unsigned long start, 1313 + unsigned long nr_pages, 1314 + struct page **pages, 1315 + struct vm_area_struct **vmas, 1316 + unsigned int gup_flags) 1432 1317 { 1433 - struct vm_area_struct **vmas = vmas_arg; 1434 - unsigned long flags; 1318 + struct vm_area_struct **vmas_tmp = vmas; 1319 + unsigned long flags = 0; 1435 1320 long rc, i; 1436 1321 1437 - if (!pages) 1438 - return -EINVAL; 1322 + if (gup_flags & FOLL_LONGTERM) { 1323 + if (!pages) 1324 + return -EINVAL; 1439 1325 1440 - if (!vmas) { 1441 - vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), 1442 - GFP_KERNEL); 1443 - if (!vmas) 1444 - return -ENOMEM; 1326 + if (!vmas_tmp) { 1327 + vmas_tmp = kcalloc(nr_pages, 1328 + sizeof(struct vm_area_struct *), 1329 + GFP_KERNEL); 1330 + if (!vmas_tmp) 1331 + return -ENOMEM; 1332 + } 1333 + flags = memalloc_nocma_save(); 1445 1334 } 1446 1335 1447 - flags = memalloc_nocma_save(); 1448 - rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); 1449 - memalloc_nocma_restore(flags); 1450 - if (rc < 0) 1451 - goto out; 1336 + rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, 1337 + vmas_tmp, NULL, gup_flags); 1452 1338 1453 - if (check_dax_vmas(vmas, rc)) { 1454 - for (i = 0; i < rc; i++) 1455 - put_page(pages[i]); 1456 - rc = -EOPNOTSUPP; 1457 - goto out; 1339 + if (gup_flags & FOLL_LONGTERM) { 1340 + memalloc_nocma_restore(flags); 1341 + if (rc < 0) 1342 + goto out; 1343 + 1344 + if (check_dax_vmas(vmas_tmp, rc)) { 1345 + for (i = 0; i < rc; i++) 1346 + put_page(pages[i]); 1347 + rc = -EOPNOTSUPP; 1348 + goto out; 1349 + } 1350 + 1351 + rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages, 1352 + vmas_tmp, gup_flags); 1458 1353 } 1459 1354 1460 - rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); 1461 1355 out: 1462 - if (vmas != vmas_arg) 1463 - kfree(vmas); 1356 + if (vmas_tmp != vmas) 1357 + kfree(vmas_tmp); 1464 1358 return rc; 1465 1359 } 1466 - EXPORT_SYMBOL(get_user_pages_longterm); 1467 - #endif /* CONFIG_FS_DAX */ 1360 + #else /* !CONFIG_FS_DAX && !CONFIG_CMA */ 1361 + static __always_inline long __gup_longterm_locked(struct task_struct *tsk, 1362 + struct mm_struct *mm, 1363 + unsigned long start, 1364 + unsigned long nr_pages, 1365 + struct page **pages, 1366 + struct vm_area_struct **vmas, 1367 + unsigned int flags) 1368 + { 1369 + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, 1370 + NULL, flags); 1371 + } 1372 + #endif /* CONFIG_FS_DAX || CONFIG_CMA */ 1373 + 1374 + /* 1375 + * This is the same as get_user_pages_remote(), just with a 1376 + * less-flexible calling convention where we assume that the task 1377 + * and mm being operated on are the current task's and don't allow 1378 + * passing of a locked parameter. We also obviously don't pass 1379 + * FOLL_REMOTE in here. 
1380 + */ 1381 + long get_user_pages(unsigned long start, unsigned long nr_pages, 1382 + unsigned int gup_flags, struct page **pages, 1383 + struct vm_area_struct **vmas) 1384 + { 1385 + return __gup_longterm_locked(current, current->mm, start, nr_pages, 1386 + pages, vmas, gup_flags | FOLL_TOUCH); 1387 + } 1388 + EXPORT_SYMBOL(get_user_pages); 1468 1389 1469 1390 /** 1470 1391 * populate_vma_page_range() - populate a range of pages in the vma. ··· 1720 1571 1721 1572 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL 1722 1573 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, 1723 - int write, struct page **pages, int *nr) 1574 + unsigned int flags, struct page **pages, int *nr) 1724 1575 { 1725 1576 struct dev_pagemap *pgmap = NULL; 1726 1577 int nr_start = *nr, ret = 0; ··· 1738 1589 if (pte_protnone(pte)) 1739 1590 goto pte_unmap; 1740 1591 1741 - if (!pte_access_permitted(pte, write)) 1592 + if (!pte_access_permitted(pte, flags & FOLL_WRITE)) 1742 1593 goto pte_unmap; 1743 1594 1744 1595 if (pte_devmap(pte)) { 1596 + if (unlikely(flags & FOLL_LONGTERM)) 1597 + goto pte_unmap; 1598 + 1745 1599 pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); 1746 1600 if (unlikely(!pgmap)) { 1747 1601 undo_dev_pagemap(nr, nr_start, pages); ··· 1793 1641 * useful to have gup_huge_pmd even if we can't operate on ptes. 1794 1642 */ 1795 1643 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, 1796 - int write, struct page **pages, int *nr) 1644 + unsigned int flags, struct page **pages, int *nr) 1797 1645 { 1798 1646 return 0; 1799 1647 } ··· 1876 1724 #endif 1877 1725 1878 1726 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, 1879 - unsigned long end, int write, struct page **pages, int *nr) 1727 + unsigned long end, unsigned int flags, struct page **pages, int *nr) 1880 1728 { 1881 1729 struct page *head, *page; 1882 1730 int refs; 1883 1731 1884 - if (!pmd_access_permitted(orig, write)) 1732 + if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) 1885 1733 return 0; 1886 1734 1887 - if (pmd_devmap(orig)) 1735 + if (pmd_devmap(orig)) { 1736 + if (unlikely(flags & FOLL_LONGTERM)) 1737 + return 0; 1888 1738 return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr); 1739 + } 1889 1740 1890 1741 refs = 0; 1891 1742 page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); ··· 1917 1762 } 1918 1763 1919 1764 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, 1920 - unsigned long end, int write, struct page **pages, int *nr) 1765 + unsigned long end, unsigned int flags, struct page **pages, int *nr) 1921 1766 { 1922 1767 struct page *head, *page; 1923 1768 int refs; 1924 1769 1925 - if (!pud_access_permitted(orig, write)) 1770 + if (!pud_access_permitted(orig, flags & FOLL_WRITE)) 1926 1771 return 0; 1927 1772 1928 - if (pud_devmap(orig)) 1773 + if (pud_devmap(orig)) { 1774 + if (unlikely(flags & FOLL_LONGTERM)) 1775 + return 0; 1929 1776 return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr); 1777 + } 1930 1778 1931 1779 refs = 0; 1932 1780 page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); ··· 1958 1800 } 1959 1801 1960 1802 static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, 1961 - unsigned long end, int write, 1803 + unsigned long end, unsigned int flags, 1962 1804 struct page **pages, int *nr) 1963 1805 { 1964 1806 int refs; 1965 1807 struct page *head, *page; 1966 1808 1967 - if (!pgd_access_permitted(orig, write)) 1809 + if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) 1968 1810 return 0; 1969 
1811 1970 1812 BUILD_BUG_ON(pgd_devmap(orig)); ··· 1995 1837 } 1996 1838 1997 1839 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, 1998 - int write, struct page **pages, int *nr) 1840 + unsigned int flags, struct page **pages, int *nr) 1999 1841 { 2000 1842 unsigned long next; 2001 1843 pmd_t *pmdp; ··· 2018 1860 if (pmd_protnone(pmd)) 2019 1861 return 0; 2020 1862 2021 - if (!gup_huge_pmd(pmd, pmdp, addr, next, write, 1863 + if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, 2022 1864 pages, nr)) 2023 1865 return 0; 2024 1866 ··· 2028 1870 * pmd format and THP pmd format 2029 1871 */ 2030 1872 if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, 2031 - PMD_SHIFT, next, write, pages, nr)) 1873 + PMD_SHIFT, next, flags, pages, nr)) 2032 1874 return 0; 2033 - } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) 1875 + } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr)) 2034 1876 return 0; 2035 1877 } while (pmdp++, addr = next, addr != end); 2036 1878 ··· 2038 1880 } 2039 1881 2040 1882 static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, 2041 - int write, struct page **pages, int *nr) 1883 + unsigned int flags, struct page **pages, int *nr) 2042 1884 { 2043 1885 unsigned long next; 2044 1886 pud_t *pudp; ··· 2051 1893 if (pud_none(pud)) 2052 1894 return 0; 2053 1895 if (unlikely(pud_huge(pud))) { 2054 - if (!gup_huge_pud(pud, pudp, addr, next, write, 1896 + if (!gup_huge_pud(pud, pudp, addr, next, flags, 2055 1897 pages, nr)) 2056 1898 return 0; 2057 1899 } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { 2058 1900 if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, 2059 - PUD_SHIFT, next, write, pages, nr)) 1901 + PUD_SHIFT, next, flags, pages, nr)) 2060 1902 return 0; 2061 - } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) 1903 + } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr)) 2062 1904 return 0; 2063 1905 } while (pudp++, addr = next, addr != end); 2064 1906 ··· 2066 1908 } 2067 1909 2068 1910 static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, 2069 - int write, struct page **pages, int *nr) 1911 + unsigned int flags, struct page **pages, int *nr) 2070 1912 { 2071 1913 unsigned long next; 2072 1914 p4d_t *p4dp; ··· 2081 1923 BUILD_BUG_ON(p4d_huge(p4d)); 2082 1924 if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { 2083 1925 if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, 2084 - P4D_SHIFT, next, write, pages, nr)) 1926 + P4D_SHIFT, next, flags, pages, nr)) 2085 1927 return 0; 2086 - } else if (!gup_pud_range(p4d, addr, next, write, pages, nr)) 1928 + } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr)) 2087 1929 return 0; 2088 1930 } while (p4dp++, addr = next, addr != end); 2089 1931 ··· 2091 1933 } 2092 1934 2093 1935 static void gup_pgd_range(unsigned long addr, unsigned long end, 2094 - int write, struct page **pages, int *nr) 1936 + unsigned int flags, struct page **pages, int *nr) 2095 1937 { 2096 1938 unsigned long next; 2097 1939 pgd_t *pgdp; ··· 2104 1946 if (pgd_none(pgd)) 2105 1947 return; 2106 1948 if (unlikely(pgd_huge(pgd))) { 2107 - if (!gup_huge_pgd(pgd, pgdp, addr, next, write, 1949 + if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, 2108 1950 pages, nr)) 2109 1951 return; 2110 1952 } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { 2111 1953 if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, 2112 - PGDIR_SHIFT, next, write, pages, nr)) 1954 + PGDIR_SHIFT, next, flags, pages, nr)) 2113 1955 return; 2114 - } else if (!gup_p4d_range(pgd, addr, next, write, 
pages, nr)) 1956 + } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr)) 2115 1957 return; 2116 1958 } while (pgdp++, addr = next, addr != end); 2117 1959 } ··· 2165 2007 2166 2008 if (gup_fast_permitted(start, nr_pages)) { 2167 2009 local_irq_save(flags); 2168 - gup_pgd_range(start, end, write, pages, &nr); 2010 + gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr); 2169 2011 local_irq_restore(flags); 2170 2012 } 2171 2013 2172 2014 return nr; 2173 2015 } 2174 2016 2017 + static int __gup_longterm_unlocked(unsigned long start, int nr_pages, 2018 + unsigned int gup_flags, struct page **pages) 2019 + { 2020 + int ret; 2021 + 2022 + /* 2023 + * FIXME: FOLL_LONGTERM does not work with 2024 + * get_user_pages_unlocked() (see comments in that function) 2025 + */ 2026 + if (gup_flags & FOLL_LONGTERM) { 2027 + down_read(&current->mm->mmap_sem); 2028 + ret = __gup_longterm_locked(current, current->mm, 2029 + start, nr_pages, 2030 + pages, NULL, gup_flags); 2031 + up_read(&current->mm->mmap_sem); 2032 + } else { 2033 + ret = get_user_pages_unlocked(start, nr_pages, 2034 + pages, gup_flags); 2035 + } 2036 + 2037 + return ret; 2038 + } 2039 + 2175 2040 /** 2176 2041 * get_user_pages_fast() - pin user pages in memory 2177 2042 * @start: starting user address 2178 2043 * @nr_pages: number of pages from start to pin 2179 - * @write: whether pages will be written to 2044 + * @gup_flags: flags modifying pin behaviour 2180 2045 * @pages: array that receives pointers to the pages pinned. 2181 2046 * Should be at least nr_pages long. 2182 2047 * ··· 2211 2030 * requested. If nr_pages is 0 or negative, returns 0. If no pages 2212 2031 * were pinned, returns -errno. 2213 2032 */ 2214 - int get_user_pages_fast(unsigned long start, int nr_pages, int write, 2215 - struct page **pages) 2033 + int get_user_pages_fast(unsigned long start, int nr_pages, 2034 + unsigned int gup_flags, struct page **pages) 2216 2035 { 2217 2036 unsigned long addr, len, end; 2218 2037 int nr = 0, ret = 0; ··· 2230 2049 2231 2050 if (gup_fast_permitted(start, nr_pages)) { 2232 2051 local_irq_disable(); 2233 - gup_pgd_range(addr, end, write, pages, &nr); 2052 + gup_pgd_range(addr, end, gup_flags, pages, &nr); 2234 2053 local_irq_enable(); 2235 2054 ret = nr; 2236 2055 } ··· 2240 2059 start += nr << PAGE_SHIFT; 2241 2060 pages += nr; 2242 2061 2243 - ret = get_user_pages_unlocked(start, nr_pages - nr, pages, 2244 - write ? FOLL_WRITE : 0); 2062 + ret = __gup_longterm_unlocked(start, nr_pages - nr, 2063 + gup_flags, pages); 2245 2064 2246 2065 /* Have to be a bit careful with return values */ 2247 2066 if (nr > 0) {
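The gup.c changes above fold the removed get_user_pages_longterm() logic into __gup_longterm_locked() and turn the third argument of get_user_pages_fast() into a gup_flags bitmask. A minimal caller sketch follows (not part of this patch; the helper name, buffer address and page count are illustrative assumptions):

	/*
	 * Hypothetical helper: pin user pages on the fast path with the new
	 * gup_flags argument. Callers that used to pass write = 1 now pass
	 * FOLL_WRITE; FOLL_LONGTERM may be OR-ed in and is routed to the
	 * slow path internally where needed (e.g. DAX or CMA pages).
	 */
	static int example_pin_fast(unsigned long uaddr, int nr_pages,
				    struct page **pages, bool longterm)
	{
		unsigned int gup_flags = FOLL_WRITE;

		if (longterm)
			gup_flags |= FOLL_LONGTERM;

		/* Was: get_user_pages_fast(uaddr, nr_pages, 1, pages); */
		return get_user_pages_fast(uaddr, nr_pages, gup_flags, pages);
	}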
+3 -2
mm/gup_benchmark.c
··· 54 54 pages + i); 55 55 break; 56 56 case GUP_LONGTERM_BENCHMARK: 57 - nr = get_user_pages_longterm(addr, nr, gup->flags & 1, 58 - pages + i, NULL); 57 + nr = get_user_pages(addr, nr, 58 + (gup->flags & 1) | FOLL_LONGTERM, 59 + pages + i, NULL); 59 60 break; 60 61 case GUP_BENCHMARK: 61 62 nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
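The gup_benchmark hunk above shows the conversion pattern for any caller of the removed get_user_pages_longterm() helper: keep the same arguments but OR FOLL_LONGTERM into the gup flags. A before/after sketch under the same assumptions (names are illustrative, error handling elided):

	/* Before this series: */
	nr = get_user_pages_longterm(addr, nr_pages, FOLL_WRITE, pages, NULL);

	/* After: the long-term pin is expressed as a gup flag instead: */
	nr = get_user_pages(addr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			    pages, NULL);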
+748 -360
mm/hmm.c
··· 30 30 #include <linux/hugetlb.h> 31 31 #include <linux/memremap.h> 32 32 #include <linux/jump_label.h> 33 + #include <linux/dma-mapping.h> 33 34 #include <linux/mmu_notifier.h> 34 35 #include <linux/memory_hotplug.h> 35 36 ··· 39 38 #if IS_ENABLED(CONFIG_HMM_MIRROR) 40 39 static const struct mmu_notifier_ops hmm_mmu_notifier_ops; 41 40 42 - /* 43 - * struct hmm - HMM per mm struct 44 - * 45 - * @mm: mm struct this HMM struct is bound to 46 - * @lock: lock protecting ranges list 47 - * @ranges: list of range being snapshotted 48 - * @mirrors: list of mirrors for this mm 49 - * @mmu_notifier: mmu notifier to track updates to CPU page table 50 - * @mirrors_sem: read/write semaphore protecting the mirrors list 51 - */ 52 - struct hmm { 53 - struct mm_struct *mm; 54 - spinlock_t lock; 55 - struct list_head ranges; 56 - struct list_head mirrors; 57 - struct mmu_notifier mmu_notifier; 58 - struct rw_semaphore mirrors_sem; 59 - }; 60 - 61 - /* 62 - * hmm_register - register HMM against an mm (HMM internal) 63 - * 64 - * @mm: mm struct to attach to 65 - * 66 - * This is not intended to be used directly by device drivers. It allocates an 67 - * HMM struct if mm does not have one, and initializes it. 68 - */ 69 - static struct hmm *hmm_register(struct mm_struct *mm) 41 + static inline struct hmm *mm_get_hmm(struct mm_struct *mm) 70 42 { 71 43 struct hmm *hmm = READ_ONCE(mm->hmm); 44 + 45 + if (hmm && kref_get_unless_zero(&hmm->kref)) 46 + return hmm; 47 + 48 + return NULL; 49 + } 50 + 51 + /** 52 + * hmm_get_or_create - register HMM against an mm (HMM internal) 53 + * 54 + * @mm: mm struct to attach to 55 + * Returns: returns an HMM object, either by referencing the existing 56 + * (per-process) object, or by creating a new one. 57 + * 58 + * This is not intended to be used directly by device drivers. If mm already 59 + * has an HMM struct then it get a reference on it and returns it. Otherwise 60 + * it allocates an HMM struct, initializes it, associate it with the mm and 61 + * returns it. 62 + */ 63 + static struct hmm *hmm_get_or_create(struct mm_struct *mm) 64 + { 65 + struct hmm *hmm = mm_get_hmm(mm); 72 66 bool cleanup = false; 73 67 74 - /* 75 - * The hmm struct can only be freed once the mm_struct goes away, 76 - * hence we should always have pre-allocated an new hmm struct 77 - * above. 
78 - */ 79 68 if (hmm) 80 69 return hmm; 81 70 82 71 hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); 83 72 if (!hmm) 84 73 return NULL; 74 + init_waitqueue_head(&hmm->wq); 85 75 INIT_LIST_HEAD(&hmm->mirrors); 86 76 init_rwsem(&hmm->mirrors_sem); 87 77 hmm->mmu_notifier.ops = NULL; 88 78 INIT_LIST_HEAD(&hmm->ranges); 89 - spin_lock_init(&hmm->lock); 79 + mutex_init(&hmm->lock); 80 + kref_init(&hmm->kref); 81 + hmm->notifiers = 0; 82 + hmm->dead = false; 90 83 hmm->mm = mm; 91 84 92 85 spin_lock(&mm->page_table_lock); ··· 101 106 if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) 102 107 goto error_mm; 103 108 104 - return mm->hmm; 109 + return hmm; 105 110 106 111 error_mm: 107 112 spin_lock(&mm->page_table_lock); ··· 113 118 return NULL; 114 119 } 115 120 116 - void hmm_mm_destroy(struct mm_struct *mm) 121 + static void hmm_free(struct kref *kref) 117 122 { 118 - kfree(mm->hmm); 123 + struct hmm *hmm = container_of(kref, struct hmm, kref); 124 + struct mm_struct *mm = hmm->mm; 125 + 126 + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); 127 + 128 + spin_lock(&mm->page_table_lock); 129 + if (mm->hmm == hmm) 130 + mm->hmm = NULL; 131 + spin_unlock(&mm->page_table_lock); 132 + 133 + kfree(hmm); 119 134 } 120 135 121 - static int hmm_invalidate_range(struct hmm *hmm, bool device, 122 - const struct hmm_update *update) 136 + static inline void hmm_put(struct hmm *hmm) 123 137 { 124 - struct hmm_mirror *mirror; 125 - struct hmm_range *range; 138 + kref_put(&hmm->kref, hmm_free); 139 + } 126 140 127 - spin_lock(&hmm->lock); 128 - list_for_each_entry(range, &hmm->ranges, list) { 129 - unsigned long addr, idx, npages; 141 + void hmm_mm_destroy(struct mm_struct *mm) 142 + { 143 + struct hmm *hmm; 130 144 131 - if (update->end < range->start || update->start >= range->end) 132 - continue; 133 - 134 - range->valid = false; 135 - addr = max(update->start, range->start); 136 - idx = (addr - range->start) >> PAGE_SHIFT; 137 - npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT; 138 - memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); 145 + spin_lock(&mm->page_table_lock); 146 + hmm = mm_get_hmm(mm); 147 + mm->hmm = NULL; 148 + if (hmm) { 149 + hmm->mm = NULL; 150 + hmm->dead = true; 151 + spin_unlock(&mm->page_table_lock); 152 + hmm_put(hmm); 153 + return; 139 154 } 140 - spin_unlock(&hmm->lock); 141 155 142 - if (!device) 143 - return 0; 144 - 145 - down_read(&hmm->mirrors_sem); 146 - list_for_each_entry(mirror, &hmm->mirrors, list) { 147 - int ret; 148 - 149 - ret = mirror->ops->sync_cpu_device_pagetables(mirror, update); 150 - if (!update->blockable && ret == -EAGAIN) { 151 - up_read(&hmm->mirrors_sem); 152 - return -EAGAIN; 153 - } 154 - } 155 - up_read(&hmm->mirrors_sem); 156 - 157 - return 0; 156 + spin_unlock(&mm->page_table_lock); 158 157 } 159 158 160 159 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) 161 160 { 161 + struct hmm *hmm = mm_get_hmm(mm); 162 162 struct hmm_mirror *mirror; 163 - struct hmm *hmm = mm->hmm; 163 + struct hmm_range *range; 164 + 165 + /* Report this HMM as dying. */ 166 + hmm->dead = true; 167 + 168 + /* Wake-up everyone waiting on any range. 
*/ 169 + mutex_lock(&hmm->lock); 170 + list_for_each_entry(range, &hmm->ranges, list) { 171 + range->valid = false; 172 + } 173 + wake_up_all(&hmm->wq); 174 + mutex_unlock(&hmm->lock); 164 175 165 176 down_write(&hmm->mirrors_sem); 166 177 mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, ··· 187 186 struct hmm_mirror, list); 188 187 } 189 188 up_write(&hmm->mirrors_sem); 189 + 190 + hmm_put(hmm); 190 191 } 191 192 192 193 static int hmm_invalidate_range_start(struct mmu_notifier *mn, 193 - const struct mmu_notifier_range *range) 194 + const struct mmu_notifier_range *nrange) 194 195 { 196 + struct hmm *hmm = mm_get_hmm(nrange->mm); 197 + struct hmm_mirror *mirror; 195 198 struct hmm_update update; 196 - struct hmm *hmm = range->mm->hmm; 199 + struct hmm_range *range; 200 + int ret = 0; 197 201 198 202 VM_BUG_ON(!hmm); 199 203 200 - update.start = range->start; 201 - update.end = range->end; 204 + update.start = nrange->start; 205 + update.end = nrange->end; 202 206 update.event = HMM_UPDATE_INVALIDATE; 203 - update.blockable = range->blockable; 204 - return hmm_invalidate_range(hmm, true, &update); 207 + update.blockable = mmu_notifier_range_blockable(nrange); 208 + 209 + if (mmu_notifier_range_blockable(nrange)) 210 + mutex_lock(&hmm->lock); 211 + else if (!mutex_trylock(&hmm->lock)) { 212 + ret = -EAGAIN; 213 + goto out; 214 + } 215 + hmm->notifiers++; 216 + list_for_each_entry(range, &hmm->ranges, list) { 217 + if (update.end < range->start || update.start >= range->end) 218 + continue; 219 + 220 + range->valid = false; 221 + } 222 + mutex_unlock(&hmm->lock); 223 + 224 + if (mmu_notifier_range_blockable(nrange)) 225 + down_read(&hmm->mirrors_sem); 226 + else if (!down_read_trylock(&hmm->mirrors_sem)) { 227 + ret = -EAGAIN; 228 + goto out; 229 + } 230 + list_for_each_entry(mirror, &hmm->mirrors, list) { 231 + int ret; 232 + 233 + ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update); 234 + if (!update.blockable && ret == -EAGAIN) { 235 + up_read(&hmm->mirrors_sem); 236 + ret = -EAGAIN; 237 + goto out; 238 + } 239 + } 240 + up_read(&hmm->mirrors_sem); 241 + 242 + out: 243 + hmm_put(hmm); 244 + return ret; 205 245 } 206 246 207 247 static void hmm_invalidate_range_end(struct mmu_notifier *mn, 208 - const struct mmu_notifier_range *range) 248 + const struct mmu_notifier_range *nrange) 209 249 { 210 - struct hmm_update update; 211 - struct hmm *hmm = range->mm->hmm; 250 + struct hmm *hmm = mm_get_hmm(nrange->mm); 212 251 213 252 VM_BUG_ON(!hmm); 214 253 215 - update.start = range->start; 216 - update.end = range->end; 217 - update.event = HMM_UPDATE_INVALIDATE; 218 - update.blockable = true; 219 - hmm_invalidate_range(hmm, false, &update); 254 + mutex_lock(&hmm->lock); 255 + hmm->notifiers--; 256 + if (!hmm->notifiers) { 257 + struct hmm_range *range; 258 + 259 + list_for_each_entry(range, &hmm->ranges, list) { 260 + if (range->valid) 261 + continue; 262 + range->valid = true; 263 + } 264 + wake_up_all(&hmm->wq); 265 + } 266 + mutex_unlock(&hmm->lock); 267 + 268 + hmm_put(hmm); 220 269 } 221 270 222 271 static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { ··· 292 241 if (!mm || !mirror || !mirror->ops) 293 242 return -EINVAL; 294 243 295 - again: 296 - mirror->hmm = hmm_register(mm); 244 + mirror->hmm = hmm_get_or_create(mm); 297 245 if (!mirror->hmm) 298 246 return -ENOMEM; 299 247 300 248 down_write(&mirror->hmm->mirrors_sem); 301 - if (mirror->hmm->mm == NULL) { 302 - /* 303 - * A racing hmm_mirror_unregister() is about to destroy the hmm 304 - * 
struct. Try again to allocate a new one. 305 - */ 306 - up_write(&mirror->hmm->mirrors_sem); 307 - mirror->hmm = NULL; 308 - goto again; 309 - } else { 310 - list_add(&mirror->list, &mirror->hmm->mirrors); 311 - up_write(&mirror->hmm->mirrors_sem); 312 - } 249 + list_add(&mirror->list, &mirror->hmm->mirrors); 250 + up_write(&mirror->hmm->mirrors_sem); 313 251 314 252 return 0; 315 253 } ··· 313 273 */ 314 274 void hmm_mirror_unregister(struct hmm_mirror *mirror) 315 275 { 316 - bool should_unregister = false; 317 - struct mm_struct *mm; 318 - struct hmm *hmm; 276 + struct hmm *hmm = READ_ONCE(mirror->hmm); 319 277 320 - if (mirror->hmm == NULL) 278 + if (hmm == NULL) 321 279 return; 322 280 323 - hmm = mirror->hmm; 324 281 down_write(&hmm->mirrors_sem); 325 282 list_del_init(&mirror->list); 326 - should_unregister = list_empty(&hmm->mirrors); 283 + /* To protect us against double unregister ... */ 327 284 mirror->hmm = NULL; 328 - mm = hmm->mm; 329 - hmm->mm = NULL; 330 285 up_write(&hmm->mirrors_sem); 331 286 332 - if (!should_unregister || mm == NULL) 333 - return; 334 - 335 - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); 336 - 337 - spin_lock(&mm->page_table_lock); 338 - if (mm->hmm == hmm) 339 - mm->hmm = NULL; 340 - spin_unlock(&mm->page_table_lock); 341 - 342 - kfree(hmm); 287 + hmm_put(hmm); 343 288 } 344 289 EXPORT_SYMBOL(hmm_mirror_unregister); 345 290 346 291 struct hmm_vma_walk { 347 292 struct hmm_range *range; 293 + struct dev_pagemap *pgmap; 348 294 unsigned long last; 349 295 bool fault; 350 296 bool block; ··· 349 323 flags |= write_fault ? FAULT_FLAG_WRITE : 0; 350 324 ret = handle_mm_fault(vma, addr, flags); 351 325 if (ret & VM_FAULT_RETRY) 352 - return -EBUSY; 326 + return -EAGAIN; 353 327 if (ret & VM_FAULT_ERROR) { 354 328 *pfn = range->values[HMM_PFN_ERROR]; 355 329 return -EFAULT; 356 330 } 357 331 358 - return -EAGAIN; 332 + return -EBUSY; 359 333 } 360 334 361 335 static int hmm_pfns_bad(unsigned long addr, ··· 381 355 * @fault: should we fault or not ? 382 356 * @write_fault: write fault ? 383 357 * @walk: mm_walk structure 384 - * Returns: 0 on success, -EAGAIN after page fault, or page fault error 358 + * Returns: 0 on success, -EBUSY after page fault, or page fault error 385 359 * 386 360 * This function will be called whenever pmd_none() or pte_none() returns true, 387 361 * or whenever there is no page directory covering the virtual address range. ··· 393 367 struct hmm_vma_walk *hmm_vma_walk = walk->private; 394 368 struct hmm_range *range = hmm_vma_walk->range; 395 369 uint64_t *pfns = range->pfns; 396 - unsigned long i; 370 + unsigned long i, page_size; 397 371 398 372 hmm_vma_walk->last = addr; 399 - i = (addr - range->start) >> PAGE_SHIFT; 400 - for (; addr < end; addr += PAGE_SIZE, i++) { 373 + page_size = hmm_range_page_size(range); 374 + i = (addr - range->start) >> range->page_shift; 375 + 376 + for (; addr < end; addr += page_size, i++) { 401 377 pfns[i] = range->values[HMM_PFN_NONE]; 402 378 if (fault || write_fault) { 403 379 int ret; 404 380 405 381 ret = hmm_vma_do_fault(walk, addr, write_fault, 406 382 &pfns[i]); 407 - if (ret != -EAGAIN) 383 + if (ret != -EBUSY) 408 384 return ret; 409 385 } 410 386 } 411 387 412 - return (fault || write_fault) ? -EAGAIN : 0; 388 + return (fault || write_fault) ? 
-EBUSY : 0; 413 389 } 414 390 415 391 static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, ··· 420 392 { 421 393 struct hmm_range *range = hmm_vma_walk->range; 422 394 423 - *fault = *write_fault = false; 424 395 if (!hmm_vma_walk->fault) 425 396 return; 397 + 398 + /* 399 + * So we not only consider the individual per page request we also 400 + * consider the default flags requested for the range. The API can 401 + * be use in 2 fashions. The first one where the HMM user coalesce 402 + * multiple page fault into one request and set flags per pfns for 403 + * of those faults. The second one where the HMM user want to pre- 404 + * fault a range with specific flags. For the latter one it is a 405 + * waste to have the user pre-fill the pfn arrays with a default 406 + * flags value. 407 + */ 408 + pfns = (pfns & range->pfn_flags_mask) | range->default_flags; 426 409 427 410 /* We aren't ask to do anything ... */ 428 411 if (!(pfns & range->flags[HMM_PFN_VALID])) ··· 470 431 return; 471 432 } 472 433 434 + *fault = *write_fault = false; 473 435 for (i = 0; i < npages; ++i) { 474 436 hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, 475 437 fault, write_fault); 476 - if ((*fault) || (*write_fault)) 438 + if ((*write_fault)) 477 439 return; 478 440 } 479 441 } ··· 505 465 range->flags[HMM_PFN_VALID]; 506 466 } 507 467 468 + static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) 469 + { 470 + if (!pud_present(pud)) 471 + return 0; 472 + return pud_write(pud) ? range->flags[HMM_PFN_VALID] | 473 + range->flags[HMM_PFN_WRITE] : 474 + range->flags[HMM_PFN_VALID]; 475 + } 476 + 508 477 static int hmm_vma_handle_pmd(struct mm_walk *walk, 509 478 unsigned long addr, 510 479 unsigned long end, 511 480 uint64_t *pfns, 512 481 pmd_t pmd) 513 482 { 483 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 514 484 struct hmm_vma_walk *hmm_vma_walk = walk->private; 515 485 struct hmm_range *range = hmm_vma_walk->range; 516 486 unsigned long pfn, npages, i; ··· 536 486 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 537 487 538 488 pfn = pmd_pfn(pmd) + pte_index(addr); 539 - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) 540 - pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; 489 + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { 490 + if (pmd_devmap(pmd)) { 491 + hmm_vma_walk->pgmap = get_dev_pagemap(pfn, 492 + hmm_vma_walk->pgmap); 493 + if (unlikely(!hmm_vma_walk->pgmap)) 494 + return -EBUSY; 495 + } 496 + pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; 497 + } 498 + if (hmm_vma_walk->pgmap) { 499 + put_dev_pagemap(hmm_vma_walk->pgmap); 500 + hmm_vma_walk->pgmap = NULL; 501 + } 541 502 hmm_vma_walk->last = end; 542 503 return 0; 504 + #else 505 + /* If THP is not enabled then we should never reach that code ! 
*/ 506 + return -EINVAL; 507 + #endif 543 508 } 544 509 545 510 static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) ··· 579 514 uint64_t orig_pfn = *pfn; 580 515 581 516 *pfn = range->values[HMM_PFN_NONE]; 582 - cpu_flags = pte_to_hmm_pfn_flags(range, pte); 583 - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, 584 - &fault, &write_fault); 517 + fault = write_fault = false; 585 518 586 519 if (pte_none(pte)) { 520 + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, 521 + &fault, &write_fault); 587 522 if (fault || write_fault) 588 523 goto fault; 589 524 return 0; ··· 611 546 &fault, &write_fault); 612 547 if (fault || write_fault) 613 548 goto fault; 614 - *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); 549 + *pfn = hmm_device_entry_from_pfn(range, 550 + swp_offset(entry)); 615 551 *pfn |= cpu_flags; 616 552 return 0; 617 553 } ··· 623 557 hmm_vma_walk->last = addr; 624 558 migration_entry_wait(vma->vm_mm, 625 559 pmdp, addr); 626 - return -EAGAIN; 560 + return -EBUSY; 627 561 } 628 562 return 0; 629 563 } ··· 631 565 /* Report error for everything else */ 632 566 *pfn = range->values[HMM_PFN_ERROR]; 633 567 return -EFAULT; 568 + } else { 569 + cpu_flags = pte_to_hmm_pfn_flags(range, pte); 570 + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, 571 + &fault, &write_fault); 634 572 } 635 573 636 574 if (fault || write_fault) 637 575 goto fault; 638 576 639 - *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; 577 + if (pte_devmap(pte)) { 578 + hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte), 579 + hmm_vma_walk->pgmap); 580 + if (unlikely(!hmm_vma_walk->pgmap)) 581 + return -EBUSY; 582 + } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { 583 + *pfn = range->values[HMM_PFN_SPECIAL]; 584 + return -EFAULT; 585 + } 586 + 587 + *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; 640 588 return 0; 641 589 642 590 fault: 591 + if (hmm_vma_walk->pgmap) { 592 + put_dev_pagemap(hmm_vma_walk->pgmap); 593 + hmm_vma_walk->pgmap = NULL; 594 + } 643 595 pte_unmap(ptep); 644 596 /* Fault any virtual address we were asked to fault */ 645 597 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); ··· 699 615 if (fault || write_fault) { 700 616 hmm_vma_walk->last = addr; 701 617 pmd_migration_entry_wait(vma->vm_mm, pmdp); 702 - return -EAGAIN; 618 + return -EBUSY; 703 619 } 704 620 return 0; 705 621 } else if (!pmd_present(pmd)) ··· 745 661 return r; 746 662 } 747 663 } 664 + if (hmm_vma_walk->pgmap) { 665 + /* 666 + * We do put_dev_pagemap() here and not in hmm_vma_handle_pte() 667 + * so that we can leverage get_dev_pagemap() optimization which 668 + * will not re-take a reference on a pgmap if we already have 669 + * one. 
670 + */ 671 + put_dev_pagemap(hmm_vma_walk->pgmap); 672 + hmm_vma_walk->pgmap = NULL; 673 + } 748 674 pte_unmap(ptep - 1); 749 675 750 676 hmm_vma_walk->last = addr; 751 677 return 0; 678 + } 679 + 680 + static int hmm_vma_walk_pud(pud_t *pudp, 681 + unsigned long start, 682 + unsigned long end, 683 + struct mm_walk *walk) 684 + { 685 + struct hmm_vma_walk *hmm_vma_walk = walk->private; 686 + struct hmm_range *range = hmm_vma_walk->range; 687 + unsigned long addr = start, next; 688 + pmd_t *pmdp; 689 + pud_t pud; 690 + int ret; 691 + 692 + again: 693 + pud = READ_ONCE(*pudp); 694 + if (pud_none(pud)) 695 + return hmm_vma_walk_hole(start, end, walk); 696 + 697 + if (pud_huge(pud) && pud_devmap(pud)) { 698 + unsigned long i, npages, pfn; 699 + uint64_t *pfns, cpu_flags; 700 + bool fault, write_fault; 701 + 702 + if (!pud_present(pud)) 703 + return hmm_vma_walk_hole(start, end, walk); 704 + 705 + i = (addr - range->start) >> PAGE_SHIFT; 706 + npages = (end - addr) >> PAGE_SHIFT; 707 + pfns = &range->pfns[i]; 708 + 709 + cpu_flags = pud_to_hmm_pfn_flags(range, pud); 710 + hmm_range_need_fault(hmm_vma_walk, pfns, npages, 711 + cpu_flags, &fault, &write_fault); 712 + if (fault || write_fault) 713 + return hmm_vma_walk_hole_(addr, end, fault, 714 + write_fault, walk); 715 + 716 + #ifdef CONFIG_HUGETLB_PAGE 717 + pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 718 + for (i = 0; i < npages; ++i, ++pfn) { 719 + hmm_vma_walk->pgmap = get_dev_pagemap(pfn, 720 + hmm_vma_walk->pgmap); 721 + if (unlikely(!hmm_vma_walk->pgmap)) 722 + return -EBUSY; 723 + pfns[i] = hmm_device_entry_from_pfn(range, pfn) | 724 + cpu_flags; 725 + } 726 + if (hmm_vma_walk->pgmap) { 727 + put_dev_pagemap(hmm_vma_walk->pgmap); 728 + hmm_vma_walk->pgmap = NULL; 729 + } 730 + hmm_vma_walk->last = end; 731 + return 0; 732 + #else 733 + return -EINVAL; 734 + #endif 735 + } 736 + 737 + split_huge_pud(walk->vma, pudp, addr); 738 + if (pud_none(*pudp)) 739 + goto again; 740 + 741 + pmdp = pmd_offset(pudp, addr); 742 + do { 743 + next = pmd_addr_end(addr, end); 744 + ret = hmm_vma_walk_pmd(pmdp, addr, next, walk); 745 + if (ret) 746 + return ret; 747 + } while (pmdp++, addr = next, addr != end); 748 + 749 + return 0; 750 + } 751 + 752 + static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, 753 + unsigned long start, unsigned long end, 754 + struct mm_walk *walk) 755 + { 756 + #ifdef CONFIG_HUGETLB_PAGE 757 + unsigned long addr = start, i, pfn, mask, size, pfn_inc; 758 + struct hmm_vma_walk *hmm_vma_walk = walk->private; 759 + struct hmm_range *range = hmm_vma_walk->range; 760 + struct vm_area_struct *vma = walk->vma; 761 + struct hstate *h = hstate_vma(vma); 762 + uint64_t orig_pfn, cpu_flags; 763 + bool fault, write_fault; 764 + spinlock_t *ptl; 765 + pte_t entry; 766 + int ret = 0; 767 + 768 + size = 1UL << huge_page_shift(h); 769 + mask = size - 1; 770 + if (range->page_shift != PAGE_SHIFT) { 771 + /* Make sure we are looking at full page. 
*/ 772 + if (start & mask) 773 + return -EINVAL; 774 + if (end < (start + size)) 775 + return -EINVAL; 776 + pfn_inc = size >> PAGE_SHIFT; 777 + } else { 778 + pfn_inc = 1; 779 + size = PAGE_SIZE; 780 + } 781 + 782 + 783 + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); 784 + entry = huge_ptep_get(pte); 785 + 786 + i = (start - range->start) >> range->page_shift; 787 + orig_pfn = range->pfns[i]; 788 + range->pfns[i] = range->values[HMM_PFN_NONE]; 789 + cpu_flags = pte_to_hmm_pfn_flags(range, entry); 790 + fault = write_fault = false; 791 + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, 792 + &fault, &write_fault); 793 + if (fault || write_fault) { 794 + ret = -ENOENT; 795 + goto unlock; 796 + } 797 + 798 + pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift); 799 + for (; addr < end; addr += size, i++, pfn += pfn_inc) 800 + range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | 801 + cpu_flags; 802 + hmm_vma_walk->last = end; 803 + 804 + unlock: 805 + spin_unlock(ptl); 806 + 807 + if (ret == -ENOENT) 808 + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 809 + 810 + return ret; 811 + #else /* CONFIG_HUGETLB_PAGE */ 812 + return -EINVAL; 813 + #endif 752 814 } 753 815 754 816 static void hmm_pfns_clear(struct hmm_range *range, ··· 906 676 *pfns = range->values[HMM_PFN_NONE]; 907 677 } 908 678 909 - static void hmm_pfns_special(struct hmm_range *range) 910 - { 911 - unsigned long addr = range->start, i = 0; 912 - 913 - for (; addr < range->end; addr += PAGE_SIZE, i++) 914 - range->pfns[i] = range->values[HMM_PFN_SPECIAL]; 915 - } 916 - 917 679 /* 918 - * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses 919 - * @range: range being snapshotted 920 - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid 921 - * vma permission, 0 success 680 + * hmm_range_register() - start tracking change to CPU page table over a range 681 + * @range: range 682 + * @mm: the mm struct for the range of virtual address 683 + * @start: start virtual address (inclusive) 684 + * @end: end virtual address (exclusive) 685 + * @page_shift: expect page shift for the range 686 + * Returns 0 on success, -EFAULT if the address space is no longer valid 922 687 * 923 - * This snapshots the CPU page table for a range of virtual addresses. Snapshot 924 - * validity is tracked by range struct. See hmm_vma_range_done() for further 925 - * information. 926 - * 927 - * The range struct is initialized here. It tracks the CPU page table, but only 928 - * if the function returns success (0), in which case the caller must then call 929 - * hmm_vma_range_done() to stop CPU page table update tracking on this range. 930 - * 931 - * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS 932 - * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! 688 + * Track updates to the CPU page table see include/linux/hmm.h 933 689 */ 934 - int hmm_vma_get_pfns(struct hmm_range *range) 690 + int hmm_range_register(struct hmm_range *range, 691 + struct mm_struct *mm, 692 + unsigned long start, 693 + unsigned long end, 694 + unsigned page_shift) 935 695 { 936 - struct vm_area_struct *vma = range->vma; 937 - struct hmm_vma_walk hmm_vma_walk; 938 - struct mm_walk mm_walk; 939 - struct hmm *hmm; 696 + unsigned long mask = ((1UL << page_shift) - 1UL); 940 697 941 - /* Sanity check, this really should not happen ! 
*/ 942 - if (range->start < vma->vm_start || range->start >= vma->vm_end) 943 - return -EINVAL; 944 - if (range->end < vma->vm_start || range->end > vma->vm_end) 945 - return -EINVAL; 698 + range->valid = false; 699 + range->hmm = NULL; 946 700 947 - hmm = hmm_register(vma->vm_mm); 948 - if (!hmm) 949 - return -ENOMEM; 950 - /* Caller must have registered a mirror, via hmm_mirror_register() ! */ 951 - if (!hmm->mmu_notifier.ops) 701 + if ((start & mask) || (end & mask)) 702 + return -EINVAL; 703 + if (start >= end) 952 704 return -EINVAL; 953 705 954 - /* FIXME support hugetlb fs */ 955 - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || 956 - vma_is_dax(vma)) { 957 - hmm_pfns_special(range); 958 - return -EINVAL; 959 - } 706 + range->page_shift = page_shift; 707 + range->start = start; 708 + range->end = end; 960 709 961 - if (!(vma->vm_flags & VM_READ)) { 962 - /* 963 - * If vma do not allow read access, then assume that it does 964 - * not allow write access, either. Architecture that allow 965 - * write without read access are not supported by HMM, because 966 - * operations such has atomic access would not work. 967 - */ 968 - hmm_pfns_clear(range, range->pfns, range->start, range->end); 969 - return -EPERM; 710 + range->hmm = hmm_get_or_create(mm); 711 + if (!range->hmm) 712 + return -EFAULT; 713 + 714 + /* Check if hmm_mm_destroy() was call. */ 715 + if (range->hmm->mm == NULL || range->hmm->dead) { 716 + hmm_put(range->hmm); 717 + return -EFAULT; 970 718 } 971 719 972 720 /* Initialize range to track CPU page table update */ 973 - spin_lock(&hmm->lock); 974 - range->valid = true; 975 - list_add_rcu(&range->list, &hmm->ranges); 976 - spin_unlock(&hmm->lock); 721 + mutex_lock(&range->hmm->lock); 977 722 978 - hmm_vma_walk.fault = false; 979 - hmm_vma_walk.range = range; 980 - mm_walk.private = &hmm_vma_walk; 723 + list_add_rcu(&range->list, &range->hmm->ranges); 981 724 982 - mm_walk.vma = vma; 983 - mm_walk.mm = vma->vm_mm; 984 - mm_walk.pte_entry = NULL; 985 - mm_walk.test_walk = NULL; 986 - mm_walk.hugetlb_entry = NULL; 987 - mm_walk.pmd_entry = hmm_vma_walk_pmd; 988 - mm_walk.pte_hole = hmm_vma_walk_hole; 725 + /* 726 + * If there are any concurrent notifiers we have to wait for them for 727 + * the range to be valid (see hmm_range_wait_until_valid()). 728 + */ 729 + if (!range->hmm->notifiers) 730 + range->valid = true; 731 + mutex_unlock(&range->hmm->lock); 989 732 990 - walk_page_range(range->start, range->end, &mm_walk); 991 733 return 0; 992 734 } 993 - EXPORT_SYMBOL(hmm_vma_get_pfns); 735 + EXPORT_SYMBOL(hmm_range_register); 994 736 995 737 /* 996 - * hmm_vma_range_done() - stop tracking change to CPU page table over a range 997 - * @range: range being tracked 998 - * Returns: false if range data has been invalidated, true otherwise 738 + * hmm_range_unregister() - stop tracking change to CPU page table over a range 739 + * @range: range 999 740 * 1000 741 * Range struct is used to track updates to the CPU page table after a call to 1001 - * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done 1002 - * using the data, or wants to lock updates to the data it got from those 1003 - * functions, it must call the hmm_vma_range_done() function, which will then 1004 - * stop tracking CPU page table updates. 1005 - * 1006 - * Note that device driver must still implement general CPU page table update 1007 - * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using 1008 - * the mmu_notifier API directly. 
1009 - * 1010 - * CPU page table update tracking done through hmm_range is only temporary and 1011 - * to be used while trying to duplicate CPU page table contents for a range of 1012 - * virtual addresses. 1013 - * 1014 - * There are two ways to use this : 1015 - * again: 1016 - * hmm_vma_get_pfns(range); or hmm_vma_fault(...); 1017 - * trans = device_build_page_table_update_transaction(pfns); 1018 - * device_page_table_lock(); 1019 - * if (!hmm_vma_range_done(range)) { 1020 - * device_page_table_unlock(); 1021 - * goto again; 1022 - * } 1023 - * device_commit_transaction(trans); 1024 - * device_page_table_unlock(); 1025 - * 1026 - * Or: 1027 - * hmm_vma_get_pfns(range); or hmm_vma_fault(...); 1028 - * device_page_table_lock(); 1029 - * hmm_vma_range_done(range); 1030 - * device_update_page_table(range->pfns); 1031 - * device_page_table_unlock(); 742 + * hmm_range_register(). See include/linux/hmm.h for how to use it. 1032 743 */ 1033 - bool hmm_vma_range_done(struct hmm_range *range) 744 + void hmm_range_unregister(struct hmm_range *range) 1034 745 { 1035 - unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; 1036 - struct hmm *hmm; 746 + /* Sanity check this really should not happen. */ 747 + if (range->hmm == NULL || range->end <= range->start) 748 + return; 1037 749 1038 - if (range->end <= range->start) { 1039 - BUG(); 1040 - return false; 1041 - } 1042 - 1043 - hmm = hmm_register(range->vma->vm_mm); 1044 - if (!hmm) { 1045 - memset(range->pfns, 0, sizeof(*range->pfns) * npages); 1046 - return false; 1047 - } 1048 - 1049 - spin_lock(&hmm->lock); 750 + mutex_lock(&range->hmm->lock); 1050 751 list_del_rcu(&range->list); 1051 - spin_unlock(&hmm->lock); 752 + mutex_unlock(&range->hmm->lock); 1052 753 1053 - return range->valid; 754 + /* Drop reference taken by hmm_range_register() */ 755 + range->valid = false; 756 + hmm_put(range->hmm); 757 + range->hmm = NULL; 1054 758 } 1055 - EXPORT_SYMBOL(hmm_vma_range_done); 759 + EXPORT_SYMBOL(hmm_range_unregister); 1056 760 1057 761 /* 1058 - * hmm_vma_fault() - try to fault some address in a virtual address range 762 + * hmm_range_snapshot() - snapshot CPU page table for a range 763 + * @range: range 764 + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid 765 + * permission (for instance asking for write and range is read only), 766 + * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid 767 + * vma or it is illegal to access that range), number of valid pages 768 + * in range->pfns[] (from range start address). 769 + * 770 + * This snapshots the CPU page table for a range of virtual addresses. Snapshot 771 + * validity is tracked by range struct. See in include/linux/hmm.h for example 772 + * on how to use. 773 + */ 774 + long hmm_range_snapshot(struct hmm_range *range) 775 + { 776 + const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; 777 + unsigned long start = range->start, end; 778 + struct hmm_vma_walk hmm_vma_walk; 779 + struct hmm *hmm = range->hmm; 780 + struct vm_area_struct *vma; 781 + struct mm_walk mm_walk; 782 + 783 + /* Check if hmm_mm_destroy() was call. */ 784 + if (hmm->mm == NULL || hmm->dead) 785 + return -EFAULT; 786 + 787 + do { 788 + /* If range is no longer valid force retry. 
*/ 789 + if (!range->valid) 790 + return -EAGAIN; 791 + 792 + vma = find_vma(hmm->mm, start); 793 + if (vma == NULL || (vma->vm_flags & device_vma)) 794 + return -EFAULT; 795 + 796 + if (is_vm_hugetlb_page(vma)) { 797 + struct hstate *h = hstate_vma(vma); 798 + 799 + if (huge_page_shift(h) != range->page_shift && 800 + range->page_shift != PAGE_SHIFT) 801 + return -EINVAL; 802 + } else { 803 + if (range->page_shift != PAGE_SHIFT) 804 + return -EINVAL; 805 + } 806 + 807 + if (!(vma->vm_flags & VM_READ)) { 808 + /* 809 + * If vma do not allow read access, then assume that it 810 + * does not allow write access, either. HMM does not 811 + * support architecture that allow write without read. 812 + */ 813 + hmm_pfns_clear(range, range->pfns, 814 + range->start, range->end); 815 + return -EPERM; 816 + } 817 + 818 + range->vma = vma; 819 + hmm_vma_walk.pgmap = NULL; 820 + hmm_vma_walk.last = start; 821 + hmm_vma_walk.fault = false; 822 + hmm_vma_walk.range = range; 823 + mm_walk.private = &hmm_vma_walk; 824 + end = min(range->end, vma->vm_end); 825 + 826 + mm_walk.vma = vma; 827 + mm_walk.mm = vma->vm_mm; 828 + mm_walk.pte_entry = NULL; 829 + mm_walk.test_walk = NULL; 830 + mm_walk.hugetlb_entry = NULL; 831 + mm_walk.pud_entry = hmm_vma_walk_pud; 832 + mm_walk.pmd_entry = hmm_vma_walk_pmd; 833 + mm_walk.pte_hole = hmm_vma_walk_hole; 834 + mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; 835 + 836 + walk_page_range(start, end, &mm_walk); 837 + start = end; 838 + } while (start < range->end); 839 + 840 + return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; 841 + } 842 + EXPORT_SYMBOL(hmm_range_snapshot); 843 + 844 + /* 845 + * hmm_range_fault() - try to fault some address in a virtual address range 1059 846 * @range: range being faulted 1060 847 * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) 1061 - * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) 848 + * Returns: number of valid pages in range->pfns[] (from range start 849 + * address). This may be zero. If the return value is negative, 850 + * then one of the following values may be returned: 851 + * 852 + * -EINVAL invalid arguments or mm or virtual address are in an 853 + * invalid vma (for instance device file vma). 854 + * -ENOMEM: Out of memory. 855 + * -EPERM: Invalid permission (for instance asking for write and 856 + * range is read only). 857 + * -EAGAIN: If you need to retry and mmap_sem was drop. This can only 858 + * happens if block argument is false. 859 + * -EBUSY: If the the range is being invalidated and you should wait 860 + * for invalidation to finish. 861 + * -EFAULT: Invalid (ie either no valid vma or it is illegal to access 862 + * that range), number of valid pages in range->pfns[] (from 863 + * range start address). 1062 864 * 1063 865 * This is similar to a regular CPU page fault except that it will not trigger 1064 - * any memory migration if the memory being faulted is not accessible by CPUs. 866 + * any memory migration if the memory being faulted is not accessible by CPUs 867 + * and caller does not ask for migration. 1065 868 * 1066 869 * On error, for one virtual address in the range, the function will mark the 1067 870 * corresponding HMM pfn entry with an error flag. 
1068 - * 1069 - * Expected use pattern: 1070 - * retry: 1071 - * down_read(&mm->mmap_sem); 1072 - * // Find vma and address device wants to fault, initialize hmm_pfn_t 1073 - * // array accordingly 1074 - * ret = hmm_vma_fault(range, write, block); 1075 - * switch (ret) { 1076 - * case -EAGAIN: 1077 - * hmm_vma_range_done(range); 1078 - * // You might want to rate limit or yield to play nicely, you may 1079 - * // also commit any valid pfn in the array assuming that you are 1080 - * // getting true from hmm_vma_range_monitor_end() 1081 - * goto retry; 1082 - * case 0: 1083 - * break; 1084 - * case -ENOMEM: 1085 - * case -EINVAL: 1086 - * case -EPERM: 1087 - * default: 1088 - * // Handle error ! 1089 - * up_read(&mm->mmap_sem) 1090 - * return; 1091 - * } 1092 - * // Take device driver lock that serialize device page table update 1093 - * driver_lock_device_page_table_update(); 1094 - * hmm_vma_range_done(range); 1095 - * // Commit pfns we got from hmm_vma_fault() 1096 - * driver_unlock_device_page_table_update(); 1097 - * up_read(&mm->mmap_sem) 1098 - * 1099 - * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0) 1100 - * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION ! 1101 - * 1102 - * YOU HAVE BEEN WARNED ! 1103 871 */ 1104 - int hmm_vma_fault(struct hmm_range *range, bool block) 872 + long hmm_range_fault(struct hmm_range *range, bool block) 1105 873 { 1106 - struct vm_area_struct *vma = range->vma; 1107 - unsigned long start = range->start; 874 + const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; 875 + unsigned long start = range->start, end; 1108 876 struct hmm_vma_walk hmm_vma_walk; 877 + struct hmm *hmm = range->hmm; 878 + struct vm_area_struct *vma; 1109 879 struct mm_walk mm_walk; 1110 - struct hmm *hmm; 1111 880 int ret; 1112 881 1113 - /* Sanity check, this really should not happen ! */ 1114 - if (range->start < vma->vm_start || range->start >= vma->vm_end) 1115 - return -EINVAL; 1116 - if (range->end < vma->vm_start || range->end > vma->vm_end) 1117 - return -EINVAL; 1118 - 1119 - hmm = hmm_register(vma->vm_mm); 1120 - if (!hmm) { 1121 - hmm_pfns_clear(range, range->pfns, range->start, range->end); 1122 - return -ENOMEM; 1123 - } 1124 - /* Caller must have registered a mirror using hmm_mirror_register() */ 1125 - if (!hmm->mmu_notifier.ops) 1126 - return -EINVAL; 1127 - 1128 - /* FIXME support hugetlb fs */ 1129 - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || 1130 - vma_is_dax(vma)) { 1131 - hmm_pfns_special(range); 1132 - return -EINVAL; 1133 - } 1134 - 1135 - if (!(vma->vm_flags & VM_READ)) { 1136 - /* 1137 - * If vma do not allow read access, then assume that it does 1138 - * not allow write access, either. Architecture that allow 1139 - * write without read access are not supported by HMM, because 1140 - * operations such has atomic access would not work. 
1141 - */ 1142 - hmm_pfns_clear(range, range->pfns, range->start, range->end); 1143 - return -EPERM; 1144 - } 1145 - 1146 - /* Initialize range to track CPU page table update */ 1147 - spin_lock(&hmm->lock); 1148 - range->valid = true; 1149 - list_add_rcu(&range->list, &hmm->ranges); 1150 - spin_unlock(&hmm->lock); 1151 - 1152 - hmm_vma_walk.fault = true; 1153 - hmm_vma_walk.block = block; 1154 - hmm_vma_walk.range = range; 1155 - mm_walk.private = &hmm_vma_walk; 1156 - hmm_vma_walk.last = range->start; 1157 - 1158 - mm_walk.vma = vma; 1159 - mm_walk.mm = vma->vm_mm; 1160 - mm_walk.pte_entry = NULL; 1161 - mm_walk.test_walk = NULL; 1162 - mm_walk.hugetlb_entry = NULL; 1163 - mm_walk.pmd_entry = hmm_vma_walk_pmd; 1164 - mm_walk.pte_hole = hmm_vma_walk_hole; 882 + /* Check if hmm_mm_destroy() was call. */ 883 + if (hmm->mm == NULL || hmm->dead) 884 + return -EFAULT; 1165 885 1166 886 do { 1167 - ret = walk_page_range(start, range->end, &mm_walk); 1168 - start = hmm_vma_walk.last; 1169 - } while (ret == -EAGAIN); 887 + /* If range is no longer valid force retry. */ 888 + if (!range->valid) { 889 + up_read(&hmm->mm->mmap_sem); 890 + return -EAGAIN; 891 + } 1170 892 1171 - if (ret) { 1172 - unsigned long i; 893 + vma = find_vma(hmm->mm, start); 894 + if (vma == NULL || (vma->vm_flags & device_vma)) 895 + return -EFAULT; 1173 896 1174 - i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; 1175 - hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, 1176 - range->end); 1177 - hmm_vma_range_done(range); 897 + if (is_vm_hugetlb_page(vma)) { 898 + if (huge_page_shift(hstate_vma(vma)) != 899 + range->page_shift && 900 + range->page_shift != PAGE_SHIFT) 901 + return -EINVAL; 902 + } else { 903 + if (range->page_shift != PAGE_SHIFT) 904 + return -EINVAL; 905 + } 906 + 907 + if (!(vma->vm_flags & VM_READ)) { 908 + /* 909 + * If vma do not allow read access, then assume that it 910 + * does not allow write access, either. HMM does not 911 + * support architecture that allow write without read. 912 + */ 913 + hmm_pfns_clear(range, range->pfns, 914 + range->start, range->end); 915 + return -EPERM; 916 + } 917 + 918 + range->vma = vma; 919 + hmm_vma_walk.pgmap = NULL; 920 + hmm_vma_walk.last = start; 921 + hmm_vma_walk.fault = true; 922 + hmm_vma_walk.block = block; 923 + hmm_vma_walk.range = range; 924 + mm_walk.private = &hmm_vma_walk; 925 + end = min(range->end, vma->vm_end); 926 + 927 + mm_walk.vma = vma; 928 + mm_walk.mm = vma->vm_mm; 929 + mm_walk.pte_entry = NULL; 930 + mm_walk.test_walk = NULL; 931 + mm_walk.hugetlb_entry = NULL; 932 + mm_walk.pud_entry = hmm_vma_walk_pud; 933 + mm_walk.pmd_entry = hmm_vma_walk_pmd; 934 + mm_walk.pte_hole = hmm_vma_walk_hole; 935 + mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; 936 + 937 + do { 938 + ret = walk_page_range(start, end, &mm_walk); 939 + start = hmm_vma_walk.last; 940 + 941 + /* Keep trying while the range is valid. */ 942 + } while (ret == -EBUSY && range->valid); 943 + 944 + if (ret) { 945 + unsigned long i; 946 + 947 + i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; 948 + hmm_pfns_clear(range, &range->pfns[i], 949 + hmm_vma_walk.last, range->end); 950 + return ret; 951 + } 952 + start = end; 953 + 954 + } while (start < range->end); 955 + 956 + return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; 957 + } 958 + EXPORT_SYMBOL(hmm_range_fault); 959 + 960 + /** 961 + * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one. 
962 + * @range: range being faulted 963 + * @device: device against to dma map page to 964 + * @daddrs: dma address of mapped pages 965 + * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) 966 + * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been 967 + * drop and you need to try again, some other error value otherwise 968 + * 969 + * Note same usage pattern as hmm_range_fault(). 970 + */ 971 + long hmm_range_dma_map(struct hmm_range *range, 972 + struct device *device, 973 + dma_addr_t *daddrs, 974 + bool block) 975 + { 976 + unsigned long i, npages, mapped; 977 + long ret; 978 + 979 + ret = hmm_range_fault(range, block); 980 + if (ret <= 0) 981 + return ret ? ret : -EBUSY; 982 + 983 + npages = (range->end - range->start) >> PAGE_SHIFT; 984 + for (i = 0, mapped = 0; i < npages; ++i) { 985 + enum dma_data_direction dir = DMA_TO_DEVICE; 986 + struct page *page; 987 + 988 + /* 989 + * FIXME need to update DMA API to provide invalid DMA address 990 + * value instead of a function to test dma address value. This 991 + * would remove lot of dumb code duplicated accross many arch. 992 + * 993 + * For now setting it to 0 here is good enough as the pfns[] 994 + * value is what is use to check what is valid and what isn't. 995 + */ 996 + daddrs[i] = 0; 997 + 998 + page = hmm_device_entry_to_page(range, range->pfns[i]); 999 + if (page == NULL) 1000 + continue; 1001 + 1002 + /* Check if range is being invalidated */ 1003 + if (!range->valid) { 1004 + ret = -EBUSY; 1005 + goto unmap; 1006 + } 1007 + 1008 + /* If it is read and write than map bi-directional. */ 1009 + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) 1010 + dir = DMA_BIDIRECTIONAL; 1011 + 1012 + daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir); 1013 + if (dma_mapping_error(device, daddrs[i])) { 1014 + ret = -EFAULT; 1015 + goto unmap; 1016 + } 1017 + 1018 + mapped++; 1178 1019 } 1020 + 1021 + return mapped; 1022 + 1023 + unmap: 1024 + for (npages = i, i = 0; (i < npages) && mapped; ++i) { 1025 + enum dma_data_direction dir = DMA_TO_DEVICE; 1026 + struct page *page; 1027 + 1028 + page = hmm_device_entry_to_page(range, range->pfns[i]); 1029 + if (page == NULL) 1030 + continue; 1031 + 1032 + if (dma_mapping_error(device, daddrs[i])) 1033 + continue; 1034 + 1035 + /* If it is read and write than map bi-directional. */ 1036 + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) 1037 + dir = DMA_BIDIRECTIONAL; 1038 + 1039 + dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); 1040 + mapped--; 1041 + } 1042 + 1179 1043 return ret; 1180 1044 } 1181 - EXPORT_SYMBOL(hmm_vma_fault); 1045 + EXPORT_SYMBOL(hmm_range_dma_map); 1046 + 1047 + /** 1048 + * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map() 1049 + * @range: range being unmapped 1050 + * @vma: the vma against which the range (optional) 1051 + * @device: device against which dma map was done 1052 + * @daddrs: dma address of mapped pages 1053 + * @dirty: dirty page if it had the write flag set 1054 + * Returns: number of page unmapped on success, -EINVAL otherwise 1055 + * 1056 + * Note that caller MUST abide by mmu notifier or use HMM mirror and abide 1057 + * to the sync_cpu_device_pagetables() callback so that it is safe here to 1058 + * call set_page_dirty(). Caller must also take appropriate locks to avoid 1059 + * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. 
1060 + */ 1061 + long hmm_range_dma_unmap(struct hmm_range *range, 1062 + struct vm_area_struct *vma, 1063 + struct device *device, 1064 + dma_addr_t *daddrs, 1065 + bool dirty) 1066 + { 1067 + unsigned long i, npages; 1068 + long cpages = 0; 1069 + 1070 + /* Sanity check. */ 1071 + if (range->end <= range->start) 1072 + return -EINVAL; 1073 + if (!daddrs) 1074 + return -EINVAL; 1075 + if (!range->pfns) 1076 + return -EINVAL; 1077 + 1078 + npages = (range->end - range->start) >> PAGE_SHIFT; 1079 + for (i = 0; i < npages; ++i) { 1080 + enum dma_data_direction dir = DMA_TO_DEVICE; 1081 + struct page *page; 1082 + 1083 + page = hmm_device_entry_to_page(range, range->pfns[i]); 1084 + if (page == NULL) 1085 + continue; 1086 + 1087 + /* If it is read and write than map bi-directional. */ 1088 + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) { 1089 + dir = DMA_BIDIRECTIONAL; 1090 + 1091 + /* 1092 + * See comments in function description on why it is 1093 + * safe here to call set_page_dirty() 1094 + */ 1095 + if (dirty) 1096 + set_page_dirty(page); 1097 + } 1098 + 1099 + /* Unmap and clear pfns/dma address */ 1100 + dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); 1101 + range->pfns[i] = range->values[HMM_PFN_NONE]; 1102 + /* FIXME see comments in hmm_vma_dma_map() */ 1103 + daddrs[i] = 0; 1104 + cpages++; 1105 + } 1106 + 1107 + return cpages; 1108 + } 1109 + EXPORT_SYMBOL(hmm_range_dma_unmap); 1182 1110 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ 1183 1111 1184 1112
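The mm/hmm.c rework replaces hmm_vma_get_pfns()/hmm_vma_fault() with a reference-counted struct hmm and the hmm_range_register() / hmm_range_fault() / hmm_range_unregister() API. A hedged driver-side sketch of the new lifecycle follows (not part of this patch; it assumes the caller has already registered an hmm_mirror, pre-filled range->pfns, range->flags and range->values, and owns a driver lock that also serializes its sync_cpu_device_pagetables() callback; hmm_range_wait_until_valid() and HMM_RANGE_DEFAULT_TIMEOUT are assumed to come from include/linux/hmm.h in this series):

	static long example_mirror_range(struct hmm_range *range,
					 struct mm_struct *mm,
					 unsigned long start, unsigned long end)
	{
		long ret;

		ret = hmm_range_register(range, mm, start, end, PAGE_SHIFT);
		if (ret)
			return ret;

	again:
		/* Wait for concurrent invalidations to finish before walking. */
		hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT);

		down_read(&mm->mmap_sem);
		ret = hmm_range_fault(range, true);
		if (ret == -EAGAIN) {
			/* hmm_range_fault() already dropped mmap_sem here. */
			goto again;
		}
		if (ret <= 0) {
			up_read(&mm->mmap_sem);
			if (ret == -EBUSY)
				goto again;	/* invalidated mid-walk, retry */
			hmm_range_unregister(range);
			return ret ? ret : -EBUSY;
		}

		/*
		 * Take the driver lock that serializes device page table
		 * updates, re-check range->valid, then commit range->pfns[]
		 * to the device page table (device specific, omitted here).
		 */

		up_read(&mm->mmap_sem);
		hmm_range_unregister(range);
		return 0;
	}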
+22 -13
mm/huge_memory.c
··· 509 509 set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); 510 510 } 511 511 512 - unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, 512 + static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, 513 513 loff_t off, unsigned long flags, unsigned long size) 514 514 { 515 515 unsigned long addr; ··· 793 793 pte_free(mm, pgtable); 794 794 } 795 795 796 - vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 797 - pmd_t *pmd, pfn_t pfn, bool write) 796 + vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) 798 797 { 798 + unsigned long addr = vmf->address & PMD_MASK; 799 + struct vm_area_struct *vma = vmf->vma; 799 800 pgprot_t pgprot = vma->vm_page_prot; 800 801 pgtable_t pgtable = NULL; 802 + 801 803 /* 802 804 * If we had pmd_special, we could avoid all these restrictions, 803 805 * but we need to be consistent with PTEs and architectures that ··· 822 820 823 821 track_pfn_insert(vma, &pgprot, pfn); 824 822 825 - insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable); 823 + insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable); 826 824 return VM_FAULT_NOPAGE; 827 825 } 828 826 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); ··· 871 869 spin_unlock(ptl); 872 870 } 873 871 874 - vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, 875 - pud_t *pud, pfn_t pfn, bool write) 872 + vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) 876 873 { 874 + unsigned long addr = vmf->address & PUD_MASK; 875 + struct vm_area_struct *vma = vmf->vma; 877 876 pgprot_t pgprot = vma->vm_page_prot; 877 + 878 878 /* 879 879 * If we had pud_special, we could avoid all these restrictions, 880 880 * but we need to be consistent with PTEs and architectures that ··· 893 889 894 890 track_pfn_insert(vma, &pgprot, pfn); 895 891 896 - insert_pfn_pud(vma, addr, pud, pfn, pgprot, write); 892 + insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write); 897 893 return VM_FAULT_NOPAGE; 898 894 } 899 895 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); ··· 1224 1220 cond_resched(); 1225 1221 } 1226 1222 1227 - mmu_notifier_range_init(&range, vma->vm_mm, haddr, 1228 - haddr + HPAGE_PMD_SIZE); 1223 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 1224 + haddr, haddr + HPAGE_PMD_SIZE); 1229 1225 mmu_notifier_invalidate_range_start(&range); 1230 1226 1231 1227 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); ··· 1388 1384 vma, HPAGE_PMD_NR); 1389 1385 __SetPageUptodate(new_page); 1390 1386 1391 - mmu_notifier_range_init(&range, vma->vm_mm, haddr, 1392 - haddr + HPAGE_PMD_SIZE); 1387 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 1388 + haddr, haddr + HPAGE_PMD_SIZE); 1393 1389 mmu_notifier_invalidate_range_start(&range); 1394 1390 1395 1391 spin_lock(vmf->ptl); ··· 2064 2060 spinlock_t *ptl; 2065 2061 struct mmu_notifier_range range; 2066 2062 2067 - mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK, 2063 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 2064 + address & HPAGE_PUD_MASK, 2068 2065 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); 2069 2066 mmu_notifier_invalidate_range_start(&range); 2070 2067 ptl = pud_lock(vma->vm_mm, pud); ··· 2283 2278 spinlock_t *ptl; 2284 2279 struct mmu_notifier_range range; 2285 2280 2286 - mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK, 2281 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 2282 + address & HPAGE_PMD_MASK, 2287 
2283 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); 2288 2284 mmu_notifier_invalidate_range_start(&range); 2289 2285 ptl = pmd_lock(vma->vm_mm, pmd); ··· 2498 2492 if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) 2499 2493 shmem_uncharge(head->mapping->host, 1); 2500 2494 put_page(head + i); 2495 + } else if (!PageAnon(page)) { 2496 + __xa_store(&head->mapping->i_pages, head[i].index, 2497 + head + i, 0); 2501 2498 } 2502 2499 } 2503 2500
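Note on the huge_memory.c interface change above: vmf_insert_pfn_pmd()/vmf_insert_pfn_pud() now derive the vma, the faulting address and the page-table slot from the struct vm_fault instead of taking them as separate arguments. A minimal driver-side sketch of the new calling convention — the my_dev_pfn_for() lookup helper is hypothetical, not part of this series:

static vm_fault_t my_dev_huge_fault(struct vm_fault *vmf,
				    enum page_entry_size pe_size)
{
	pfn_t pfn;

	if (pe_size != PE_SIZE_PMD)
		return VM_FAULT_FALLBACK;

	/* hypothetical lookup of the device PFN backing this PMD */
	pfn = my_dev_pfn_for(vmf->vma, vmf->address & PMD_MASK);

	/* vma, address and vmf->pmd are all taken from vmf internally now */
	return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
}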
+119 -61
mm/hugetlb.c
··· 740 740 741 741 static inline struct resv_map *inode_resv_map(struct inode *inode) 742 742 { 743 - return inode->i_mapping->private_data; 743 + /* 744 + * At inode evict time, i_mapping may not point to the original 745 + * address space within the inode. This original address space 746 + * contains the pointer to the resv_map. So, always use the 747 + * address space embedded within the inode. 748 + * The VERY common case is inode->mapping == &inode->i_data but, 749 + * this may not be true for device special inodes. 750 + */ 751 + return (struct resv_map *)(&inode->i_data)->private_data; 744 752 } 745 753 746 754 static struct resv_map *vma_resv_map(struct vm_area_struct *vma) ··· 1067 1059 free_contig_range(page_to_pfn(page), 1 << order); 1068 1060 } 1069 1061 1062 + #ifdef CONFIG_CONTIG_ALLOC 1070 1063 static int __alloc_gigantic_page(unsigned long start_pfn, 1071 1064 unsigned long nr_pages, gfp_t gfp_mask) 1072 1065 { ··· 1152 1143 1153 1144 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); 1154 1145 static void prep_compound_gigantic_page(struct page *page, unsigned int order); 1146 + #else /* !CONFIG_CONTIG_ALLOC */ 1147 + static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 1148 + int nid, nodemask_t *nodemask) 1149 + { 1150 + return NULL; 1151 + } 1152 + #endif /* CONFIG_CONTIG_ALLOC */ 1155 1153 1156 1154 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ 1157 - static inline bool gigantic_page_supported(void) { return false; } 1158 1155 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, 1159 - int nid, nodemask_t *nodemask) { return NULL; } 1156 + int nid, nodemask_t *nodemask) 1157 + { 1158 + return NULL; 1159 + } 1160 1160 static inline void free_gigantic_page(struct page *page, unsigned int order) { } 1161 1161 static inline void destroy_compound_gigantic_page(struct page *page, 1162 1162 unsigned int order) { } ··· 1175 1157 { 1176 1158 int i; 1177 1159 1178 - if (hstate_is_gigantic(h) && !gigantic_page_supported()) 1160 + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 1179 1161 return; 1180 1162 1181 1163 h->nr_huge_pages--; ··· 1276 1258 ClearPagePrivate(page); 1277 1259 1278 1260 /* 1279 - * A return code of zero implies that the subpool will be under its 1280 - * minimum size if the reservation is not restored after page is free. 1281 - * Therefore, force restore_reserve operation. 1261 + * If PagePrivate() was set on page, page allocation consumed a 1262 + * reservation. If the page was associated with a subpool, there 1263 + * would have been a page reserved in the subpool before allocation 1264 + * via hugepage_subpool_get_pages(). Since we are 'restoring' the 1265 + * reservtion, do not call hugepage_subpool_put_pages() as this will 1266 + * remove the reserved page from the subpool. 1282 1267 */ 1283 - if (hugepage_subpool_put_pages(spool, 1) == 0) 1284 - restore_reserve = true; 1268 + if (!restore_reserve) { 1269 + /* 1270 + * A return code of zero implies that the subpool will be 1271 + * under its minimum size if the reservation is not restored 1272 + * after page is free. Therefore, force restore_reserve 1273 + * operation. 
1274 + */ 1275 + if (hugepage_subpool_put_pages(spool, 1) == 0) 1276 + restore_reserve = true; 1277 + } 1285 1278 1286 1279 spin_lock(&hugetlb_lock); 1287 1280 clear_page_huge_active(page); ··· 1603 1574 */ 1604 1575 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { 1605 1576 SetPageHugeTemporary(page); 1577 + spin_unlock(&hugetlb_lock); 1606 1578 put_page(page); 1607 - page = NULL; 1579 + return NULL; 1608 1580 } else { 1609 1581 h->surplus_huge_pages++; 1610 1582 h->surplus_huge_pages_node[page_to_nid(page)]++; ··· 2307 2277 } 2308 2278 2309 2279 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 2310 - static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, 2311 - nodemask_t *nodes_allowed) 2280 + static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, 2281 + nodemask_t *nodes_allowed) 2312 2282 { 2313 2283 unsigned long min_count, ret; 2314 2284 2315 - if (hstate_is_gigantic(h) && !gigantic_page_supported()) 2316 - return h->max_huge_pages; 2285 + spin_lock(&hugetlb_lock); 2286 + 2287 + /* 2288 + * Check for a node specific request. 2289 + * Changing node specific huge page count may require a corresponding 2290 + * change to the global count. In any case, the passed node mask 2291 + * (nodes_allowed) will restrict alloc/free to the specified node. 2292 + */ 2293 + if (nid != NUMA_NO_NODE) { 2294 + unsigned long old_count = count; 2295 + 2296 + count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 2297 + /* 2298 + * User may have specified a large count value which caused the 2299 + * above calculation to overflow. In this case, they wanted 2300 + * to allocate as many huge pages as possible. Set count to 2301 + * largest possible value to align with their intention. 2302 + */ 2303 + if (count < old_count) 2304 + count = ULONG_MAX; 2305 + } 2306 + 2307 + /* 2308 + * Gigantic pages runtime allocation depend on the capability for large 2309 + * page range allocation. 2310 + * If the system does not provide this feature, return an error when 2311 + * the user tries to allocate gigantic pages but let the user free the 2312 + * boottime allocated gigantic pages. 2313 + */ 2314 + if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { 2315 + if (count > persistent_huge_pages(h)) { 2316 + spin_unlock(&hugetlb_lock); 2317 + return -EINVAL; 2318 + } 2319 + /* Fall through to decrease pool */ 2320 + } 2317 2321 2318 2322 /* 2319 2323 * Increase the pool size ··· 2360 2296 * pool might be one hugepage larger than it needs to be, but 2361 2297 * within all the constraints specified by the sysctls. 
2362 2298 */ 2363 - spin_lock(&hugetlb_lock); 2364 2299 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 2365 2300 if (!adjust_pool_surplus(h, nodes_allowed, -1)) 2366 2301 break; ··· 2414 2351 break; 2415 2352 } 2416 2353 out: 2417 - ret = persistent_huge_pages(h); 2354 + h->max_huge_pages = persistent_huge_pages(h); 2418 2355 spin_unlock(&hugetlb_lock); 2419 - return ret; 2356 + 2357 + return 0; 2420 2358 } 2421 2359 2422 2360 #define HSTATE_ATTR_RO(_name) \ ··· 2467 2403 unsigned long count, size_t len) 2468 2404 { 2469 2405 int err; 2470 - NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); 2406 + nodemask_t nodes_allowed, *n_mask; 2471 2407 2472 - if (hstate_is_gigantic(h) && !gigantic_page_supported()) { 2473 - err = -EINVAL; 2474 - goto out; 2475 - } 2408 + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) 2409 + return -EINVAL; 2476 2410 2477 2411 if (nid == NUMA_NO_NODE) { 2478 2412 /* 2479 2413 * global hstate attribute 2480 2414 */ 2481 2415 if (!(obey_mempolicy && 2482 - init_nodemask_of_mempolicy(nodes_allowed))) { 2483 - NODEMASK_FREE(nodes_allowed); 2484 - nodes_allowed = &node_states[N_MEMORY]; 2485 - } 2486 - } else if (nodes_allowed) { 2416 + init_nodemask_of_mempolicy(&nodes_allowed))) 2417 + n_mask = &node_states[N_MEMORY]; 2418 + else 2419 + n_mask = &nodes_allowed; 2420 + } else { 2487 2421 /* 2488 - * per node hstate attribute: adjust count to global, 2489 - * but restrict alloc/free to the specified node. 2422 + * Node specific request. count adjustment happens in 2423 + * set_max_huge_pages() after acquiring hugetlb_lock. 2490 2424 */ 2491 - count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 2492 - init_nodemask_of_node(nodes_allowed, nid); 2493 - } else 2494 - nodes_allowed = &node_states[N_MEMORY]; 2425 + init_nodemask_of_node(&nodes_allowed, nid); 2426 + n_mask = &nodes_allowed; 2427 + } 2495 2428 2496 - h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); 2429 + err = set_max_huge_pages(h, count, nid, n_mask); 2497 2430 2498 - if (nodes_allowed != &node_states[N_MEMORY]) 2499 - NODEMASK_FREE(nodes_allowed); 2500 - 2501 - return len; 2502 - out: 2503 - NODEMASK_FREE(nodes_allowed); 2504 - return err; 2431 + return err ? err : len; 2505 2432 } 2506 2433 2507 2434 static ssize_t nr_hugepages_store_common(bool obey_mempolicy, ··· 3302 3247 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 3303 3248 3304 3249 if (cow) { 3305 - mmu_notifier_range_init(&range, src, vma->vm_start, 3250 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, 3251 + vma->vm_start, 3306 3252 vma->vm_end); 3307 3253 mmu_notifier_invalidate_range_start(&range); 3308 3254 } ··· 3415 3359 /* 3416 3360 * If sharing possible, alert mmu notifiers of worst case. 3417 3361 */ 3418 - mmu_notifier_range_init(&range, mm, start, end); 3362 + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, 3363 + end); 3419 3364 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 3420 3365 mmu_notifier_invalidate_range_start(&range); 3421 3366 address = start; ··· 3683 3626 pages_per_huge_page(h)); 3684 3627 __SetPageUptodate(new_page); 3685 3628 3686 - mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h)); 3629 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, 3630 + haddr + huge_page_size(h)); 3687 3631 mmu_notifier_invalidate_range_start(&range); 3688 3632 3689 3633 /* ··· 3835 3777 * handling userfault. 
Reacquire after handling 3836 3778 * fault to make calling code simpler. 3837 3779 */ 3838 - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, 3839 - idx, haddr); 3780 + hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); 3840 3781 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 3841 3782 ret = handle_userfault(&vmf, VM_UFFD_MISSING); 3842 3783 mutex_lock(&hugetlb_fault_mutex_table[hash]); ··· 3943 3886 } 3944 3887 3945 3888 #ifdef CONFIG_SMP 3946 - u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3947 - struct vm_area_struct *vma, 3948 - struct address_space *mapping, 3889 + u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, 3949 3890 pgoff_t idx, unsigned long address) 3950 3891 { 3951 3892 unsigned long key[2]; 3952 3893 u32 hash; 3953 3894 3954 - if (vma->vm_flags & VM_SHARED) { 3955 - key[0] = (unsigned long) mapping; 3956 - key[1] = idx; 3957 - } else { 3958 - key[0] = (unsigned long) mm; 3959 - key[1] = address >> huge_page_shift(h); 3960 - } 3895 + key[0] = (unsigned long) mapping; 3896 + key[1] = idx; 3961 3897 3962 3898 hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); 3963 3899 ··· 3961 3911 * For uniprocesor systems we always use a single mutex, so just 3962 3912 * return 0 and avoid the hashing overhead. 3963 3913 */ 3964 - u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3965 - struct vm_area_struct *vma, 3966 - struct address_space *mapping, 3914 + u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, 3967 3915 pgoff_t idx, unsigned long address) 3968 3916 { 3969 3917 return 0; ··· 4006 3958 * get spurious allocation failures if two CPUs race to instantiate 4007 3959 * the same page in the page cache. 4008 3960 */ 4009 - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); 3961 + hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); 4010 3962 mutex_lock(&hugetlb_fault_mutex_table[hash]); 4011 3963 4012 3964 entry = huge_ptep_get(ptep); ··· 4419 4371 * start/end. Set range.start/range.end to cover the maximum possible 4420 4372 * range if PMD sharing is possible. 4421 4373 */ 4422 - mmu_notifier_range_init(&range, mm, start, end); 4374 + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 4375 + 0, vma, mm, start, end); 4423 4376 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); 4424 4377 4425 4378 BUG_ON(address >= end); ··· 4526 4477 * called to make the mapping read-write. Assume !vma is a shm mapping 4527 4478 */ 4528 4479 if (!vma || vma->vm_flags & VM_MAYSHARE) { 4480 + /* 4481 + * resv_map can not be NULL as hugetlb_reserve_pages is only 4482 + * called for inodes for which resv_maps were created (see 4483 + * hugetlbfs_get_inode). 4484 + */ 4529 4485 resv_map = inode_resv_map(inode); 4530 4486 4531 4487 chg = region_chg(resv_map, from, to); ··· 4622 4568 struct hugepage_subpool *spool = subpool_inode(inode); 4623 4569 long gbl_reserve; 4624 4570 4571 + /* 4572 + * Since this routine can be called in the evict inode path for all 4573 + * hugetlbfs inodes, resv_map could be NULL. 4574 + */ 4625 4575 if (resv_map) { 4626 4576 chg = region_del(resv_map, start, end); 4627 4577 /*
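The node-specific count adjustment that used to live in __nr_hugepages_store_common() now happens inside set_max_huge_pages() under hugetlb_lock. A worked example with made-up numbers: if the global pool holds 100 huge pages, node 1 holds 30 of them, and 50 is written to node 1's nr_hugepages, the target becomes count = 50 + (100 - 30) = 120, i.e. grow the global pool to 120 with nodes_allowed restricting the new allocations to node 1. If the addition wraps because the user wrote an enormous value, count is clamped to ULONG_MAX, meaning "allocate as many as possible".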
+4 -3
mm/khugepaged.c
··· 1016 1016 pte = pte_offset_map(pmd, address); 1017 1017 pte_ptl = pte_lockptr(mm, pmd); 1018 1018 1019 - mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE); 1019 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, 1020 + address, address + HPAGE_PMD_SIZE); 1020 1021 mmu_notifier_invalidate_range_start(&range); 1021 1022 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ 1022 1023 /* ··· 1375 1374 result = SCAN_FAIL; 1376 1375 goto xa_locked; 1377 1376 } 1378 - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); 1377 + xas_store(&xas, new_page); 1379 1378 nr_none++; 1380 1379 continue; 1381 1380 } ··· 1451 1450 list_add_tail(&page->lru, &pagelist); 1452 1451 1453 1452 /* Finally, replace with the new page. */ 1454 - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); 1453 + xas_store(&xas, new_page); 1455 1454 continue; 1456 1455 out_unlock: 1457 1456 unlock_page(page);
+4 -2
mm/ksm.c
··· 1066 1066 1067 1067 BUG_ON(PageTransCompound(page)); 1068 1068 1069 - mmu_notifier_range_init(&range, mm, pvmw.address, 1069 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, 1070 + pvmw.address, 1070 1071 pvmw.address + PAGE_SIZE); 1071 1072 mmu_notifier_invalidate_range_start(&range); 1072 1073 ··· 1155 1154 if (!pmd) 1156 1155 goto out; 1157 1156 1158 - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); 1157 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, 1158 + addr + PAGE_SIZE); 1159 1159 mmu_notifier_invalidate_range_start(&range); 1160 1160 1161 1161 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+2 -1
mm/madvise.c
··· 472 472 range.end = min(vma->vm_end, end_addr); 473 473 if (range.end <= vma->vm_start) 474 474 return -EINVAL; 475 - mmu_notifier_range_init(&range, mm, range.start, range.end); 475 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, 476 + range.start, range.end); 476 477 477 478 lru_add_drain(); 478 479 tlb_gather_mmu(&tlb, mm, range.start, range.end);
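The madvise/ksm/khugepaged hunks above all follow the same conversion; a condensed sketch of the new mmu_notifier_range_init() argument order (event type and flags first, then the vma — which may be NULL when no vma is at hand — the mm, and finally the range):

static void clear_range_example(struct vm_area_struct *vma,
				unsigned long start, unsigned long end)
{
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				start, end);
	mmu_notifier_invalidate_range_start(&range);
	/* ... clear/unmap the PTEs in [start, end) under the usual locks ... */
	mmu_notifier_invalidate_range_end(&range);
}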
+67 -3
mm/memblock.c
··· 94 94 * :c:func:`mem_init` function frees all the memory to the buddy page 95 95 * allocator. 96 96 * 97 - * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the 97 + * Unless an architecure enables %CONFIG_ARCH_KEEP_MEMBLOCK, the 98 98 * memblock data structures will be discarded after the system 99 99 * initialization compltes. 100 100 */ ··· 375 375 } 376 376 } 377 377 378 - #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK 378 + #ifndef CONFIG_ARCH_KEEP_MEMBLOCK 379 379 /** 380 380 * memblock_discard - discard memory and reserved arrays if they were allocated 381 381 */ ··· 1255 1255 return 0; 1256 1256 } 1257 1257 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 1258 + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1259 + /** 1260 + * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() 1261 + * 1262 + * @idx: pointer to u64 loop variable 1263 + * @zone: zone in which all of the memory blocks reside 1264 + * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL 1265 + * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL 1266 + * 1267 + * This function is meant to be a zone/pfn specific wrapper for the 1268 + * for_each_mem_range type iterators. Specifically they are used in the 1269 + * deferred memory init routines and as such we were duplicating much of 1270 + * this logic throughout the code. So instead of having it in multiple 1271 + * locations it seemed like it would make more sense to centralize this to 1272 + * one new iterator that does everything they need. 1273 + */ 1274 + void __init_memblock 1275 + __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, 1276 + unsigned long *out_spfn, unsigned long *out_epfn) 1277 + { 1278 + int zone_nid = zone_to_nid(zone); 1279 + phys_addr_t spa, epa; 1280 + int nid; 1281 + 1282 + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, 1283 + &memblock.memory, &memblock.reserved, 1284 + &spa, &epa, &nid); 1285 + 1286 + while (*idx != U64_MAX) { 1287 + unsigned long epfn = PFN_DOWN(epa); 1288 + unsigned long spfn = PFN_UP(spa); 1289 + 1290 + /* 1291 + * Verify the end is at least past the start of the zone and 1292 + * that we have at least one PFN to initialize. 1293 + */ 1294 + if (zone->zone_start_pfn < epfn && spfn < epfn) { 1295 + /* if we went too far just stop searching */ 1296 + if (zone_end_pfn(zone) <= spfn) { 1297 + *idx = U64_MAX; 1298 + break; 1299 + } 1300 + 1301 + if (out_spfn) 1302 + *out_spfn = max(zone->zone_start_pfn, spfn); 1303 + if (out_epfn) 1304 + *out_epfn = min(zone_end_pfn(zone), epfn); 1305 + 1306 + return; 1307 + } 1308 + 1309 + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, 1310 + &memblock.memory, &memblock.reserved, 1311 + &spa, &epa, &nid); 1312 + } 1313 + 1314 + /* signal end of iteration */ 1315 + if (out_spfn) 1316 + *out_spfn = ULONG_MAX; 1317 + if (out_epfn) 1318 + *out_epfn = 0; 1319 + } 1320 + 1321 + #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 1258 1322 1259 1323 /** 1260 1324 * memblock_alloc_range_nid - allocate boot memory block ··· 1987 1923 return pages; 1988 1924 } 1989 1925 1990 - #if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) 1926 + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) 1991 1927 1992 1928 static int memblock_debug_show(struct seq_file *m, void *private) 1993 1929 {
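The new __next_mem_pfn_range_in_zone() helper only exists under CONFIG_DEFERRED_STRUCT_PAGE_INIT and backs the for_each_free_mem_pfn_range_in_zone*() iterators used by the deferred-init rework in page_alloc.c below. A rough sketch of driving it directly, using the termination convention from the hunk above (the dump function itself is made up):

static void __init dump_zone_free_ranges(struct zone *zone)
{
	unsigned long spfn, epfn;
	u64 i = 0;

	for (__next_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn);
	     i != U64_MAX;
	     __next_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn))
		pr_info("free range in %s: %lx-%lx\n", zone->name, spfn, epfn);
}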
+49 -36
mm/memcontrol.c
··· 725 725 __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); 726 726 } 727 727 728 - unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 729 - int nid, unsigned int lru_mask) 730 - { 731 - struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); 732 - unsigned long nr = 0; 733 - enum lru_list lru; 734 - 735 - VM_BUG_ON((unsigned)nid >= nr_node_ids); 736 - 737 - for_each_lru(lru) { 738 - if (!(BIT(lru) & lru_mask)) 739 - continue; 740 - nr += mem_cgroup_get_lru_size(lruvec, lru); 741 - } 742 - return nr; 743 - } 744 - 745 - static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 746 - unsigned int lru_mask) 747 - { 748 - unsigned long nr = 0; 749 - int nid; 750 - 751 - for_each_node_state(nid, N_MEMORY) 752 - nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 753 - return nr; 754 - } 755 - 756 728 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 757 729 enum mem_cgroup_events_target target) 758 730 { ··· 1330 1358 1331 1359 for (i = 0; i < NR_LRU_LISTS; i++) 1332 1360 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1333 - K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1361 + K(memcg_page_state(iter, NR_LRU_BASE + i))); 1334 1362 1335 1363 pr_cont("\n"); 1336 1364 } ··· 1394 1422 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1395 1423 int nid, bool noswap) 1396 1424 { 1397 - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1425 + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); 1426 + 1427 + if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) || 1428 + lruvec_page_state(lruvec, NR_ACTIVE_FILE)) 1398 1429 return true; 1399 1430 if (noswap || !total_swap_pages) 1400 1431 return false; 1401 - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1432 + if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) || 1433 + lruvec_page_state(lruvec, NR_ACTIVE_ANON)) 1402 1434 return true; 1403 1435 return false; 1404 1436 ··· 2966 2990 acc->events_array ? 
acc->events_array[i] : i); 2967 2991 2968 2992 for (i = 0; i < NR_LRU_LISTS; i++) 2969 - acc->lru_pages[i] += 2970 - mem_cgroup_nr_lru_pages(mi, BIT(i)); 2993 + acc->lru_pages[i] += memcg_page_state(mi, 2994 + NR_LRU_BASE + i); 2971 2995 } 2972 2996 } 2973 2997 ··· 3307 3331 #endif 3308 3332 3309 3333 #ifdef CONFIG_NUMA 3334 + 3335 + #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 3336 + #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 3337 + #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 3338 + 3339 + static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 3340 + int nid, unsigned int lru_mask) 3341 + { 3342 + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); 3343 + unsigned long nr = 0; 3344 + enum lru_list lru; 3345 + 3346 + VM_BUG_ON((unsigned)nid >= nr_node_ids); 3347 + 3348 + for_each_lru(lru) { 3349 + if (!(BIT(lru) & lru_mask)) 3350 + continue; 3351 + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 3352 + } 3353 + return nr; 3354 + } 3355 + 3356 + static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 3357 + unsigned int lru_mask) 3358 + { 3359 + unsigned long nr = 0; 3360 + enum lru_list lru; 3361 + 3362 + for_each_lru(lru) { 3363 + if (!(BIT(lru) & lru_mask)) 3364 + continue; 3365 + nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 3366 + } 3367 + return nr; 3368 + } 3369 + 3310 3370 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3311 3371 { 3312 3372 struct numa_stat { ··· 3433 3421 3434 3422 for (i = 0; i < NR_LRU_LISTS; i++) 3435 3423 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 3436 - mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 3424 + memcg_page_state(memcg, NR_LRU_BASE + i) * 3425 + PAGE_SIZE); 3437 3426 3438 3427 /* Hierarchical information */ 3439 3428 memory = memsw = PAGE_COUNTER_MAX; ··· 3940 3927 3941 3928 /* this should eventually include NR_UNSTABLE_NFS */ 3942 3929 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); 3943 - *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | 3944 - (1 << LRU_ACTIVE_FILE)); 3930 + *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + 3931 + memcg_exact_page_state(memcg, NR_ACTIVE_FILE); 3945 3932 *pheadroom = PAGE_COUNTER_MAX; 3946 3933 3947 3934 while ((parent = parent_mem_cgroup(memcg))) {
+2
mm/memfd.c
··· 39 39 xas_for_each(xas, page, ULONG_MAX) { 40 40 if (xa_is_value(page)) 41 41 continue; 42 + page = find_subpage(page, xas->xa_index); 42 43 if (page_count(page) - page_mapcount(page) > 1) 43 44 xas_set_mark(xas, MEMFD_TAG_PINNED); 44 45 ··· 89 88 bool clear = true; 90 89 if (xa_is_value(page)) 91 90 continue; 91 + page = find_subpage(page, xas.xa_index); 92 92 if (page_count(page) - page_mapcount(page) != 1) { 93 93 /* 94 94 * On the last scan, we clean up all those tags
+97 -9
mm/memory.c
··· 1010 1010 is_cow = is_cow_mapping(vma->vm_flags); 1011 1011 1012 1012 if (is_cow) { 1013 - mmu_notifier_range_init(&range, src_mm, addr, end); 1013 + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 1014 + 0, vma, src_mm, addr, end); 1014 1015 mmu_notifier_invalidate_range_start(&range); 1015 1016 } 1016 1017 ··· 1335 1334 { 1336 1335 struct mmu_notifier_range range; 1337 1336 1338 - mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr); 1337 + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, 1338 + start_addr, end_addr); 1339 1339 mmu_notifier_invalidate_range_start(&range); 1340 1340 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) 1341 1341 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); ··· 1358 1356 struct mmu_gather tlb; 1359 1357 1360 1358 lru_add_drain(); 1361 - mmu_notifier_range_init(&range, vma->vm_mm, start, start + size); 1359 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 1360 + start, start + size); 1362 1361 tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); 1363 1362 update_hiwater_rss(vma->vm_mm); 1364 1363 mmu_notifier_invalidate_range_start(&range); ··· 1385 1382 struct mmu_gather tlb; 1386 1383 1387 1384 lru_add_drain(); 1388 - mmu_notifier_range_init(&range, vma->vm_mm, address, address + size); 1385 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 1386 + address, address + size); 1389 1387 tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); 1390 1388 update_hiwater_rss(vma->vm_mm); 1391 1389 mmu_notifier_invalidate_range_start(&range); ··· 1526 1522 return insert_page(vma, addr, page, vma->vm_page_prot); 1527 1523 } 1528 1524 EXPORT_SYMBOL(vm_insert_page); 1525 + 1526 + /* 1527 + * __vm_map_pages - maps range of kernel pages into user vma 1528 + * @vma: user vma to map to 1529 + * @pages: pointer to array of source kernel pages 1530 + * @num: number of pages in page array 1531 + * @offset: user's requested vm_pgoff 1532 + * 1533 + * This allows drivers to map range of kernel pages into a user vma. 1534 + * 1535 + * Return: 0 on success and error code otherwise. 1536 + */ 1537 + static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, 1538 + unsigned long num, unsigned long offset) 1539 + { 1540 + unsigned long count = vma_pages(vma); 1541 + unsigned long uaddr = vma->vm_start; 1542 + int ret, i; 1543 + 1544 + /* Fail if the user requested offset is beyond the end of the object */ 1545 + if (offset > num) 1546 + return -ENXIO; 1547 + 1548 + /* Fail if the user requested size exceeds available object size */ 1549 + if (count > num - offset) 1550 + return -ENXIO; 1551 + 1552 + for (i = 0; i < count; i++) { 1553 + ret = vm_insert_page(vma, uaddr, pages[offset + i]); 1554 + if (ret < 0) 1555 + return ret; 1556 + uaddr += PAGE_SIZE; 1557 + } 1558 + 1559 + return 0; 1560 + } 1561 + 1562 + /** 1563 + * vm_map_pages - maps range of kernel pages starts with non zero offset 1564 + * @vma: user vma to map to 1565 + * @pages: pointer to array of source kernel pages 1566 + * @num: number of pages in page array 1567 + * 1568 + * Maps an object consisting of @num pages, catering for the user's 1569 + * requested vm_pgoff 1570 + * 1571 + * If we fail to insert any page into the vma, the function will return 1572 + * immediately leaving any previously inserted pages present. Callers 1573 + * from the mmap handler may immediately return the error as their caller 1574 + * will destroy the vma, removing any successfully inserted pages. 
Other 1575 + * callers should make their own arrangements for calling unmap_region(). 1576 + * 1577 + * Context: Process context. Called by mmap handlers. 1578 + * Return: 0 on success and error code otherwise. 1579 + */ 1580 + int vm_map_pages(struct vm_area_struct *vma, struct page **pages, 1581 + unsigned long num) 1582 + { 1583 + return __vm_map_pages(vma, pages, num, vma->vm_pgoff); 1584 + } 1585 + EXPORT_SYMBOL(vm_map_pages); 1586 + 1587 + /** 1588 + * vm_map_pages_zero - map range of kernel pages starts with zero offset 1589 + * @vma: user vma to map to 1590 + * @pages: pointer to array of source kernel pages 1591 + * @num: number of pages in page array 1592 + * 1593 + * Similar to vm_map_pages(), except that it explicitly sets the offset 1594 + * to 0. This function is intended for the drivers that did not consider 1595 + * vm_pgoff. 1596 + * 1597 + * Context: Process context. Called by mmap handlers. 1598 + * Return: 0 on success and error code otherwise. 1599 + */ 1600 + int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, 1601 + unsigned long num) 1602 + { 1603 + return __vm_map_pages(vma, pages, num, 0); 1604 + } 1605 + EXPORT_SYMBOL(vm_map_pages_zero); 1529 1606 1530 1607 static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1531 1608 pfn_t pfn, pgprot_t prot, bool mkwrite) ··· 2364 2279 2365 2280 __SetPageUptodate(new_page); 2366 2281 2367 - mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK, 2282 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, 2283 + vmf->address & PAGE_MASK, 2368 2284 (vmf->address & PAGE_MASK) + PAGE_SIZE); 2369 2285 mmu_notifier_invalidate_range_start(&range); 2370 2286 ··· 4190 4104 goto out; 4191 4105 4192 4106 if (range) { 4193 - mmu_notifier_range_init(range, mm, address & PMD_MASK, 4194 - (address & PMD_MASK) + PMD_SIZE); 4107 + mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, 4108 + NULL, mm, address & PMD_MASK, 4109 + (address & PMD_MASK) + PMD_SIZE); 4195 4110 mmu_notifier_invalidate_range_start(range); 4196 4111 } 4197 4112 *ptlp = pmd_lock(mm, pmd); ··· 4209 4122 goto out; 4210 4123 4211 4124 if (range) { 4212 - mmu_notifier_range_init(range, mm, address & PAGE_MASK, 4213 - (address & PAGE_MASK) + PAGE_SIZE); 4125 + mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm, 4126 + address & PAGE_MASK, 4127 + (address & PAGE_MASK) + PAGE_SIZE); 4214 4128 mmu_notifier_invalidate_range_start(range); 4215 4129 } 4216 4130 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
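The two new exports, vm_map_pages() and vm_map_pages_zero(), give drivers a common replacement for open-coded vm_insert_page() loops in mmap handlers. A sketch of typical use — struct my_buf and its fields are hypothetical driver state:

static int my_drv_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct my_buf *buf = file->private_data;

	/* maps buf->pages[] over the whole vma, honouring vma->vm_pgoff */
	return vm_map_pages(vma, buf->pages, buf->nr_pages);
}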
+52 -77
mm/memory_hotplug.c
··· 273 273 * add the new pages. 274 274 */ 275 275 int __ref __add_pages(int nid, unsigned long phys_start_pfn, 276 - unsigned long nr_pages, struct vmem_altmap *altmap, 277 - bool want_memblock) 276 + unsigned long nr_pages, struct mhp_restrictions *restrictions) 278 277 { 279 278 unsigned long i; 280 279 int err = 0; 281 280 int start_sec, end_sec; 281 + struct vmem_altmap *altmap = restrictions->altmap; 282 282 283 283 /* during initialize mem_map, align hot-added range to section */ 284 284 start_sec = pfn_to_section_nr(phys_start_pfn); ··· 299 299 300 300 for (i = start_sec; i <= end_sec; i++) { 301 301 err = __add_section(nid, section_nr_to_pfn(i), altmap, 302 - want_memblock); 302 + restrictions->flags & MHP_MEMBLOCK_API); 303 303 304 304 /* 305 305 * EEXIST is finally dealt with by ioresource collision ··· 516 516 pgdat_resize_unlock(zone->zone_pgdat, &flags); 517 517 } 518 518 519 - static int __remove_section(struct zone *zone, struct mem_section *ms, 520 - unsigned long map_offset, struct vmem_altmap *altmap) 519 + static void __remove_section(struct zone *zone, struct mem_section *ms, 520 + unsigned long map_offset, 521 + struct vmem_altmap *altmap) 521 522 { 522 523 unsigned long start_pfn; 523 524 int scn_nr; 524 - int ret = -EINVAL; 525 525 526 - if (!valid_section(ms)) 527 - return ret; 526 + if (WARN_ON_ONCE(!valid_section(ms))) 527 + return; 528 528 529 - ret = unregister_memory_section(ms); 530 - if (ret) 531 - return ret; 529 + unregister_memory_section(ms); 532 530 533 531 scn_nr = __section_nr(ms); 534 532 start_pfn = section_nr_to_pfn((unsigned long)scn_nr); 535 533 __remove_zone(zone, start_pfn); 536 534 537 535 sparse_remove_one_section(zone, ms, map_offset, altmap); 538 - return 0; 539 536 } 540 537 541 538 /** ··· 547 550 * sure that pages are marked reserved and zones are adjust properly by 548 551 * calling offline_pages(). 
549 552 */ 550 - int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, 551 - unsigned long nr_pages, struct vmem_altmap *altmap) 553 + void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, 554 + unsigned long nr_pages, struct vmem_altmap *altmap) 552 555 { 553 556 unsigned long i; 554 557 unsigned long map_offset = 0; 555 - int sections_to_remove, ret = 0; 558 + int sections_to_remove; 556 559 557 560 /* In the ZONE_DEVICE case device driver owns the memory region */ 558 561 if (is_dev_zone(zone)) { 559 562 if (altmap) 560 563 map_offset = vmem_altmap_offset(altmap); 561 - } else { 562 - resource_size_t start, size; 563 - 564 - start = phys_start_pfn << PAGE_SHIFT; 565 - size = nr_pages * PAGE_SIZE; 566 - 567 - ret = release_mem_region_adjustable(&iomem_resource, start, 568 - size); 569 - if (ret) { 570 - resource_size_t endres = start + size - 1; 571 - 572 - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", 573 - &start, &endres, ret); 574 - } 575 564 } 576 565 577 566 clear_zone_contiguous(zone); ··· 573 590 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 574 591 575 592 cond_resched(); 576 - ret = __remove_section(zone, __pfn_to_section(pfn), map_offset, 577 - altmap); 593 + __remove_section(zone, __pfn_to_section(pfn), map_offset, 594 + altmap); 578 595 map_offset = 0; 579 - if (ret) 580 - break; 581 596 } 582 597 583 598 set_zone_contiguous(zone); 584 - 585 - return ret; 586 599 } 587 600 #endif /* CONFIG_MEMORY_HOTREMOVE */ 588 601 ··· 693 714 if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) 694 715 arg->status_change_nid_normal = nid; 695 716 #ifdef CONFIG_HIGHMEM 696 - if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY)) 717 + if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY)) 697 718 arg->status_change_nid_high = nid; 698 719 #endif 699 720 } ··· 1076 1097 */ 1077 1098 int __ref add_memory_resource(int nid, struct resource *res) 1078 1099 { 1100 + struct mhp_restrictions restrictions = { 1101 + .flags = MHP_MEMBLOCK_API, 1102 + }; 1079 1103 u64 start, size; 1080 1104 bool new_node = false; 1081 1105 int ret; ··· 1106 1124 new_node = ret; 1107 1125 1108 1126 /* call arch's memory hotadd */ 1109 - ret = arch_add_memory(nid, start, size, NULL, true); 1127 + ret = arch_add_memory(nid, start, size, &restrictions); 1110 1128 if (ret < 0) 1111 1129 goto error; 1112 1130 ··· 1323 1341 if (!PageHuge(page)) 1324 1342 continue; 1325 1343 head = compound_head(page); 1326 - if (hugepage_migration_supported(page_hstate(head)) && 1327 - page_huge_active(head)) 1344 + if (page_huge_active(head)) 1328 1345 return pfn; 1329 1346 skip = (1 << compound_order(head)) - (page - head); 1330 1347 pfn += skip - 1; ··· 1363 1382 1364 1383 if (PageHuge(page)) { 1365 1384 struct page *head = compound_head(page); 1366 - if (compound_order(head) > PFN_SECTION_SHIFT) { 1367 - ret = -EBUSY; 1368 - break; 1369 - } 1370 1385 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1371 1386 isolate_huge_page(head, &source); 1372 1387 continue; ··· 1431 1454 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, 1432 1455 void *data) 1433 1456 { 1434 - __offline_isolated_pages(start, start + nr_pages); 1435 - return 0; 1436 - } 1457 + unsigned long *offlined_pages = (unsigned long *)data; 1437 1458 1438 - static void 1439 - offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 1440 - { 1441 - walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, 1442 - 
offline_isolated_pages_cb); 1459 + *offlined_pages += __offline_isolated_pages(start, start + nr_pages); 1460 + return 0; 1443 1461 } 1444 1462 1445 1463 /* ··· 1444 1472 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, 1445 1473 void *data) 1446 1474 { 1447 - int ret; 1448 - long offlined = *(long *)data; 1449 - ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); 1450 - offlined = nr_pages; 1451 - if (!ret) 1452 - *(long *)data += offlined; 1453 - return ret; 1454 - } 1455 - 1456 - static long 1457 - check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 1458 - { 1459 - long offlined = 0; 1460 - int ret; 1461 - 1462 - ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, 1463 - check_pages_isolated_cb); 1464 - if (ret < 0) 1465 - offlined = (long)ret; 1466 - return offlined; 1475 + return test_pages_isolated(start_pfn, start_pfn + nr_pages, true); 1467 1476 } 1468 1477 1469 1478 static int __init cmdline_parse_movable_node(char *p) ··· 1529 1576 unsigned long end_pfn) 1530 1577 { 1531 1578 unsigned long pfn, nr_pages; 1532 - long offlined_pages; 1579 + unsigned long offlined_pages = 0; 1533 1580 int ret, node, nr_isolate_pageblock; 1534 1581 unsigned long flags; 1535 1582 unsigned long valid_start, valid_end; ··· 1605 1652 goto failed_removal_isolated; 1606 1653 } 1607 1654 /* check again */ 1608 - offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1609 - } while (offlined_pages < 0); 1655 + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, 1656 + NULL, check_pages_isolated_cb); 1657 + } while (ret); 1610 1658 1611 - pr_info("Offlined Pages %ld\n", offlined_pages); 1612 1659 /* Ok, all of our target is isolated. 1613 1660 We cannot do rollback at this point. */ 1614 - offline_isolated_pages(start_pfn, end_pfn); 1615 - 1661 + walk_system_ram_range(start_pfn, end_pfn - start_pfn, 1662 + &offlined_pages, offline_isolated_pages_cb); 1663 + pr_info("Offlined Pages %ld\n", offlined_pages); 1616 1664 /* 1617 1665 * Onlining will reset pagetype flags and makes migrate type 1618 1666 * MOVABLE, so just need to decrease the number of isolated ··· 1797 1843 } 1798 1844 EXPORT_SYMBOL(try_offline_node); 1799 1845 1846 + static void __release_memory_resource(resource_size_t start, 1847 + resource_size_t size) 1848 + { 1849 + int ret; 1850 + 1851 + /* 1852 + * When removing memory in the same granularity as it was added, 1853 + * this function never fails. It might only fail if resources 1854 + * have to be adjusted or split. We'll ignore the error, as 1855 + * removing of memory cannot fail. 1856 + */ 1857 + ret = release_mem_region_adjustable(&iomem_resource, start, size); 1858 + if (ret) { 1859 + resource_size_t endres = start + size - 1; 1860 + 1861 + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", 1862 + &start, &endres, ret); 1863 + } 1864 + } 1865 + 1800 1866 /** 1801 1867 * remove_memory 1802 1868 * @nid: the node ID ··· 1851 1877 memblock_remove(start, size); 1852 1878 1853 1879 arch_remove_memory(nid, start, size, NULL); 1880 + __release_memory_resource(start, size); 1854 1881 1855 1882 try_offline_node(nid); 1856 1883
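arch_add_memory()/__add_pages() now take a struct mhp_restrictions instead of the bare altmap/want_memblock arguments. A sketch of a hot-add caller under the new interface, mirroring add_memory_resource() above (nid/start/size are assumed to be validated already):

static int my_hot_add(int nid, u64 start, u64 size)
{
	struct mhp_restrictions restrictions = {
		.flags	= MHP_MEMBLOCK_API,	/* register sections with the memory block API */
		.altmap	= NULL,			/* no device-provided struct page storage */
	};

	return arch_add_memory(nid, start, size, &restrictions);
}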
+5 -2
mm/migrate.c
··· 463 463 464 464 for (i = 1; i < HPAGE_PMD_NR; i++) { 465 465 xas_next(&xas); 466 - xas_store(&xas, newpage + i); 466 + xas_store(&xas, newpage); 467 467 } 468 468 } 469 469 ··· 2356 2356 mm_walk.mm = migrate->vma->vm_mm; 2357 2357 mm_walk.private = migrate; 2358 2358 2359 - mmu_notifier_range_init(&range, mm_walk.mm, migrate->start, 2359 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm, 2360 + migrate->start, 2360 2361 migrate->end); 2361 2362 mmu_notifier_invalidate_range_start(&range); 2362 2363 walk_page_range(migrate->start, migrate->end, &mm_walk); ··· 2765 2764 notified = true; 2766 2765 2767 2766 mmu_notifier_range_init(&range, 2767 + MMU_NOTIFY_CLEAR, 0, 2768 + NULL, 2768 2769 migrate->vma->vm_mm, 2769 2770 addr, migrate->end); 2770 2771 mmu_notifier_invalidate_range_start(&range);
+11 -1
mm/mmu_notifier.c
··· 180 180 if (_ret) { 181 181 pr_info("%pS callback failed with %d in %sblockable context.\n", 182 182 mn->ops->invalidate_range_start, _ret, 183 - !range->blockable ? "non-" : ""); 183 + !mmu_notifier_range_blockable(range) ? "non-" : ""); 184 184 ret = _ret; 185 185 } 186 186 } ··· 395 395 mmdrop(mm); 396 396 } 397 397 EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); 398 + 399 + bool 400 + mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) 401 + { 402 + if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) 403 + return false; 404 + /* Return true if the vma still have the read flag set. */ 405 + return range->vma->vm_flags & VM_READ; 406 + } 407 + EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);
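mmu_notifier_range_blockable() replaces direct range->blockable tests in notifier users as well. A sketch of an invalidate_range_start callback honouring it — the rwsem and the teardown step are hypothetical driver details:

static DECLARE_RWSEM(my_mirror_lock);

static int my_invalidate_range_start(struct mmu_notifier *mn,
				     const struct mmu_notifier_range *range)
{
	if (!mmu_notifier_range_blockable(range))
		return -EAGAIN;		/* caller cannot sleep; ask it to retry */

	down_read(&my_mirror_lock);
	/* ... invalidate secondary mappings covering [range->start, range->end) ... */
	up_read(&my_mirror_lock);
	return 0;
}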
+5 -4
mm/mprotect.c
··· 39 39 unsigned long addr, unsigned long end, pgprot_t newprot, 40 40 int dirty_accountable, int prot_numa) 41 41 { 42 - struct mm_struct *mm = vma->vm_mm; 43 42 pte_t *pte, oldpte; 44 43 spinlock_t *ptl; 45 44 unsigned long pages = 0; ··· 135 136 newpte = swp_entry_to_pte(entry); 136 137 if (pte_swp_soft_dirty(oldpte)) 137 138 newpte = pte_swp_mksoft_dirty(newpte); 138 - set_pte_at(mm, addr, pte, newpte); 139 + set_pte_at(vma->vm_mm, addr, pte, newpte); 139 140 140 141 pages++; 141 142 } ··· 149 150 */ 150 151 make_device_private_entry_read(&entry); 151 152 newpte = swp_entry_to_pte(entry); 152 - set_pte_at(mm, addr, pte, newpte); 153 + set_pte_at(vma->vm_mm, addr, pte, newpte); 153 154 154 155 pages++; 155 156 } ··· 184 185 185 186 /* invoke the mmu notifier if the pmd is populated */ 186 187 if (!range.start) { 187 - mmu_notifier_range_init(&range, vma->vm_mm, addr, end); 188 + mmu_notifier_range_init(&range, 189 + MMU_NOTIFY_PROTECTION_VMA, 0, 190 + vma, vma->vm_mm, addr, end); 188 191 mmu_notifier_invalidate_range_start(&range); 189 192 } 190 193
+2 -1
mm/mremap.c
··· 249 249 old_end = old_addr + len; 250 250 flush_cache_range(vma, old_addr, old_end); 251 251 252 - mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end); 252 + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, 253 + old_addr, old_end); 253 254 mmu_notifier_invalidate_range_start(&range); 254 255 255 256 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
+14
mm/nommu.c
··· 473 473 } 474 474 EXPORT_SYMBOL(vm_insert_page); 475 475 476 + int vm_map_pages(struct vm_area_struct *vma, struct page **pages, 477 + unsigned long num) 478 + { 479 + return -EINVAL; 480 + } 481 + EXPORT_SYMBOL(vm_map_pages); 482 + 483 + int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, 484 + unsigned long num) 485 + { 486 + return -EINVAL; 487 + } 488 + EXPORT_SYMBOL(vm_map_pages_zero); 489 + 476 490 /* 477 491 * sys_brk() for the most part doesn't need the global kernel 478 492 * lock, except when an application is doing something nasty
+2 -1
mm/oom_kill.c
··· 531 531 struct mmu_notifier_range range; 532 532 struct mmu_gather tlb; 533 533 534 - mmu_notifier_range_init(&range, mm, vma->vm_start, 534 + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, 535 + vma, mm, vma->vm_start, 535 536 vma->vm_end); 536 537 tlb_gather_mmu(&tlb, mm, range.start, range.end); 537 538 if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
+12
mm/page-writeback.c
··· 2808 2808 } 2809 2809 EXPORT_SYMBOL(__test_set_page_writeback); 2810 2810 2811 + /* 2812 + * Wait for a page to complete writeback 2813 + */ 2814 + void wait_on_page_writeback(struct page *page) 2815 + { 2816 + if (PageWriteback(page)) { 2817 + trace_wait_on_page_writeback(page, page_mapping(page)); 2818 + wait_on_page_bit(page, PG_writeback); 2819 + } 2820 + } 2821 + EXPORT_SYMBOL_GPL(wait_on_page_writeback); 2822 + 2811 2823 /** 2812 2824 * wait_for_stable_page() - wait for writeback to finish, if necessary. 2813 2825 * @page: The page to wait on.
+168 -104
mm/page_alloc.c
··· 1416 1416 #endif 1417 1417 1418 1418 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 1419 - static inline bool __meminit __maybe_unused 1420 - meminit_pfn_in_nid(unsigned long pfn, int node, 1421 - struct mminit_pfnnid_cache *state) 1419 + /* Only safe to use early in boot when initialisation is single-threaded */ 1420 + static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 1422 1421 { 1423 1422 int nid; 1424 1423 1425 - nid = __early_pfn_to_nid(pfn, state); 1424 + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); 1426 1425 if (nid >= 0 && nid != node) 1427 1426 return false; 1428 1427 return true; 1429 1428 } 1430 1429 1431 - /* Only safe to use early in boot when initialisation is single-threaded */ 1432 - static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 1433 - { 1434 - return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); 1435 - } 1436 - 1437 1430 #else 1438 - 1439 1431 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 1440 - { 1441 - return true; 1442 - } 1443 - static inline bool __meminit __maybe_unused 1444 - meminit_pfn_in_nid(unsigned long pfn, int node, 1445 - struct mminit_pfnnid_cache *state) 1446 1432 { 1447 1433 return true; 1448 1434 } ··· 1560 1574 * 1561 1575 * Then, we check if a current large page is valid by only checking the validity 1562 1576 * of the head pfn. 1563 - * 1564 - * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave 1565 - * within a node: a pfn is between start and end of a node, but does not belong 1566 - * to this memory node. 1567 1577 */ 1568 - static inline bool __init 1569 - deferred_pfn_valid(int nid, unsigned long pfn, 1570 - struct mminit_pfnnid_cache *nid_init_state) 1578 + static inline bool __init deferred_pfn_valid(unsigned long pfn) 1571 1579 { 1572 1580 if (!pfn_valid_within(pfn)) 1573 1581 return false; 1574 1582 if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) 1575 - return false; 1576 - if (!meminit_pfn_in_nid(pfn, nid, nid_init_state)) 1577 1583 return false; 1578 1584 return true; 1579 1585 } ··· 1574 1596 * Free pages to buddy allocator. Try to free aligned pages in 1575 1597 * pageblock_nr_pages sizes. 1576 1598 */ 1577 - static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, 1599 + static void __init deferred_free_pages(unsigned long pfn, 1578 1600 unsigned long end_pfn) 1579 1601 { 1580 - struct mminit_pfnnid_cache nid_init_state = { }; 1581 1602 unsigned long nr_pgmask = pageblock_nr_pages - 1; 1582 1603 unsigned long nr_free = 0; 1583 1604 1584 1605 for (; pfn < end_pfn; pfn++) { 1585 - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { 1606 + if (!deferred_pfn_valid(pfn)) { 1586 1607 deferred_free_range(pfn - nr_free, nr_free); 1587 1608 nr_free = 0; 1588 1609 } else if (!(pfn & nr_pgmask)) { ··· 1601 1624 * by performing it only once every pageblock_nr_pages. 1602 1625 * Return number of pages initialized. 
1603 1626 */ 1604 - static unsigned long __init deferred_init_pages(int nid, int zid, 1627 + static unsigned long __init deferred_init_pages(struct zone *zone, 1605 1628 unsigned long pfn, 1606 1629 unsigned long end_pfn) 1607 1630 { 1608 - struct mminit_pfnnid_cache nid_init_state = { }; 1609 1631 unsigned long nr_pgmask = pageblock_nr_pages - 1; 1632 + int nid = zone_to_nid(zone); 1610 1633 unsigned long nr_pages = 0; 1634 + int zid = zone_idx(zone); 1611 1635 struct page *page = NULL; 1612 1636 1613 1637 for (; pfn < end_pfn; pfn++) { 1614 - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { 1638 + if (!deferred_pfn_valid(pfn)) { 1615 1639 page = NULL; 1616 1640 continue; 1617 1641 } else if (!page || !(pfn & nr_pgmask)) { ··· 1627 1649 return (nr_pages); 1628 1650 } 1629 1651 1652 + /* 1653 + * This function is meant to pre-load the iterator for the zone init. 1654 + * Specifically it walks through the ranges until we are caught up to the 1655 + * first_init_pfn value and exits there. If we never encounter the value we 1656 + * return false indicating there are no valid ranges left. 1657 + */ 1658 + static bool __init 1659 + deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, 1660 + unsigned long *spfn, unsigned long *epfn, 1661 + unsigned long first_init_pfn) 1662 + { 1663 + u64 j; 1664 + 1665 + /* 1666 + * Start out by walking through the ranges in this zone that have 1667 + * already been initialized. We don't need to do anything with them 1668 + * so we just need to flush them out of the system. 1669 + */ 1670 + for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { 1671 + if (*epfn <= first_init_pfn) 1672 + continue; 1673 + if (*spfn < first_init_pfn) 1674 + *spfn = first_init_pfn; 1675 + *i = j; 1676 + return true; 1677 + } 1678 + 1679 + return false; 1680 + } 1681 + 1682 + /* 1683 + * Initialize and free pages. We do it in two loops: first we initialize 1684 + * struct page, then free to buddy allocator, because while we are 1685 + * freeing pages we can access pages that are ahead (computing buddy 1686 + * page in __free_one_page()). 1687 + * 1688 + * In order to try and keep some memory in the cache we have the loop 1689 + * broken along max page order boundaries. This way we will not cause 1690 + * any issues with the buddy page computation. 
1691 + */ 1692 + static unsigned long __init 1693 + deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, 1694 + unsigned long *end_pfn) 1695 + { 1696 + unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); 1697 + unsigned long spfn = *start_pfn, epfn = *end_pfn; 1698 + unsigned long nr_pages = 0; 1699 + u64 j = *i; 1700 + 1701 + /* First we loop through and initialize the page values */ 1702 + for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { 1703 + unsigned long t; 1704 + 1705 + if (mo_pfn <= *start_pfn) 1706 + break; 1707 + 1708 + t = min(mo_pfn, *end_pfn); 1709 + nr_pages += deferred_init_pages(zone, *start_pfn, t); 1710 + 1711 + if (mo_pfn < *end_pfn) { 1712 + *start_pfn = mo_pfn; 1713 + break; 1714 + } 1715 + } 1716 + 1717 + /* Reset values and now loop through freeing pages as needed */ 1718 + swap(j, *i); 1719 + 1720 + for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { 1721 + unsigned long t; 1722 + 1723 + if (mo_pfn <= spfn) 1724 + break; 1725 + 1726 + t = min(mo_pfn, epfn); 1727 + deferred_free_pages(spfn, t); 1728 + 1729 + if (mo_pfn <= epfn) 1730 + break; 1731 + } 1732 + 1733 + return nr_pages; 1734 + } 1735 + 1630 1736 /* Initialise remaining memory on a node */ 1631 1737 static int __init deferred_init_memmap(void *data) 1632 1738 { 1633 1739 pg_data_t *pgdat = data; 1634 - int nid = pgdat->node_id; 1635 - unsigned long start = jiffies; 1636 - unsigned long nr_pages = 0; 1637 - unsigned long spfn, epfn, first_init_pfn, flags; 1638 - phys_addr_t spa, epa; 1639 - int zid; 1640 - struct zone *zone; 1641 1740 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 1741 + unsigned long spfn = 0, epfn = 0, nr_pages = 0; 1742 + unsigned long first_init_pfn, flags; 1743 + unsigned long start = jiffies; 1744 + struct zone *zone; 1745 + int zid; 1642 1746 u64 i; 1643 1747 1644 1748 /* Bind memory initialisation thread to a local node if possible */ ··· 1746 1686 if (first_init_pfn < zone_end_pfn(zone)) 1747 1687 break; 1748 1688 } 1749 - first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); 1689 + 1690 + /* If the zone is empty somebody else may have cleared out the zone */ 1691 + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, 1692 + first_init_pfn)) 1693 + goto zone_empty; 1750 1694 1751 1695 /* 1752 - * Initialize and free pages. We do it in two loops: first we initialize 1753 - * struct page, than free to buddy allocator, because while we are 1754 - * freeing pages we can access pages that are ahead (computing buddy 1755 - * page in __free_one_page()). 1696 + * Initialize and free pages in MAX_ORDER sized increments so 1697 + * that we can avoid introducing any issues with the buddy 1698 + * allocator. 
1756 1699 */ 1757 - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { 1758 - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); 1759 - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); 1760 - nr_pages += deferred_init_pages(nid, zid, spfn, epfn); 1761 - } 1762 - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { 1763 - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); 1764 - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); 1765 - deferred_free_pages(nid, zid, spfn, epfn); 1766 - } 1700 + while (spfn < epfn) 1701 + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); 1702 + zone_empty: 1767 1703 pgdat_resize_unlock(pgdat, &flags); 1768 1704 1769 1705 /* Sanity check that the next zone really is unpopulated */ 1770 1706 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); 1771 1707 1772 - pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, 1773 - jiffies_to_msecs(jiffies - start)); 1708 + pr_info("node %d initialised, %lu pages in %ums\n", 1709 + pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start)); 1774 1710 1775 1711 pgdat_init_report_one_done(); 1776 1712 return 0; ··· 1790 1734 static noinline bool __init 1791 1735 deferred_grow_zone(struct zone *zone, unsigned int order) 1792 1736 { 1793 - int zid = zone_idx(zone); 1794 - int nid = zone_to_nid(zone); 1795 - pg_data_t *pgdat = NODE_DATA(nid); 1796 1737 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); 1797 - unsigned long nr_pages = 0; 1798 - unsigned long first_init_pfn, spfn, epfn, t, flags; 1738 + pg_data_t *pgdat = zone->zone_pgdat; 1799 1739 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; 1800 - phys_addr_t spa, epa; 1740 + unsigned long spfn, epfn, flags; 1741 + unsigned long nr_pages = 0; 1801 1742 u64 i; 1802 1743 1803 1744 /* Only the last zone may have deferred pages */ ··· 1823 1770 return true; 1824 1771 } 1825 1772 1826 - first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); 1827 - 1828 - if (first_init_pfn >= pgdat_end_pfn(pgdat)) { 1773 + /* If the zone is empty somebody else may have cleared out the zone */ 1774 + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, 1775 + first_deferred_pfn)) { 1776 + pgdat->first_deferred_pfn = ULONG_MAX; 1829 1777 pgdat_resize_unlock(pgdat, &flags); 1830 - return false; 1778 + return true; 1831 1779 } 1832 1780 1833 - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { 1834 - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); 1835 - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); 1781 + /* 1782 + * Initialize and free pages in MAX_ORDER sized increments so 1783 + * that we can avoid introducing any issues with the buddy 1784 + * allocator. 
1785 + */ 1786 + while (spfn < epfn) { 1787 + /* update our first deferred PFN for this section */ 1788 + first_deferred_pfn = spfn; 1836 1789 1837 - while (spfn < epfn && nr_pages < nr_pages_needed) { 1838 - t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); 1839 - first_deferred_pfn = min(t, epfn); 1840 - nr_pages += deferred_init_pages(nid, zid, spfn, 1841 - first_deferred_pfn); 1842 - spfn = first_deferred_pfn; 1843 - } 1790 + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); 1844 1791 1792 + /* We should only stop along section boundaries */ 1793 + if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) 1794 + continue; 1795 + 1796 + /* If our quota has been met we can stop here */ 1845 1797 if (nr_pages >= nr_pages_needed) 1846 1798 break; 1847 1799 } 1848 1800 1849 - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { 1850 - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); 1851 - epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); 1852 - deferred_free_pages(nid, zid, spfn, epfn); 1853 - 1854 - if (first_deferred_pfn == epfn) 1855 - break; 1856 - } 1857 - pgdat->first_deferred_pfn = first_deferred_pfn; 1801 + pgdat->first_deferred_pfn = spfn; 1858 1802 pgdat_resize_unlock(pgdat, &flags); 1859 1803 1860 1804 return nr_pages > 0; ··· 1896 1846 /* Reinit limits that are based on free pages after the kernel is up */ 1897 1847 files_maxfiles_init(); 1898 1848 #endif 1899 - #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK 1849 + 1900 1850 /* Discard memblock private memory */ 1901 1851 memblock_discard(); 1902 - #endif 1903 1852 1904 1853 for_each_populated_zone(zone) 1905 1854 set_zone_contiguous(zone); ··· 3169 3120 3170 3121 /* Lock and remove page from the per-cpu list */ 3171 3122 static struct page *rmqueue_pcplist(struct zone *preferred_zone, 3172 - struct zone *zone, unsigned int order, 3173 - gfp_t gfp_flags, int migratetype, 3174 - unsigned int alloc_flags) 3123 + struct zone *zone, gfp_t gfp_flags, 3124 + int migratetype, unsigned int alloc_flags) 3175 3125 { 3176 3126 struct per_cpu_pages *pcp; 3177 3127 struct list_head *list; ··· 3182 3134 list = &pcp->lists[migratetype]; 3183 3135 page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); 3184 3136 if (page) { 3185 - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 3137 + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); 3186 3138 zone_statistics(preferred_zone, zone); 3187 3139 } 3188 3140 local_irq_restore(flags); ··· 3202 3154 struct page *page; 3203 3155 3204 3156 if (likely(order == 0)) { 3205 - page = rmqueue_pcplist(preferred_zone, zone, order, 3206 - gfp_flags, migratetype, alloc_flags); 3157 + page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, 3158 + migratetype, alloc_flags); 3207 3159 goto out; 3208 3160 } 3209 3161 ··· 4869 4821 /** 4870 4822 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 4871 4823 * @size: the number of bytes to allocate 4872 - * @gfp_mask: GFP flags for the allocation 4824 + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP 4873 4825 * 4874 4826 * This function is similar to alloc_pages(), except that it allocates the 4875 4827 * minimum number of pages to satisfy the request. 
alloc_pages() can only ··· 4886 4838 unsigned int order = get_order(size); 4887 4839 unsigned long addr; 4888 4840 4841 + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) 4842 + gfp_mask &= ~__GFP_COMP; 4843 + 4889 4844 addr = __get_free_pages(gfp_mask, order); 4890 4845 return make_alloc_exact(addr, order, size); 4891 4846 } ··· 4899 4848 * pages on a node. 4900 4849 * @nid: the preferred node ID where memory should be allocated 4901 4850 * @size: the number of bytes to allocate 4902 - * @gfp_mask: GFP flags for the allocation 4851 + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP 4903 4852 * 4904 4853 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 4905 4854 * back. ··· 4909 4858 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 4910 4859 { 4911 4860 unsigned int order = get_order(size); 4912 - struct page *p = alloc_pages_node(nid, gfp_mask, order); 4861 + struct page *p; 4862 + 4863 + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) 4864 + gfp_mask &= ~__GFP_COMP; 4865 + 4866 + p = alloc_pages_node(nid, gfp_mask, order); 4913 4867 if (!p) 4914 4868 return NULL; 4915 4869 return make_alloc_exact((unsigned long)page_address(p), order, size); ··· 6303 6247 unsigned long *zone_end_pfn, 6304 6248 unsigned long *ignored) 6305 6249 { 6250 + unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 6251 + unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 6306 6252 /* When hotadd a new node from cpu_up(), the node should be empty */ 6307 6253 if (!node_start_pfn && !node_end_pfn) 6308 6254 return 0; 6309 6255 6310 6256 /* Get the start and end of the zone */ 6311 - *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 6312 - *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 6257 + *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 6258 + *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 6313 6259 adjust_zone_range_for_zone_movable(nid, zone_type, 6314 6260 node_start_pfn, node_end_pfn, 6315 6261 zone_start_pfn, zone_end_pfn); ··· 8187 8129 return true; 8188 8130 } 8189 8131 8190 - #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) 8191 - 8132 + #ifdef CONFIG_CONTIG_ALLOC 8192 8133 static unsigned long pfn_max_align_down(unsigned long pfn) 8193 8134 { 8194 8135 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, ··· 8396 8339 pfn_max_align_up(end), migratetype); 8397 8340 return ret; 8398 8341 } 8342 + #endif /* CONFIG_CONTIG_ALLOC */ 8399 8343 8400 - void free_contig_range(unsigned long pfn, unsigned nr_pages) 8344 + void free_contig_range(unsigned long pfn, unsigned int nr_pages) 8401 8345 { 8402 8346 unsigned int count = 0; 8403 8347 ··· 8410 8352 } 8411 8353 WARN(count != 0, "%d pages are still in use!\n", count); 8412 8354 } 8413 - #endif 8414 8355 8415 8356 #ifdef CONFIG_MEMORY_HOTPLUG 8416 8357 /* ··· 8451 8394 * All pages in the range must be in a single zone and isolated 8452 8395 * before calling this. 
8453 8396 */ 8454 - void 8397 + unsigned long 8455 8398 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 8456 8399 { 8457 8400 struct page *page; ··· 8459 8402 unsigned int order, i; 8460 8403 unsigned long pfn; 8461 8404 unsigned long flags; 8405 + unsigned long offlined_pages = 0; 8406 + 8462 8407 /* find the first valid pfn */ 8463 8408 for (pfn = start_pfn; pfn < end_pfn; pfn++) 8464 8409 if (pfn_valid(pfn)) 8465 8410 break; 8466 8411 if (pfn == end_pfn) 8467 - return; 8412 + return offlined_pages; 8413 + 8468 8414 offline_mem_sections(pfn, end_pfn); 8469 8415 zone = page_zone(pfn_to_page(pfn)); 8470 8416 spin_lock_irqsave(&zone->lock, flags); ··· 8485 8425 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 8486 8426 pfn++; 8487 8427 SetPageReserved(page); 8428 + offlined_pages++; 8488 8429 continue; 8489 8430 } 8490 8431 8491 8432 BUG_ON(page_count(page)); 8492 8433 BUG_ON(!PageBuddy(page)); 8493 8434 order = page_order(page); 8435 + offlined_pages += 1 << order; 8494 8436 #ifdef CONFIG_DEBUG_VM 8495 8437 pr_info("remove from free list %lx %d %lx\n", 8496 8438 pfn, 1 << order, end_pfn); ··· 8505 8443 pfn += (1 << order); 8506 8444 } 8507 8445 spin_unlock_irqrestore(&zone->lock, flags); 8446 + 8447 + return offlined_pages; 8508 8448 } 8509 8449 #endif 8510 8450
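The zone_spanned_pages_in_node() hunk above stops taking the raw arch zone bounds and instead clamps the node's own pfn range into them. A minimal userspace sketch of the same interval-clamping idea; clamp_ul() is a local stand-in for the kernel's clamp() macro and the pfn values are invented:

#include <stdio.h>

/* local stand-in for the kernel's clamp() macro */
static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        /* hypothetical node and zone pfn ranges */
        unsigned long node_start_pfn = 0x1000, node_end_pfn = 0x9000;
        unsigned long zone_low = 0x4000, zone_high = 0x100000;

        unsigned long zone_start_pfn = clamp_ul(node_start_pfn, zone_low, zone_high);
        unsigned long zone_end_pfn = clamp_ul(node_end_pfn, zone_low, zone_high);

        /* pages this zone spans on this node; zero if the ranges do not overlap */
        printf("spanned: %lu\n", zone_end_pfn - zone_start_pfn);
        return 0;
}

The point of the clamp is that a node whose memory only partially overlaps a zone no longer reports the whole arch-possible zone range as spanned.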
-2
mm/page_isolation.c
··· 151 151 for (i = 0; i < nr_pages; i++) { 152 152 struct page *page; 153 153 154 - if (!pfn_valid_within(pfn + i)) 155 - continue; 156 154 page = pfn_to_online_page(pfn + i); 157 155 if (!page) 158 156 continue;
+6 -4
mm/rmap.c
··· 850 850 }; 851 851 852 852 *vm_flags = 0; 853 - if (!page_mapped(page)) 853 + if (!pra.mapcount) 854 854 return 0; 855 855 856 856 if (!page_rmapping(page)) ··· 896 896 * We have to assume the worse case ie pmd for invalidation. Note that 897 897 * the page can not be free from this function. 898 898 */ 899 - mmu_notifier_range_init(&range, vma->vm_mm, address, 899 + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 900 + 0, vma, vma->vm_mm, address, 900 901 min(vma->vm_end, address + 901 902 (PAGE_SIZE << compound_order(page)))); 902 903 mmu_notifier_invalidate_range_start(&range); ··· 929 928 continue; 930 929 931 930 flush_cache_page(vma, address, page_to_pfn(page)); 932 - entry = pmdp_huge_clear_flush(vma, address, pmd); 931 + entry = pmdp_invalidate(vma, address, pmd); 933 932 entry = pmd_wrprotect(entry); 934 933 entry = pmd_mkclean(entry); 935 934 set_pmd_at(vma->vm_mm, address, pmd, entry); ··· 1372 1371 * Note that the page can not be free in this function as call of 1373 1372 * try_to_unmap() must hold a reference on the page. 1374 1373 */ 1375 - mmu_notifier_range_init(&range, vma->vm_mm, address, 1374 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 1375 + address, 1376 1376 min(vma->vm_end, address + 1377 1377 (PAGE_SIZE << compound_order(page)))); 1378 1378 if (PageHuge(page)) {
+1 -1
mm/shmem.c
··· 614 614 if (xas_error(&xas)) 615 615 goto unlock; 616 616 next: 617 - xas_store(&xas, page + i); 617 + xas_store(&xas, page); 618 618 if (++i < nr) { 619 619 xas_next(&xas); 620 620 goto next;
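The shmem change above (and the matching hunk in mm/swap_state.c further down) stores the head page at every index covered by a compound page instead of page + i. A rough userspace analogy, with a plain pointer array standing in for the XArray and a made-up struct fake_page: every covered slot maps back to the same head object, and the caller derives the sub-page from the index:

#include <stdio.h>

struct fake_page { int order; };                /* stand-in for struct page */

#define NR_SLOTS 16

int main(void)
{
        static struct fake_page head = { .order = 2 };  /* covers 4 slots */
        struct fake_page *slots[NR_SLOTS] = { 0 };      /* stand-in for the XArray */
        unsigned long idx, start = 8, nr = 1UL << head.order;

        /* store the head at every covered index, as the new code does */
        for (idx = start; idx < start + nr; idx++)
                slots[idx] = &head;

        /* a lookup at any covered index yields the head; the offset is idx - start */
        idx = 10;
        printf("slot %lu -> head %p, subpage offset %lu\n",
               idx, (void *)slots[idx], idx - start);
        return 0;
}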
+32 -29
mm/slab.c
··· 990 990 991 991 /* cpu is dead; no one can alloc from it. */ 992 992 nc = per_cpu_ptr(cachep->cpu_cache, cpu); 993 - if (nc) { 994 - free_block(cachep, nc->entry, nc->avail, node, &list); 995 - nc->avail = 0; 996 - } 993 + free_block(cachep, nc->entry, nc->avail, node, &list); 994 + nc->avail = 0; 997 995 998 996 if (!cpumask_empty(mask)) { 999 997 spin_unlock_irq(&n->list_lock); ··· 1672 1674 { 1673 1675 struct page *page, *n; 1674 1676 1675 - list_for_each_entry_safe(page, n, list, lru) { 1676 - list_del(&page->lru); 1677 + list_for_each_entry_safe(page, n, list, slab_list) { 1678 + list_del(&page->slab_list); 1677 1679 slab_destroy(cachep, page); 1678 1680 } 1679 1681 } ··· 2229 2231 goto out; 2230 2232 } 2231 2233 2232 - page = list_entry(p, struct page, lru); 2233 - list_del(&page->lru); 2234 + page = list_entry(p, struct page, slab_list); 2235 + list_del(&page->slab_list); 2234 2236 n->free_slabs--; 2235 2237 n->total_slabs--; 2236 2238 /* ··· 2689 2691 if (!page) 2690 2692 return; 2691 2693 2692 - INIT_LIST_HEAD(&page->lru); 2694 + INIT_LIST_HEAD(&page->slab_list); 2693 2695 n = get_node(cachep, page_to_nid(page)); 2694 2696 2695 2697 spin_lock(&n->list_lock); 2696 2698 n->total_slabs++; 2697 2699 if (!page->active) { 2698 - list_add_tail(&page->lru, &(n->slabs_free)); 2700 + list_add_tail(&page->slab_list, &n->slabs_free); 2699 2701 n->free_slabs++; 2700 2702 } else 2701 2703 fixup_slab_list(cachep, n, page, &list); ··· 2804 2806 void **list) 2805 2807 { 2806 2808 /* move slabp to correct slabp list: */ 2807 - list_del(&page->lru); 2809 + list_del(&page->slab_list); 2808 2810 if (page->active == cachep->num) { 2809 - list_add(&page->lru, &n->slabs_full); 2811 + list_add(&page->slab_list, &n->slabs_full); 2810 2812 if (OBJFREELIST_SLAB(cachep)) { 2811 2813 #if DEBUG 2812 2814 /* Poisoning will be done without holding the lock */ ··· 2820 2822 page->freelist = NULL; 2821 2823 } 2822 2824 } else 2823 - list_add(&page->lru, &n->slabs_partial); 2825 + list_add(&page->slab_list, &n->slabs_partial); 2824 2826 } 2825 2827 2826 2828 /* Try to find non-pfmemalloc slab if needed */ ··· 2843 2845 } 2844 2846 2845 2847 /* Move pfmemalloc slab to the end of list to speed up next search */ 2846 - list_del(&page->lru); 2848 + list_del(&page->slab_list); 2847 2849 if (!page->active) { 2848 - list_add_tail(&page->lru, &n->slabs_free); 2850 + list_add_tail(&page->slab_list, &n->slabs_free); 2849 2851 n->free_slabs++; 2850 2852 } else 2851 - list_add_tail(&page->lru, &n->slabs_partial); 2853 + list_add_tail(&page->slab_list, &n->slabs_partial); 2852 2854 2853 - list_for_each_entry(page, &n->slabs_partial, lru) { 2855 + list_for_each_entry(page, &n->slabs_partial, slab_list) { 2854 2856 if (!PageSlabPfmemalloc(page)) 2855 2857 return page; 2856 2858 } 2857 2859 2858 2860 n->free_touched = 1; 2859 - list_for_each_entry(page, &n->slabs_free, lru) { 2861 + list_for_each_entry(page, &n->slabs_free, slab_list) { 2860 2862 if (!PageSlabPfmemalloc(page)) { 2861 2863 n->free_slabs--; 2862 2864 return page; ··· 2871 2873 struct page *page; 2872 2874 2873 2875 assert_spin_locked(&n->list_lock); 2874 - page = list_first_entry_or_null(&n->slabs_partial, struct page, lru); 2876 + page = list_first_entry_or_null(&n->slabs_partial, struct page, 2877 + slab_list); 2875 2878 if (!page) { 2876 2879 n->free_touched = 1; 2877 2880 page = list_first_entry_or_null(&n->slabs_free, struct page, 2878 - lru); 2881 + slab_list); 2879 2882 if (page) 2880 2883 n->free_slabs--; 2881 2884 } ··· 3377 3378 objp = objpp[i]; 3378 
3379 3379 3380 page = virt_to_head_page(objp); 3380 - list_del(&page->lru); 3381 + list_del(&page->slab_list); 3381 3382 check_spinlock_acquired_node(cachep, node); 3382 3383 slab_put_obj(cachep, page, objp); 3383 3384 STATS_DEC_ACTIVE(cachep); 3384 3385 3385 3386 /* fixup slab chains */ 3386 3387 if (page->active == 0) { 3387 - list_add(&page->lru, &n->slabs_free); 3388 + list_add(&page->slab_list, &n->slabs_free); 3388 3389 n->free_slabs++; 3389 3390 } else { 3390 3391 /* Unconditionally move a slab to the end of the 3391 3392 * partial list on free - maximum time for the 3392 3393 * other objects to be freed, too. 3393 3394 */ 3394 - list_add_tail(&page->lru, &n->slabs_partial); 3395 + list_add_tail(&page->slab_list, &n->slabs_partial); 3395 3396 } 3396 3397 } 3397 3398 3398 3399 while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) { 3399 3400 n->free_objects -= cachep->num; 3400 3401 3401 - page = list_last_entry(&n->slabs_free, struct page, lru); 3402 - list_move(&page->lru, list); 3402 + page = list_last_entry(&n->slabs_free, struct page, slab_list); 3403 + list_move(&page->slab_list, list); 3403 3404 n->free_slabs--; 3404 3405 n->total_slabs--; 3405 3406 } ··· 3437 3438 int i = 0; 3438 3439 struct page *page; 3439 3440 3440 - list_for_each_entry(page, &n->slabs_free, lru) { 3441 + list_for_each_entry(page, &n->slabs_free, slab_list) { 3441 3442 BUG_ON(page->active); 3442 3443 3443 3444 i++; ··· 4291 4292 * whole processing. 4292 4293 */ 4293 4294 do { 4294 - set_store_user_clean(cachep); 4295 4295 drain_cpu_caches(cachep); 4296 + /* 4297 + * drain_cpu_caches() could make kmemleak_object and 4298 + * debug_objects_cache dirty, so reset afterwards. 4299 + */ 4300 + set_store_user_clean(cachep); 4296 4301 4297 4302 x[1] = 0; 4298 4303 ··· 4305 4302 check_irq_on(); 4306 4303 spin_lock_irq(&n->list_lock); 4307 4304 4308 - list_for_each_entry(page, &n->slabs_full, lru) 4305 + list_for_each_entry(page, &n->slabs_full, slab_list) 4309 4306 handle_slab(x, cachep, page); 4310 - list_for_each_entry(page, &n->slabs_partial, lru) 4307 + list_for_each_entry(page, &n->slabs_partial, slab_list) 4311 4308 handle_slab(x, cachep, page); 4312 4309 spin_unlock_irq(&n->list_lock); 4313 4310 }
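The mm/slab.c hunks (like the mm/slub.c and mm/slob.c ones below) only rename the list linkage from page->lru to page->slab_list; the intrusive-list mechanics are unchanged. A small self-contained sketch of that mechanism with a local list_head and container_of rather than the kernel's, and an invented fake_page type:

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *prev, *next; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static void list_add_tail(struct list_head *new, struct list_head *head)
{
        new->prev = head->prev;
        new->next = head;
        head->prev->next = new;
        head->prev = new;
}

/* a page-like object that sits on a slab list via a dedicated member */
struct fake_page { int id; struct list_head slab_list; };

int main(void)
{
        struct list_head partial = { &partial, &partial };
        struct fake_page a = { .id = 1 }, b = { .id = 2 };
        struct list_head *pos;

        list_add_tail(&a.slab_list, &partial);
        list_add_tail(&b.slab_list, &partial);

        for (pos = partial.next; pos != &partial; pos = pos->next) {
                struct fake_page *p = container_of(pos, struct fake_page, slab_list);
                printf("page %d on slab list\n", p->id);
        }
        return 0;
}

Giving the member its own name means the slab allocators no longer overload a field whose name suggests LRU membership.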
+41 -18
mm/slob.c
··· 112 112 113 113 static void set_slob_page_free(struct page *sp, struct list_head *list) 114 114 { 115 - list_add(&sp->lru, list); 115 + list_add(&sp->slab_list, list); 116 116 __SetPageSlobFree(sp); 117 117 } 118 118 119 119 static inline void clear_slob_page_free(struct page *sp) 120 120 { 121 - list_del(&sp->lru); 121 + list_del(&sp->slab_list); 122 122 __ClearPageSlobFree(sp); 123 123 } 124 124 ··· 213 213 } 214 214 215 215 /* 216 - * Allocate a slob block within a given slob_page sp. 216 + * slob_page_alloc() - Allocate a slob block within a given slob_page sp. 217 + * @sp: Page to look in. 218 + * @size: Size of the allocation. 219 + * @align: Allocation alignment. 220 + * @page_removed_from_list: Return parameter. 221 + * 222 + * Tries to find a chunk of memory at least @size bytes big within @page. 223 + * 224 + * Return: Pointer to memory if allocated, %NULL otherwise. If the 225 + * allocation fills up @page then the page is removed from the 226 + * freelist, in this case @page_removed_from_list will be set to 227 + * true (set to false otherwise). 217 228 */ 218 - static void *slob_page_alloc(struct page *sp, size_t size, int align) 229 + static void *slob_page_alloc(struct page *sp, size_t size, int align, 230 + bool *page_removed_from_list) 219 231 { 220 232 slob_t *prev, *cur, *aligned = NULL; 221 233 int delta = 0, units = SLOB_UNITS(size); 222 234 235 + *page_removed_from_list = false; 223 236 for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { 224 237 slobidx_t avail = slob_units(cur); 225 238 ··· 267 254 } 268 255 269 256 sp->units -= units; 270 - if (!sp->units) 257 + if (!sp->units) { 271 258 clear_slob_page_free(sp); 259 + *page_removed_from_list = true; 260 + } 272 261 return cur; 273 262 } 274 263 if (slob_last(cur)) ··· 284 269 static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) 285 270 { 286 271 struct page *sp; 287 - struct list_head *prev; 288 272 struct list_head *slob_list; 289 273 slob_t *b = NULL; 290 274 unsigned long flags; 275 + bool _unused; 291 276 292 277 if (size < SLOB_BREAK1) 293 278 slob_list = &free_slob_small; ··· 298 283 299 284 spin_lock_irqsave(&slob_lock, flags); 300 285 /* Iterate through each partially free page, try to find room */ 301 - list_for_each_entry(sp, slob_list, lru) { 286 + list_for_each_entry(sp, slob_list, slab_list) { 287 + bool page_removed_from_list = false; 302 288 #ifdef CONFIG_NUMA 303 289 /* 304 290 * If there's a node specification, search for a partial ··· 312 296 if (sp->units < SLOB_UNITS(size)) 313 297 continue; 314 298 315 - /* Attempt to alloc */ 316 - prev = sp->lru.prev; 317 - b = slob_page_alloc(sp, size, align); 299 + b = slob_page_alloc(sp, size, align, &page_removed_from_list); 318 300 if (!b) 319 301 continue; 320 302 321 - /* Improve fragment distribution and reduce our average 322 - * search time by starting our next search here. (see 323 - * Knuth vol 1, sec 2.5, pg 449) */ 324 - if (prev != slob_list->prev && 325 - slob_list->next != prev->next) 326 - list_move_tail(slob_list, prev->next); 303 + /* 304 + * If slob_page_alloc() removed sp from the list then we 305 + * cannot call list functions on sp. If so allocation 306 + * did not fragment the page anyway so optimisation is 307 + * unnecessary. 308 + */ 309 + if (!page_removed_from_list) { 310 + /* 311 + * Improve fragment distribution and reduce our average 312 + * search time by starting our next search here. 
(see 313 + * Knuth vol 1, sec 2.5, pg 449) 314 + */ 315 + if (!list_is_first(&sp->slab_list, slob_list)) 316 + list_rotate_to_front(&sp->slab_list, slob_list); 317 + } 327 318 break; 328 319 } 329 320 spin_unlock_irqrestore(&slob_lock, flags); ··· 346 323 spin_lock_irqsave(&slob_lock, flags); 347 324 sp->units = SLOB_UNITS(PAGE_SIZE); 348 325 sp->freelist = b; 349 - INIT_LIST_HEAD(&sp->lru); 326 + INIT_LIST_HEAD(&sp->slab_list); 350 327 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); 351 328 set_slob_page_free(sp, slob_list); 352 - b = slob_page_alloc(sp, size, align); 329 + b = slob_page_alloc(sp, size, align, &_unused); 353 330 BUG_ON(!b); 354 331 spin_unlock_irqrestore(&slob_lock, flags); 355 332 }
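The slob hunk above has slob_page_alloc() report, through an out-parameter, whether it dropped the page from its free list, so the caller skips the list rotation when the page is gone. A tiny sketch of the same out-parameter pattern using a made-up fixed-size pool; nothing here is the real slob code:

#include <stdio.h>
#include <stdbool.h>

#define POOL_UNITS 8

static int pool_free = POOL_UNITS;

/* returns units handed out; *emptied tells the caller the pool is now exhausted */
static int pool_alloc(int units, bool *emptied)
{
        *emptied = false;
        if (units > pool_free)
                return 0;
        pool_free -= units;
        if (pool_free == 0)
                *emptied = true;
        return units;
}

int main(void)
{
        bool emptied;
        int got = pool_alloc(8, &emptied);

        if (got && !emptied) {
                /* only safe to keep using the pool's list linkage here */
                printf("pool still has %d free units\n", pool_free);
        } else if (got) {
                printf("pool emptied; caller must not touch its free-list links\n");
        }
        return 0;
}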
+36 -36
mm/slub.c
··· 58 58 * D. page->frozen -> frozen state 59 59 * 60 60 * If a slab is frozen then it is exempt from list management. It is not 61 - * on any list. The processor that froze the slab is the one who can 62 - * perform list operations on the page. Other processors may put objects 63 - * onto the freelist but the processor that froze the slab is the only 64 - * one that can retrieve the objects from the page's freelist. 61 + * on any list except per cpu partial list. The processor that froze the 62 + * slab is the one who can perform list operations on the page. Other 63 + * processors may put objects onto the freelist but the processor that 64 + * froze the slab is the only one that can retrieve the objects from the 65 + * page's freelist. 65 66 * 66 67 * The list_lock protects the partial and full list on each node and 67 68 * the partial slab counter. If taken then no new slabs may be added or ··· 1015 1014 return; 1016 1015 1017 1016 lockdep_assert_held(&n->list_lock); 1018 - list_add(&page->lru, &n->full); 1017 + list_add(&page->slab_list, &n->full); 1019 1018 } 1020 1019 1021 1020 static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) ··· 1024 1023 return; 1025 1024 1026 1025 lockdep_assert_held(&n->list_lock); 1027 - list_del(&page->lru); 1026 + list_del(&page->slab_list); 1028 1027 } 1029 1028 1030 1029 /* Tracking of the number of slabs for debugging purposes */ ··· 1765 1764 { 1766 1765 n->nr_partial++; 1767 1766 if (tail == DEACTIVATE_TO_TAIL) 1768 - list_add_tail(&page->lru, &n->partial); 1767 + list_add_tail(&page->slab_list, &n->partial); 1769 1768 else 1770 - list_add(&page->lru, &n->partial); 1769 + list_add(&page->slab_list, &n->partial); 1771 1770 } 1772 1771 1773 1772 static inline void add_partial(struct kmem_cache_node *n, ··· 1781 1780 struct page *page) 1782 1781 { 1783 1782 lockdep_assert_held(&n->list_lock); 1784 - list_del(&page->lru); 1783 + list_del(&page->slab_list); 1785 1784 n->nr_partial--; 1786 1785 } 1787 1786 ··· 1855 1854 return NULL; 1856 1855 1857 1856 spin_lock(&n->list_lock); 1858 - list_for_each_entry_safe(page, page2, &n->partial, lru) { 1857 + list_for_each_entry_safe(page, page2, &n->partial, slab_list) { 1859 1858 void *t; 1860 1859 1861 1860 if (!pfmemalloc_match(page, flags)) ··· 1943 1942 } 1944 1943 } 1945 1944 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 1946 - #endif 1945 + #endif /* CONFIG_NUMA */ 1947 1946 return NULL; 1948 1947 } 1949 1948 ··· 2241 2240 discard_slab(s, page); 2242 2241 stat(s, FREE_SLAB); 2243 2242 } 2244 - #endif 2243 + #endif /* CONFIG_SLUB_CPU_PARTIAL */ 2245 2244 } 2246 2245 2247 2246 /* ··· 2300 2299 local_irq_restore(flags); 2301 2300 } 2302 2301 preempt_enable(); 2303 - #endif 2302 + #endif /* CONFIG_SLUB_CPU_PARTIAL */ 2304 2303 } 2305 2304 2306 2305 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) ··· 2399 2398 struct page *page; 2400 2399 2401 2400 spin_lock_irqsave(&n->list_lock, flags); 2402 - list_for_each_entry(page, &n->partial, lru) 2401 + list_for_each_entry(page, &n->partial, slab_list) 2403 2402 x += get_count(page); 2404 2403 spin_unlock_irqrestore(&n->list_lock, flags); 2405 2404 return x; ··· 2805 2804 } 2806 2805 EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 2807 2806 #endif 2808 - #endif 2807 + #endif /* CONFIG_NUMA */ 2809 2808 2810 2809 /* 2811 2810 * Slow path handling. This may still be called frequently since objects ··· 2904 2903 * then add it. 
2905 2904 */ 2906 2905 if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { 2907 - if (kmem_cache_debug(s)) 2908 - remove_full(s, n, page); 2906 + remove_full(s, n, page); 2909 2907 add_partial(n, page, DEACTIVATE_TO_TAIL); 2910 2908 stat(s, FREE_ADD_PARTIAL); 2911 2909 } ··· 3696 3696 3697 3697 BUG_ON(irqs_disabled()); 3698 3698 spin_lock_irq(&n->list_lock); 3699 - list_for_each_entry_safe(page, h, &n->partial, lru) { 3699 + list_for_each_entry_safe(page, h, &n->partial, slab_list) { 3700 3700 if (!page->inuse) { 3701 3701 remove_partial(n, page); 3702 - list_add(&page->lru, &discard); 3702 + list_add(&page->slab_list, &discard); 3703 3703 } else { 3704 3704 list_slab_objects(s, page, 3705 3705 "Objects remaining in %s on __kmem_cache_shutdown()"); ··· 3707 3707 } 3708 3708 spin_unlock_irq(&n->list_lock); 3709 3709 3710 - list_for_each_entry_safe(page, h, &discard, lru) 3710 + list_for_each_entry_safe(page, h, &discard, slab_list) 3711 3711 discard_slab(s, page); 3712 3712 } 3713 3713 ··· 3839 3839 return ret; 3840 3840 } 3841 3841 EXPORT_SYMBOL(__kmalloc_node); 3842 - #endif 3842 + #endif /* CONFIG_NUMA */ 3843 3843 3844 3844 #ifdef CONFIG_HARDENED_USERCOPY 3845 3845 /* ··· 3987 3987 * Note that concurrent frees may occur while we hold the 3988 3988 * list_lock. page->inuse here is the upper limit. 3989 3989 */ 3990 - list_for_each_entry_safe(page, t, &n->partial, lru) { 3990 + list_for_each_entry_safe(page, t, &n->partial, slab_list) { 3991 3991 int free = page->objects - page->inuse; 3992 3992 3993 3993 /* Do not reread page->inuse */ ··· 3997 3997 BUG_ON(free <= 0); 3998 3998 3999 3999 if (free == page->objects) { 4000 - list_move(&page->lru, &discard); 4000 + list_move(&page->slab_list, &discard); 4001 4001 n->nr_partial--; 4002 4002 } else if (free <= SHRINK_PROMOTE_MAX) 4003 - list_move(&page->lru, promote + free - 1); 4003 + list_move(&page->slab_list, promote + free - 1); 4004 4004 } 4005 4005 4006 4006 /* ··· 4013 4013 spin_unlock_irqrestore(&n->list_lock, flags); 4014 4014 4015 4015 /* Release empty slabs */ 4016 - list_for_each_entry_safe(page, t, &discard, lru) 4016 + list_for_each_entry_safe(page, t, &discard, slab_list) 4017 4017 discard_slab(s, page); 4018 4018 4019 4019 if (slabs_node(s, node)) ··· 4057 4057 */ 4058 4058 slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu); 4059 4059 } 4060 - #endif 4060 + #endif /* CONFIG_MEMCG */ 4061 4061 4062 4062 static int slab_mem_going_offline_callback(void *arg) 4063 4063 { ··· 4205 4205 for_each_kmem_cache_node(s, node, n) { 4206 4206 struct page *p; 4207 4207 4208 - list_for_each_entry(p, &n->partial, lru) 4208 + list_for_each_entry(p, &n->partial, slab_list) 4209 4209 p->slab_cache = s; 4210 4210 4211 4211 #ifdef CONFIG_SLUB_DEBUG 4212 - list_for_each_entry(p, &n->full, lru) 4212 + list_for_each_entry(p, &n->full, slab_list) 4213 4213 p->slab_cache = s; 4214 4214 #endif 4215 4215 } ··· 4426 4426 4427 4427 spin_lock_irqsave(&n->list_lock, flags); 4428 4428 4429 - list_for_each_entry(page, &n->partial, lru) { 4429 + list_for_each_entry(page, &n->partial, slab_list) { 4430 4430 validate_slab_slab(s, page, map); 4431 4431 count++; 4432 4432 } ··· 4437 4437 if (!(s->flags & SLAB_STORE_USER)) 4438 4438 goto out; 4439 4439 4440 - list_for_each_entry(page, &n->full, lru) { 4440 + list_for_each_entry(page, &n->full, slab_list) { 4441 4441 validate_slab_slab(s, page, map); 4442 4442 count++; 4443 4443 } ··· 4633 4633 continue; 4634 4634 4635 4635 spin_lock_irqsave(&n->list_lock, flags); 4636 - 
list_for_each_entry(page, &n->partial, lru) 4636 + list_for_each_entry(page, &n->partial, slab_list) 4637 4637 process_slab(&t, s, page, alloc, map); 4638 - list_for_each_entry(page, &n->full, lru) 4638 + list_for_each_entry(page, &n->full, slab_list) 4639 4639 process_slab(&t, s, page, alloc, map); 4640 4640 spin_unlock_irqrestore(&n->list_lock, flags); 4641 4641 } ··· 4690 4690 len += sprintf(buf, "No data\n"); 4691 4691 return len; 4692 4692 } 4693 - #endif 4693 + #endif /* CONFIG_SLUB_DEBUG */ 4694 4694 4695 4695 #ifdef SLUB_RESILIENCY_TEST 4696 4696 static void __init resiliency_test(void) ··· 4750 4750 #ifdef CONFIG_SYSFS 4751 4751 static void resiliency_test(void) {}; 4752 4752 #endif 4753 - #endif 4753 + #endif /* SLUB_RESILIENCY_TEST */ 4754 4754 4755 4755 #ifdef CONFIG_SYSFS 4756 4756 enum slab_stat_type { ··· 5407 5407 STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); 5408 5408 STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); 5409 5409 STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); 5410 - #endif 5410 + #endif /* CONFIG_SLUB_STATS */ 5411 5411 5412 5412 static struct attribute *slab_attrs[] = { 5413 5413 &slab_size_attr.attr, ··· 5608 5608 5609 5609 if (buffer) 5610 5610 free_page((unsigned long)buffer); 5611 - #endif 5611 + #endif /* CONFIG_MEMCG */ 5612 5612 } 5613 5613 5614 5614 static void kmem_cache_release(struct kobject *k)
+12 -4
mm/sparse.c
··· 684 684 #endif /* CONFIG_MEMORY_HOTREMOVE */ 685 685 #endif /* CONFIG_SPARSEMEM_VMEMMAP */ 686 686 687 - /* 688 - * returns the number of sections whose mem_maps were properly 689 - * set. If this is <=0, then that means that the passed-in 690 - * map was not consumed and must be freed. 687 + /** 688 + * sparse_add_one_section - add a memory section 689 + * @nid: The node to add section on 690 + * @start_pfn: start pfn of the memory range 691 + * @altmap: device page map 692 + * 693 + * This is only intended for hotplug. 694 + * 695 + * Return: 696 + * * 0 - On success. 697 + * * -EEXIST - Section has been present. 698 + * * -ENOMEM - Out of memory. 691 699 */ 692 700 int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, 693 701 struct vmem_altmap *altmap)
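The sparse.c hunk above converts a free-form comment into kernel-doc, with @-tagged parameters and an itemized Return: section. For reference, the same layout applied to a hypothetical function; the function, its parameters, and its error codes are invented for illustration:

/**
 * grow_widget_table - enlarge the widget lookup table
 * @nid: node to allocate the new entries on
 * @count: number of entries to add
 *
 * This is only intended for the resize path.
 *
 * Return:
 * * 0       - On success.
 * * -EEXIST - The requested range is already present.
 * * -ENOMEM - Out of memory.
 */
static int grow_widget_table(int nid, unsigned long count)
{
        (void)nid;
        (void)count;
        return 0;       /* stub body; only the comment format matters here */
}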
+1 -1
mm/swap.c
··· 867 867 SetPageLRU(page); 868 868 /* 869 869 * Page becomes evictable in two ways: 870 - * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()]. 870 + * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. 871 871 * 2) Before acquiring LRU lock to put the page to correct LRU and then 872 872 * a) do PageLRU check with lock [check_move_unevictable_pages] 873 873 * b) do PageLRU check before lock [clear_page_mlock]
+2 -2
mm/swap_state.c
··· 132 132 for (i = 0; i < nr; i++) { 133 133 VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); 134 134 set_page_private(page + i, entry.val + i); 135 - xas_store(&xas, page + i); 135 + xas_store(&xas, page); 136 136 xas_next(&xas); 137 137 } 138 138 address_space->nrpages += nr; ··· 167 167 168 168 for (i = 0; i < nr; i++) { 169 169 void *entry = xas_store(&xas, NULL); 170 - VM_BUG_ON_PAGE(entry != page + i, entry); 170 + VM_BUG_ON_PAGE(entry != page, entry); 171 171 set_page_private(page + i, 0); 172 172 xas_next(&xas); 173 173 }
+1 -2
mm/userfaultfd.c
··· 271 271 */ 272 272 idx = linear_page_index(dst_vma, dst_addr); 273 273 mapping = dst_vma->vm_file->f_mapping; 274 - hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, 275 - idx, dst_addr); 274 + hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr); 276 275 mutex_lock(&hugetlb_fault_mutex_table[hash]); 277 276 278 277 err = -ENOMEM;
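The userfaultfd hunk above reflects hugetlb_fault_mutex_hash() now being keyed only on the mapping and the page index, with the hstate/mm/vma arguments dropped. A standalone sketch of the underlying idea, picking a mutex from a table by hashing a pointer and an index; the table size and the mixing constant are arbitrary stand-ins, not the kernel's hash:

#include <stdio.h>
#include <stdint.h>
#include <pthread.h>

#define FAULT_MUTEX_TABLE_SIZE 64       /* made-up table size */

static pthread_mutex_t fault_mutex_table[FAULT_MUTEX_TABLE_SIZE];

/* stand-in hash: mix the mapping pointer with the page index, then reduce */
static unsigned int fault_mutex_hash(const void *mapping, unsigned long idx)
{
        uint64_t key = (uint64_t)(uintptr_t)mapping ^ (idx * 0x9E3779B97F4A7C15ULL);

        return (unsigned int)(key % FAULT_MUTEX_TABLE_SIZE);
}

int main(void)
{
        int dummy_mapping;
        unsigned int i, hash;

        for (i = 0; i < FAULT_MUTEX_TABLE_SIZE; i++)
                pthread_mutex_init(&fault_mutex_table[i], NULL);

        hash = fault_mutex_hash(&dummy_mapping, 42);
        pthread_mutex_lock(&fault_mutex_table[hash]);
        printf("faults on this (mapping, idx) serialize on slot %u\n", hash);
        pthread_mutex_unlock(&fault_mutex_table[hash]);
        return 0;
}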
+9 -50
mm/util.c
··· 318 318 * get_user_pages_fast() - pin user pages in memory 319 319 * @start: starting user address 320 320 * @nr_pages: number of pages from start to pin 321 - * @write: whether pages will be written to 321 + * @gup_flags: flags modifying pin behaviour 322 322 * @pages: array that receives pointers to the pages pinned. 323 323 * Should be at least nr_pages long. 324 324 * ··· 339 339 * were pinned, returns -errno. 340 340 */ 341 341 int __weak get_user_pages_fast(unsigned long start, 342 - int nr_pages, int write, struct page **pages) 342 + int nr_pages, unsigned int gup_flags, 343 + struct page **pages) 343 344 { 344 - return get_user_pages_unlocked(start, nr_pages, pages, 345 - write ? FOLL_WRITE : 0); 345 + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); 346 346 } 347 347 EXPORT_SYMBOL_GPL(get_user_pages_fast); 348 348 ··· 652 652 */ 653 653 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 654 654 { 655 - long free, allowed, reserve; 655 + long allowed; 656 656 657 657 VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < 658 658 -(s64)vm_committed_as_batch * num_online_cpus(), ··· 667 667 return 0; 668 668 669 669 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 670 - free = global_zone_page_state(NR_FREE_PAGES); 671 - free += global_node_page_state(NR_FILE_PAGES); 672 - 673 - /* 674 - * shmem pages shouldn't be counted as free in this 675 - * case, they can't be purged, only swapped out, and 676 - * that won't affect the overall amount of available 677 - * memory in the system. 678 - */ 679 - free -= global_node_page_state(NR_SHMEM); 680 - 681 - free += get_nr_swap_pages(); 682 - 683 - /* 684 - * Any slabs which are created with the 685 - * SLAB_RECLAIM_ACCOUNT flag claim to have contents 686 - * which are reclaimable, under pressure. The dentry 687 - * cache and most inode caches should fall into this 688 - */ 689 - free += global_node_page_state(NR_SLAB_RECLAIMABLE); 690 - 691 - /* 692 - * Part of the kernel memory, which can be released 693 - * under memory pressure. 694 - */ 695 - free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); 696 - 697 - /* 698 - * Leave reserved pages. The pages are not for anonymous pages. 699 - */ 700 - if (free <= totalreserve_pages) 670 + if (pages > totalram_pages() + total_swap_pages) 701 671 goto error; 702 - else 703 - free -= totalreserve_pages; 704 - 705 - /* 706 - * Reserve some for root 707 - */ 708 - if (!cap_sys_admin) 709 - free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 710 - 711 - if (free > pages) 712 - return 0; 713 - 714 - goto error; 672 + return 0; 715 673 } 716 674 717 675 allowed = vm_commit_limit(); ··· 683 725 * Don't let a single process grow so big a user can't recover 684 726 */ 685 727 if (mm) { 686 - reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); 728 + long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); 729 + 687 730 allowed -= min_t(long, mm->total_vm / 32, reserve); 688 731 } 689 732
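The __vm_enough_memory() rewrite above reduces the OVERCOMMIT_GUESS heuristic to a single comparison against total RAM plus total swap, dropping the old free-page estimation. A standalone restatement of that check; the totals below are hard-coded placeholders, not kernel state:

#include <stdio.h>
#include <stdbool.h>

/* placeholder totals, in pages */
static const long totalram = 4L * 1024 * 1024;          /* 16 GiB of 4 KiB pages */
static const long totalswap = 1L * 1024 * 1024;

/* the simplified "guess" policy: refuse only truly impossible requests */
static bool guess_enough_memory(long pages)
{
        return pages <= totalram + totalswap;
}

int main(void)
{
        printf("1M pages:  %s\n", guess_enough_memory(1L << 20) ? "ok" : "denied");
        printf("16M pages: %s\n", guess_enough_memory(16L << 20) ? "ok" : "denied");
        return 0;
}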
+75 -128
mm/vmscan.c
··· 346 346 int zid; 347 347 348 348 if (!mem_cgroup_disabled()) 349 - lru_size = mem_cgroup_get_lru_size(lruvec, lru); 349 + lru_size = lruvec_page_state(lruvec, NR_LRU_BASE + lru); 350 350 else 351 351 lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); 352 352 ··· 1107 1107 LIST_HEAD(ret_pages); 1108 1108 LIST_HEAD(free_pages); 1109 1109 unsigned nr_reclaimed = 0; 1110 + unsigned pgactivate = 0; 1110 1111 1111 1112 memset(stat, 0, sizeof(*stat)); 1112 1113 cond_resched(); ··· 1467 1466 try_to_free_swap(page); 1468 1467 VM_BUG_ON_PAGE(PageActive(page), page); 1469 1468 if (!PageMlocked(page)) { 1469 + int type = page_is_file_cache(page); 1470 1470 SetPageActive(page); 1471 - stat->nr_activate++; 1471 + pgactivate++; 1472 + stat->nr_activate[type] += hpage_nr_pages(page); 1472 1473 count_memcg_page_event(page, PGACTIVATE); 1473 1474 } 1474 1475 keep_locked: ··· 1485 1482 free_unref_page_list(&free_pages); 1486 1483 1487 1484 list_splice(&ret_pages, page_list); 1488 - count_vm_events(PGACTIVATE, stat->nr_activate); 1485 + count_vm_events(PGACTIVATE, pgactivate); 1489 1486 1490 1487 return nr_reclaimed; 1491 1488 } ··· 1807 1804 return isolated > inactive; 1808 1805 } 1809 1806 1810 - static noinline_for_stack void 1811 - putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) 1807 + /* 1808 + * This moves pages from @list to corresponding LRU list. 1809 + * 1810 + * We move them the other way if the page is referenced by one or more 1811 + * processes, from rmap. 1812 + * 1813 + * If the pages are mostly unmapped, the processing is fast and it is 1814 + * appropriate to hold zone_lru_lock across the whole operation. But if 1815 + * the pages are mapped, the processing is slow (page_referenced()) so we 1816 + * should drop zone_lru_lock around each page. It's impossible to balance 1817 + * this, so instead we remove the pages from the LRU while processing them. 1818 + * It is safe to rely on PG_active against the non-LRU pages in here because 1819 + * nobody will play with that bit on a non-LRU page. 1820 + * 1821 + * The downside is that we have to touch page->_refcount against each page. 1822 + * But we had to alter page->flags anyway. 1823 + * 1824 + * Returns the number of pages moved to the given lruvec. 1825 + */ 1826 + 1827 + static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, 1828 + struct list_head *list) 1812 1829 { 1813 - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1814 1830 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 1831 + int nr_pages, nr_moved = 0; 1815 1832 LIST_HEAD(pages_to_free); 1833 + struct page *page; 1834 + enum lru_list lru; 1816 1835 1817 - /* 1818 - * Put back any unfreeable pages. 
1819 - */ 1820 - while (!list_empty(page_list)) { 1821 - struct page *page = lru_to_page(page_list); 1822 - int lru; 1823 - 1836 + while (!list_empty(list)) { 1837 + page = lru_to_page(list); 1824 1838 VM_BUG_ON_PAGE(PageLRU(page), page); 1825 - list_del(&page->lru); 1826 1839 if (unlikely(!page_evictable(page))) { 1840 + list_del(&page->lru); 1827 1841 spin_unlock_irq(&pgdat->lru_lock); 1828 1842 putback_lru_page(page); 1829 1843 spin_lock_irq(&pgdat->lru_lock); 1830 1844 continue; 1831 1845 } 1832 - 1833 1846 lruvec = mem_cgroup_page_lruvec(page, pgdat); 1834 1847 1835 1848 SetPageLRU(page); 1836 1849 lru = page_lru(page); 1837 - add_page_to_lru_list(page, lruvec, lru); 1838 1850 1839 - if (is_active_lru(lru)) { 1840 - int file = is_file_lru(lru); 1841 - int numpages = hpage_nr_pages(page); 1842 - reclaim_stat->recent_rotated[file] += numpages; 1843 - } 1851 + nr_pages = hpage_nr_pages(page); 1852 + update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); 1853 + list_move(&page->lru, &lruvec->lists[lru]); 1854 + 1844 1855 if (put_page_testzero(page)) { 1845 1856 __ClearPageLRU(page); 1846 1857 __ClearPageActive(page); ··· 1867 1850 spin_lock_irq(&pgdat->lru_lock); 1868 1851 } else 1869 1852 list_add(&page->lru, &pages_to_free); 1853 + } else { 1854 + nr_moved += nr_pages; 1870 1855 } 1871 1856 } 1872 1857 1873 1858 /* 1874 1859 * To save our caller's stack, now use input list for pages to free. 1875 1860 */ 1876 - list_splice(&pages_to_free, page_list); 1861 + list_splice(&pages_to_free, list); 1862 + 1863 + return nr_moved; 1877 1864 } 1878 1865 1879 1866 /* ··· 1907 1886 unsigned long nr_taken; 1908 1887 struct reclaim_stat stat; 1909 1888 int file = is_file_lru(lru); 1889 + enum vm_event_item item; 1910 1890 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 1911 1891 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1912 1892 bool stalled = false; ··· 1935 1913 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 1936 1914 reclaim_stat->recent_scanned[file] += nr_taken; 1937 1915 1938 - if (current_is_kswapd()) { 1939 - if (global_reclaim(sc)) 1940 - __count_vm_events(PGSCAN_KSWAPD, nr_scanned); 1941 - count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD, 1942 - nr_scanned); 1943 - } else { 1944 - if (global_reclaim(sc)) 1945 - __count_vm_events(PGSCAN_DIRECT, nr_scanned); 1946 - count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT, 1947 - nr_scanned); 1948 - } 1916 + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; 1917 + if (global_reclaim(sc)) 1918 + __count_vm_events(item, nr_scanned); 1919 + __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); 1949 1920 spin_unlock_irq(&pgdat->lru_lock); 1950 1921 1951 1922 if (nr_taken == 0) ··· 1949 1934 1950 1935 spin_lock_irq(&pgdat->lru_lock); 1951 1936 1952 - if (current_is_kswapd()) { 1953 - if (global_reclaim(sc)) 1954 - __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); 1955 - count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD, 1956 - nr_reclaimed); 1957 - } else { 1958 - if (global_reclaim(sc)) 1959 - __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); 1960 - count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT, 1961 - nr_reclaimed); 1962 - } 1937 + item = current_is_kswapd() ? 
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; 1938 + if (global_reclaim(sc)) 1939 + __count_vm_events(item, nr_reclaimed); 1940 + __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); 1941 + reclaim_stat->recent_rotated[0] = stat.nr_activate[0]; 1942 + reclaim_stat->recent_rotated[1] = stat.nr_activate[1]; 1963 1943 1964 - putback_inactive_pages(lruvec, &page_list); 1944 + move_pages_to_lru(lruvec, &page_list); 1965 1945 1966 1946 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); 1967 1947 ··· 1993 1983 return nr_reclaimed; 1994 1984 } 1995 1985 1996 - /* 1997 - * This moves pages from the active list to the inactive list. 1998 - * 1999 - * We move them the other way if the page is referenced by one or more 2000 - * processes, from rmap. 2001 - * 2002 - * If the pages are mostly unmapped, the processing is fast and it is 2003 - * appropriate to hold pgdat->lru_lock across the whole operation. But if 2004 - * the pages are mapped, the processing is slow (page_referenced()) so we 2005 - * should drop pgdat->lru_lock around each page. It's impossible to balance 2006 - * this, so instead we remove the pages from the LRU while processing them. 2007 - * It is safe to rely on PG_active against the non-LRU pages in here because 2008 - * nobody will play with that bit on a non-LRU page. 2009 - * 2010 - * The downside is that we have to touch page->_refcount against each page. 2011 - * But we had to alter page->flags anyway. 2012 - * 2013 - * Returns the number of pages moved to the given lru. 2014 - */ 2015 - 2016 - static unsigned move_active_pages_to_lru(struct lruvec *lruvec, 2017 - struct list_head *list, 2018 - struct list_head *pages_to_free, 2019 - enum lru_list lru) 2020 - { 2021 - struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2022 - struct page *page; 2023 - int nr_pages; 2024 - int nr_moved = 0; 2025 - 2026 - while (!list_empty(list)) { 2027 - page = lru_to_page(list); 2028 - lruvec = mem_cgroup_page_lruvec(page, pgdat); 2029 - 2030 - VM_BUG_ON_PAGE(PageLRU(page), page); 2031 - SetPageLRU(page); 2032 - 2033 - nr_pages = hpage_nr_pages(page); 2034 - update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); 2035 - list_move(&page->lru, &lruvec->lists[lru]); 2036 - 2037 - if (put_page_testzero(page)) { 2038 - __ClearPageLRU(page); 2039 - __ClearPageActive(page); 2040 - del_page_from_lru_list(page, lruvec, lru); 2041 - 2042 - if (unlikely(PageCompound(page))) { 2043 - spin_unlock_irq(&pgdat->lru_lock); 2044 - mem_cgroup_uncharge(page); 2045 - (*get_compound_page_dtor(page))(page); 2046 - spin_lock_irq(&pgdat->lru_lock); 2047 - } else 2048 - list_add(&page->lru, pages_to_free); 2049 - } else { 2050 - nr_moved += nr_pages; 2051 - } 2052 - } 2053 - 2054 - if (!is_active_lru(lru)) { 2055 - __count_vm_events(PGDEACTIVATE, nr_moved); 2056 - count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, 2057 - nr_moved); 2058 - } 2059 - 2060 - return nr_moved; 2061 - } 2062 - 2063 1986 static void shrink_active_list(unsigned long nr_to_scan, 2064 1987 struct lruvec *lruvec, 2065 1988 struct scan_control *sc, ··· 2022 2079 reclaim_stat->recent_scanned[file] += nr_taken; 2023 2080 2024 2081 __count_vm_events(PGREFILL, nr_scanned); 2025 - count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); 2082 + __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); 2026 2083 2027 2084 spin_unlock_irq(&pgdat->lru_lock); 2028 2085 ··· 2079 2136 */ 2080 2137 reclaim_stat->recent_rotated[file] += nr_rotated; 2081 2138 2082 - nr_activate = move_active_pages_to_lru(lruvec, &l_active, 
&l_hold, lru); 2083 - nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); 2139 + nr_activate = move_pages_to_lru(lruvec, &l_active); 2140 + nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); 2141 + /* Keep all free pages in l_active list */ 2142 + list_splice(&l_inactive, &l_active); 2143 + 2144 + __count_vm_events(PGDEACTIVATE, nr_deactivate); 2145 + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); 2146 + 2084 2147 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); 2085 2148 spin_unlock_irq(&pgdat->lru_lock); 2086 2149 2087 - mem_cgroup_uncharge_list(&l_hold); 2088 - free_unref_page_list(&l_hold); 2150 + mem_cgroup_uncharge_list(&l_active); 2151 + free_unref_page_list(&l_active); 2089 2152 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, 2090 2153 nr_deactivate, nr_rotated, sc->priority, file); 2091 2154 } ··· 3161 3212 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) 3162 3213 return 1; 3163 3214 3164 - trace_mm_vmscan_direct_reclaim_begin(order, 3165 - sc.may_writepage, 3166 - sc.gfp_mask, 3167 - sc.reclaim_idx); 3215 + trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); 3168 3216 3169 3217 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 3170 3218 ··· 3192 3246 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 3193 3247 3194 3248 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, 3195 - sc.may_writepage, 3196 - sc.gfp_mask, 3197 - sc.reclaim_idx); 3249 + sc.gfp_mask); 3198 3250 3199 3251 /* 3200 3252 * NOTE: Although we can get the priority field, using it ··· 3241 3297 3242 3298 zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; 3243 3299 3244 - trace_mm_vmscan_memcg_reclaim_begin(0, 3245 - sc.may_writepage, 3246 - sc.gfp_mask, 3247 - sc.reclaim_idx); 3300 + trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); 3248 3301 3249 3302 psi_memstall_enter(&pflags); 3250 3303 noreclaim_flag = memalloc_noreclaim_save(); ··· 4090 4149 .reclaim_idx = gfp_zone(gfp_mask), 4091 4150 }; 4092 4151 4152 + trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, 4153 + sc.gfp_mask); 4154 + 4093 4155 cond_resched(); 4094 4156 fs_reclaim_acquire(sc.gfp_mask); 4095 4157 /* ··· 4119 4175 current->flags &= ~PF_SWAPWRITE; 4120 4176 memalloc_noreclaim_restore(noreclaim_flag); 4121 4177 fs_reclaim_release(sc.gfp_mask); 4178 + 4179 + trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); 4180 + 4122 4181 return sc.nr_reclaimed >= nr_pages; 4123 4182 } 4124 4183
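One pattern repeated in the vmscan hunks above is collapsing duplicated kswapd/direct if/else accounting into a single event chosen up front (item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT). A compact userspace rendering of that refactor, with invented counter names:

#include <stdio.h>
#include <stdbool.h>

enum vm_event { SCAN_KSWAPD, SCAN_DIRECT, NR_EVENTS };

static unsigned long events[NR_EVENTS];

/* pick the counter once, then do the bookkeeping a single time */
static void count_scan(bool from_kswapd, unsigned long nr_scanned)
{
        enum vm_event item = from_kswapd ? SCAN_KSWAPD : SCAN_DIRECT;

        events[item] += nr_scanned;
}

int main(void)
{
        count_scan(true, 32);
        count_scan(false, 8);
        printf("kswapd=%lu direct=%lu\n", events[SCAN_KSWAPD], events[SCAN_DIRECT]);
        return 0;
}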
+3 -2
mm/workingset.c
··· 426 426 #ifdef CONFIG_MEMCG 427 427 if (sc->memcg) { 428 428 struct lruvec *lruvec; 429 + int i; 429 430 430 - pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, 431 - LRU_ALL); 432 431 lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); 432 + for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) 433 + pages += lruvec_page_state(lruvec, NR_LRU_BASE + i); 433 434 pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); 434 435 pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); 435 436 } else
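The workingset.c change above derives the memcg's LRU page count by summing lruvec_page_state() over each LRU list rather than calling a dedicated helper. The same accumulate-over-enum idiom in plain C, with invented list names and counts:

#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
                LRU_INACTIVE_FILE, LRU_ACTIVE_FILE,
                LRU_UNEVICTABLE, NR_LRU_LISTS };

/* per-list page counts, standing in for lruvec_page_state() lookups */
static const unsigned long lru_pages[NR_LRU_LISTS] = { 100, 50, 400, 250, 5 };

int main(void)
{
        unsigned long pages = 0;
        int i;

        for (i = 0; i < NR_LRU_LISTS; i++)
                pages += lru_pages[i];

        printf("pages on all LRU lists: %lu\n", pages);
        return 0;
}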
+528 -150
mm/z3fold.c
··· 24 24 25 25 #include <linux/atomic.h> 26 26 #include <linux/sched.h> 27 + #include <linux/cpumask.h> 28 + #include <linux/dcache.h> 27 29 #include <linux/list.h> 28 30 #include <linux/mm.h> 29 31 #include <linux/module.h> 32 + #include <linux/page-flags.h> 33 + #include <linux/migrate.h> 34 + #include <linux/node.h> 35 + #include <linux/compaction.h> 30 36 #include <linux/percpu.h> 37 + #include <linux/mount.h> 38 + #include <linux/fs.h> 31 39 #include <linux/preempt.h> 32 40 #include <linux/workqueue.h> 33 41 #include <linux/slab.h> 34 42 #include <linux/spinlock.h> 35 43 #include <linux/zpool.h> 36 - 37 - /***************** 38 - * Structures 39 - *****************/ 40 - struct z3fold_pool; 41 - struct z3fold_ops { 42 - int (*evict)(struct z3fold_pool *pool, unsigned long handle); 43 - }; 44 - 45 - enum buddy { 46 - HEADLESS = 0, 47 - FIRST, 48 - MIDDLE, 49 - LAST, 50 - BUDDIES_MAX 51 - }; 52 - 53 - /* 54 - * struct z3fold_header - z3fold page metadata occupying first chunks of each 55 - * z3fold page, except for HEADLESS pages 56 - * @buddy: links the z3fold page into the relevant list in the 57 - * pool 58 - * @page_lock: per-page lock 59 - * @refcount: reference count for the z3fold page 60 - * @work: work_struct for page layout optimization 61 - * @pool: pointer to the pool which this page belongs to 62 - * @cpu: CPU which this page "belongs" to 63 - * @first_chunks: the size of the first buddy in chunks, 0 if free 64 - * @middle_chunks: the size of the middle buddy in chunks, 0 if free 65 - * @last_chunks: the size of the last buddy in chunks, 0 if free 66 - * @first_num: the starting number (for the first handle) 67 - */ 68 - struct z3fold_header { 69 - struct list_head buddy; 70 - spinlock_t page_lock; 71 - struct kref refcount; 72 - struct work_struct work; 73 - struct z3fold_pool *pool; 74 - short cpu; 75 - unsigned short first_chunks; 76 - unsigned short middle_chunks; 77 - unsigned short last_chunks; 78 - unsigned short start_middle; 79 - unsigned short first_num:2; 80 - }; 81 44 82 45 /* 83 46 * NCHUNKS_ORDER determines the internal allocation granularity, effectively ··· 63 100 64 101 #define BUDDY_MASK (0x3) 65 102 #define BUDDY_SHIFT 2 103 + #define SLOTS_ALIGN (0x40) 104 + 105 + /***************** 106 + * Structures 107 + *****************/ 108 + struct z3fold_pool; 109 + struct z3fold_ops { 110 + int (*evict)(struct z3fold_pool *pool, unsigned long handle); 111 + }; 112 + 113 + enum buddy { 114 + HEADLESS = 0, 115 + FIRST, 116 + MIDDLE, 117 + LAST, 118 + BUDDIES_MAX = LAST 119 + }; 120 + 121 + struct z3fold_buddy_slots { 122 + /* 123 + * we are using BUDDY_MASK in handle_to_buddy etc. 
so there should 124 + * be enough slots to hold all possible variants 125 + */ 126 + unsigned long slot[BUDDY_MASK + 1]; 127 + unsigned long pool; /* back link + flags */ 128 + }; 129 + #define HANDLE_FLAG_MASK (0x03) 130 + 131 + /* 132 + * struct z3fold_header - z3fold page metadata occupying first chunks of each 133 + * z3fold page, except for HEADLESS pages 134 + * @buddy: links the z3fold page into the relevant list in the 135 + * pool 136 + * @page_lock: per-page lock 137 + * @refcount: reference count for the z3fold page 138 + * @work: work_struct for page layout optimization 139 + * @slots: pointer to the structure holding buddy slots 140 + * @cpu: CPU which this page "belongs" to 141 + * @first_chunks: the size of the first buddy in chunks, 0 if free 142 + * @middle_chunks: the size of the middle buddy in chunks, 0 if free 143 + * @last_chunks: the size of the last buddy in chunks, 0 if free 144 + * @first_num: the starting number (for the first handle) 145 + * @mapped_count: the number of objects currently mapped 146 + */ 147 + struct z3fold_header { 148 + struct list_head buddy; 149 + spinlock_t page_lock; 150 + struct kref refcount; 151 + struct work_struct work; 152 + struct z3fold_buddy_slots *slots; 153 + short cpu; 154 + unsigned short first_chunks; 155 + unsigned short middle_chunks; 156 + unsigned short last_chunks; 157 + unsigned short start_middle; 158 + unsigned short first_num:2; 159 + unsigned short mapped_count:2; 160 + }; 66 161 67 162 /** 68 163 * struct z3fold_pool - stores metadata for each z3fold pool ··· 134 113 * added buddy. 135 114 * @stale: list of pages marked for freeing 136 115 * @pages_nr: number of z3fold pages in the pool. 116 + * @c_handle: cache for z3fold_buddy_slots allocation 137 117 * @ops: pointer to a structure of user defined operations specified at 138 118 * pool creation time. 139 119 * @compact_wq: workqueue for page layout background optimization 140 120 * @release_wq: workqueue for safe page release 141 121 * @work: work_struct for safe page release 122 + * @inode: inode for z3fold pseudo filesystem 142 123 * 143 124 * This structure is allocated at pool creation time and maintains metadata 144 125 * pertaining to a particular z3fold pool. 
··· 153 130 struct list_head lru; 154 131 struct list_head stale; 155 132 atomic64_t pages_nr; 133 + struct kmem_cache *c_handle; 156 134 const struct z3fold_ops *ops; 157 135 struct zpool *zpool; 158 136 const struct zpool_ops *zpool_ops; 159 137 struct workqueue_struct *compact_wq; 160 138 struct workqueue_struct *release_wq; 161 139 struct work_struct work; 140 + struct inode *inode; 162 141 }; 163 142 164 143 /* ··· 189 164 190 165 static void compact_page_work(struct work_struct *w); 191 166 167 + static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool) 168 + { 169 + struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle, 170 + GFP_KERNEL); 171 + 172 + if (slots) { 173 + memset(slots->slot, 0, sizeof(slots->slot)); 174 + slots->pool = (unsigned long)pool; 175 + } 176 + 177 + return slots; 178 + } 179 + 180 + static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s) 181 + { 182 + return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK); 183 + } 184 + 185 + static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle) 186 + { 187 + return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1)); 188 + } 189 + 190 + static inline void free_handle(unsigned long handle) 191 + { 192 + struct z3fold_buddy_slots *slots; 193 + int i; 194 + bool is_free; 195 + 196 + if (handle & (1 << PAGE_HEADLESS)) 197 + return; 198 + 199 + WARN_ON(*(unsigned long *)handle == 0); 200 + *(unsigned long *)handle = 0; 201 + slots = handle_to_slots(handle); 202 + is_free = true; 203 + for (i = 0; i <= BUDDY_MASK; i++) { 204 + if (slots->slot[i]) { 205 + is_free = false; 206 + break; 207 + } 208 + } 209 + 210 + if (is_free) { 211 + struct z3fold_pool *pool = slots_to_pool(slots); 212 + 213 + kmem_cache_free(pool->c_handle, slots); 214 + } 215 + } 216 + 217 + static struct dentry *z3fold_do_mount(struct file_system_type *fs_type, 218 + int flags, const char *dev_name, void *data) 219 + { 220 + static const struct dentry_operations ops = { 221 + .d_dname = simple_dname, 222 + }; 223 + 224 + return mount_pseudo(fs_type, "z3fold:", NULL, &ops, 0x33); 225 + } 226 + 227 + static struct file_system_type z3fold_fs = { 228 + .name = "z3fold", 229 + .mount = z3fold_do_mount, 230 + .kill_sb = kill_anon_super, 231 + }; 232 + 233 + static struct vfsmount *z3fold_mnt; 234 + static int z3fold_mount(void) 235 + { 236 + int ret = 0; 237 + 238 + z3fold_mnt = kern_mount(&z3fold_fs); 239 + if (IS_ERR(z3fold_mnt)) 240 + ret = PTR_ERR(z3fold_mnt); 241 + 242 + return ret; 243 + } 244 + 245 + static void z3fold_unmount(void) 246 + { 247 + kern_unmount(z3fold_mnt); 248 + } 249 + 250 + static const struct address_space_operations z3fold_aops; 251 + static int z3fold_register_migration(struct z3fold_pool *pool) 252 + { 253 + pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb); 254 + if (IS_ERR(pool->inode)) { 255 + pool->inode = NULL; 256 + return 1; 257 + } 258 + 259 + pool->inode->i_mapping->private_data = pool; 260 + pool->inode->i_mapping->a_ops = &z3fold_aops; 261 + return 0; 262 + } 263 + 264 + static void z3fold_unregister_migration(struct z3fold_pool *pool) 265 + { 266 + if (pool->inode) 267 + iput(pool->inode); 268 + } 269 + 192 270 /* Initializes the z3fold header of a newly allocated z3fold page */ 193 271 static struct z3fold_header *init_z3fold_page(struct page *page, 194 272 struct z3fold_pool *pool) 195 273 { 196 274 struct z3fold_header *zhdr = page_address(page); 275 + struct z3fold_buddy_slots *slots = alloc_slots(pool); 276 + 277 + if (!slots) 278 + 
return NULL; 197 279 198 280 INIT_LIST_HEAD(&page->lru); 199 281 clear_bit(PAGE_HEADLESS, &page->private); ··· 317 185 zhdr->first_num = 0; 318 186 zhdr->start_middle = 0; 319 187 zhdr->cpu = -1; 320 - zhdr->pool = pool; 188 + zhdr->slots = slots; 321 189 INIT_LIST_HEAD(&zhdr->buddy); 322 190 INIT_WORK(&zhdr->work, compact_page_work); 323 191 return zhdr; 324 192 } 325 193 326 194 /* Resets the struct page fields and frees the page */ 327 - static void free_z3fold_page(struct page *page) 195 + static void free_z3fold_page(struct page *page, bool headless) 328 196 { 197 + if (!headless) { 198 + lock_page(page); 199 + __ClearPageMovable(page); 200 + unlock_page(page); 201 + } 202 + ClearPagePrivate(page); 329 203 __free_page(page); 330 204 } 331 205 ··· 353 215 spin_unlock(&zhdr->page_lock); 354 216 } 355 217 218 + /* Helper function to build the index */ 219 + static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) 220 + { 221 + return (bud + zhdr->first_num) & BUDDY_MASK; 222 + } 223 + 356 224 /* 357 225 * Encodes the handle of a particular buddy within a z3fold page 358 226 * Pool lock should be held as this function accesses first_num 359 227 */ 360 228 static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) 361 229 { 362 - unsigned long handle; 230 + struct z3fold_buddy_slots *slots; 231 + unsigned long h = (unsigned long)zhdr; 232 + int idx = 0; 363 233 364 - handle = (unsigned long)zhdr; 365 - if (bud != HEADLESS) { 366 - handle |= (bud + zhdr->first_num) & BUDDY_MASK; 367 - if (bud == LAST) 368 - handle |= (zhdr->last_chunks << BUDDY_SHIFT); 369 - } 370 - return handle; 234 + /* 235 + * For a headless page, its handle is its pointer with the extra 236 + * PAGE_HEADLESS bit set 237 + */ 238 + if (bud == HEADLESS) 239 + return h | (1 << PAGE_HEADLESS); 240 + 241 + /* otherwise, return pointer to encoded handle */ 242 + idx = __idx(zhdr, bud); 243 + h += idx; 244 + if (bud == LAST) 245 + h |= (zhdr->last_chunks << BUDDY_SHIFT); 246 + 247 + slots = zhdr->slots; 248 + slots->slot[idx] = h; 249 + return (unsigned long)&slots->slot[idx]; 371 250 } 372 251 373 252 /* Returns the z3fold page where a given handle is stored */ 374 - static struct z3fold_header *handle_to_z3fold_header(unsigned long handle) 253 + static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) 375 254 { 376 - return (struct z3fold_header *)(handle & PAGE_MASK); 255 + unsigned long addr = h; 256 + 257 + if (!(addr & (1 << PAGE_HEADLESS))) 258 + addr = *(unsigned long *)h; 259 + 260 + return (struct z3fold_header *)(addr & PAGE_MASK); 377 261 } 378 262 379 263 /* only for LAST bud, returns zero otherwise */ 380 264 static unsigned short handle_to_chunks(unsigned long handle) 381 265 { 382 - return (handle & ~PAGE_MASK) >> BUDDY_SHIFT; 266 + unsigned long addr = *(unsigned long *)handle; 267 + 268 + return (addr & ~PAGE_MASK) >> BUDDY_SHIFT; 383 269 } 384 270 385 271 /* ··· 413 251 */ 414 252 static enum buddy handle_to_buddy(unsigned long handle) 415 253 { 416 - struct z3fold_header *zhdr = handle_to_z3fold_header(handle); 417 - return (handle - zhdr->first_num) & BUDDY_MASK; 254 + struct z3fold_header *zhdr; 255 + unsigned long addr; 256 + 257 + WARN_ON(handle & (1 << PAGE_HEADLESS)); 258 + addr = *(unsigned long *)handle; 259 + zhdr = (struct z3fold_header *)(addr & PAGE_MASK); 260 + return (addr - zhdr->first_num) & BUDDY_MASK; 261 + } 262 + 263 + static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr) 264 + { 265 + return 
slots_to_pool(zhdr->slots); 418 266 } 419 267 420 268 static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) 421 269 { 422 270 struct page *page = virt_to_page(zhdr); 423 - struct z3fold_pool *pool = zhdr->pool; 271 + struct z3fold_pool *pool = zhdr_to_pool(zhdr); 424 272 425 273 WARN_ON(!list_empty(&zhdr->buddy)); 426 274 set_bit(PAGE_STALE, &page->private); 427 275 clear_bit(NEEDS_COMPACTING, &page->private); 428 276 spin_lock(&pool->lock); 429 277 if (!list_empty(&page->lru)) 430 - list_del(&page->lru); 278 + list_del_init(&page->lru); 431 279 spin_unlock(&pool->lock); 432 280 if (locked) 433 281 z3fold_page_unlock(zhdr); ··· 467 295 { 468 296 struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, 469 297 refcount); 470 - spin_lock(&zhdr->pool->lock); 298 + struct z3fold_pool *pool = zhdr_to_pool(zhdr); 299 + spin_lock(&pool->lock); 471 300 list_del_init(&zhdr->buddy); 472 - spin_unlock(&zhdr->pool->lock); 301 + spin_unlock(&pool->lock); 473 302 474 303 WARN_ON(z3fold_page_trylock(zhdr)); 475 304 __release_z3fold_page(zhdr, true); ··· 491 318 continue; 492 319 spin_unlock(&pool->stale_lock); 493 320 cancel_work_sync(&zhdr->work); 494 - free_z3fold_page(page); 321 + free_z3fold_page(page, false); 495 322 cond_resched(); 496 323 spin_lock(&pool->stale_lock); 497 324 } ··· 522 349 return nfree; 523 350 } 524 351 352 + /* Add to the appropriate unbuddied list */ 353 + static inline void add_to_unbuddied(struct z3fold_pool *pool, 354 + struct z3fold_header *zhdr) 355 + { 356 + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || 357 + zhdr->middle_chunks == 0) { 358 + struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); 359 + 360 + int freechunks = num_free_chunks(zhdr); 361 + spin_lock(&pool->lock); 362 + list_add(&zhdr->buddy, &unbuddied[freechunks]); 363 + spin_unlock(&pool->lock); 364 + zhdr->cpu = smp_processor_id(); 365 + put_cpu_ptr(pool->unbuddied); 366 + } 367 + } 368 + 525 369 static inline void *mchunk_memmove(struct z3fold_header *zhdr, 526 370 unsigned short dst_chunk) 527 371 { ··· 556 366 557 367 if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private)) 558 368 return 0; /* can't move middle chunk, it's used */ 369 + 370 + if (unlikely(PageIsolated(page))) 371 + return 0; 559 372 560 373 if (zhdr->middle_chunks == 0) 561 374 return 0; /* nothing to compact */ ··· 599 406 600 407 static void do_compact_page(struct z3fold_header *zhdr, bool locked) 601 408 { 602 - struct z3fold_pool *pool = zhdr->pool; 409 + struct z3fold_pool *pool = zhdr_to_pool(zhdr); 603 410 struct page *page; 604 - struct list_head *unbuddied; 605 - int fchunks; 606 411 607 412 page = virt_to_page(zhdr); 608 413 if (locked) ··· 620 429 return; 621 430 } 622 431 623 - z3fold_compact_page(zhdr); 624 - unbuddied = get_cpu_ptr(pool->unbuddied); 625 - fchunks = num_free_chunks(zhdr); 626 - if (fchunks < NCHUNKS && 627 - (!zhdr->first_chunks || !zhdr->middle_chunks || 628 - !zhdr->last_chunks)) { 629 - /* the page's not completely free and it's unbuddied */ 630 - spin_lock(&pool->lock); 631 - list_add(&zhdr->buddy, &unbuddied[fchunks]); 632 - spin_unlock(&pool->lock); 633 - zhdr->cpu = smp_processor_id(); 432 + if (unlikely(PageIsolated(page) || 433 + test_bit(PAGE_STALE, &page->private))) { 434 + z3fold_page_unlock(zhdr); 435 + return; 634 436 } 635 - put_cpu_ptr(pool->unbuddied); 437 + 438 + z3fold_compact_page(zhdr); 439 + add_to_unbuddied(pool, zhdr); 636 440 z3fold_page_unlock(zhdr); 637 441 } 638 442 ··· 639 453 do_compact_page(zhdr, false); 640 454 } 641 455 456 + /* 
returns _locked_ z3fold page header or NULL */ 457 + static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, 458 + size_t size, bool can_sleep) 459 + { 460 + struct z3fold_header *zhdr = NULL; 461 + struct page *page; 462 + struct list_head *unbuddied; 463 + int chunks = size_to_chunks(size), i; 464 + 465 + lookup: 466 + /* First, try to find an unbuddied z3fold page. */ 467 + unbuddied = get_cpu_ptr(pool->unbuddied); 468 + for_each_unbuddied_list(i, chunks) { 469 + struct list_head *l = &unbuddied[i]; 470 + 471 + zhdr = list_first_entry_or_null(READ_ONCE(l), 472 + struct z3fold_header, buddy); 473 + 474 + if (!zhdr) 475 + continue; 476 + 477 + /* Re-check under lock. */ 478 + spin_lock(&pool->lock); 479 + l = &unbuddied[i]; 480 + if (unlikely(zhdr != list_first_entry(READ_ONCE(l), 481 + struct z3fold_header, buddy)) || 482 + !z3fold_page_trylock(zhdr)) { 483 + spin_unlock(&pool->lock); 484 + zhdr = NULL; 485 + put_cpu_ptr(pool->unbuddied); 486 + if (can_sleep) 487 + cond_resched(); 488 + goto lookup; 489 + } 490 + list_del_init(&zhdr->buddy); 491 + zhdr->cpu = -1; 492 + spin_unlock(&pool->lock); 493 + 494 + page = virt_to_page(zhdr); 495 + if (test_bit(NEEDS_COMPACTING, &page->private)) { 496 + z3fold_page_unlock(zhdr); 497 + zhdr = NULL; 498 + put_cpu_ptr(pool->unbuddied); 499 + if (can_sleep) 500 + cond_resched(); 501 + goto lookup; 502 + } 503 + 504 + /* 505 + * this page could not be removed from its unbuddied 506 + * list while pool lock was held, and then we've taken 507 + * page lock so kref_put could not be called before 508 + * we got here, so it's safe to just call kref_get() 509 + */ 510 + kref_get(&zhdr->refcount); 511 + break; 512 + } 513 + put_cpu_ptr(pool->unbuddied); 514 + 515 + if (!zhdr) { 516 + int cpu; 517 + 518 + /* look for _exact_ match on other cpus' lists */ 519 + for_each_online_cpu(cpu) { 520 + struct list_head *l; 521 + 522 + unbuddied = per_cpu_ptr(pool->unbuddied, cpu); 523 + spin_lock(&pool->lock); 524 + l = &unbuddied[chunks]; 525 + 526 + zhdr = list_first_entry_or_null(READ_ONCE(l), 527 + struct z3fold_header, buddy); 528 + 529 + if (!zhdr || !z3fold_page_trylock(zhdr)) { 530 + spin_unlock(&pool->lock); 531 + zhdr = NULL; 532 + continue; 533 + } 534 + list_del_init(&zhdr->buddy); 535 + zhdr->cpu = -1; 536 + spin_unlock(&pool->lock); 537 + 538 + page = virt_to_page(zhdr); 539 + if (test_bit(NEEDS_COMPACTING, &page->private)) { 540 + z3fold_page_unlock(zhdr); 541 + zhdr = NULL; 542 + if (can_sleep) 543 + cond_resched(); 544 + continue; 545 + } 546 + kref_get(&zhdr->refcount); 547 + break; 548 + } 549 + } 550 + 551 + return zhdr; 552 + } 642 553 643 554 /* 644 555 * API Functions ··· 759 476 pool = kzalloc(sizeof(struct z3fold_pool), gfp); 760 477 if (!pool) 761 478 goto out; 479 + pool->c_handle = kmem_cache_create("z3fold_handle", 480 + sizeof(struct z3fold_buddy_slots), 481 + SLOTS_ALIGN, 0, NULL); 482 + if (!pool->c_handle) 483 + goto out_c; 762 484 spin_lock_init(&pool->lock); 763 485 spin_lock_init(&pool->stale_lock); 764 486 pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); ··· 785 497 pool->release_wq = create_singlethread_workqueue(pool->name); 786 498 if (!pool->release_wq) 787 499 goto out_wq; 500 + if (z3fold_register_migration(pool)) 501 + goto out_rwq; 788 502 INIT_WORK(&pool->work, free_pages_work); 789 503 pool->ops = ops; 790 504 return pool; 791 505 506 + out_rwq: 507 + destroy_workqueue(pool->release_wq); 792 508 out_wq: 793 509 destroy_workqueue(pool->compact_wq); 794 510 out_unbuddied: 795 511 
free_percpu(pool->unbuddied); 796 512 out_pool: 513 + kmem_cache_destroy(pool->c_handle); 514 + out_c: 797 515 kfree(pool); 798 516 out: 799 517 return NULL; ··· 813 519 */ 814 520 static void z3fold_destroy_pool(struct z3fold_pool *pool) 815 521 { 522 + kmem_cache_destroy(pool->c_handle); 523 + z3fold_unregister_migration(pool); 816 524 destroy_workqueue(pool->release_wq); 817 525 destroy_workqueue(pool->compact_wq); 818 526 kfree(pool); ··· 842 546 static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, 843 547 unsigned long *handle) 844 548 { 845 - int chunks = 0, i, freechunks; 549 + int chunks = size_to_chunks(size); 846 550 struct z3fold_header *zhdr = NULL; 847 551 struct page *page = NULL; 848 552 enum buddy bud; ··· 857 561 if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) 858 562 bud = HEADLESS; 859 563 else { 860 - struct list_head *unbuddied; 861 - chunks = size_to_chunks(size); 862 - 863 - lookup: 864 - /* First, try to find an unbuddied z3fold page. */ 865 - unbuddied = get_cpu_ptr(pool->unbuddied); 866 - for_each_unbuddied_list(i, chunks) { 867 - struct list_head *l = &unbuddied[i]; 868 - 869 - zhdr = list_first_entry_or_null(READ_ONCE(l), 870 - struct z3fold_header, buddy); 871 - 872 - if (!zhdr) 873 - continue; 874 - 875 - /* Re-check under lock. */ 876 - spin_lock(&pool->lock); 877 - l = &unbuddied[i]; 878 - if (unlikely(zhdr != list_first_entry(READ_ONCE(l), 879 - struct z3fold_header, buddy)) || 880 - !z3fold_page_trylock(zhdr)) { 881 - spin_unlock(&pool->lock); 882 - put_cpu_ptr(pool->unbuddied); 883 - goto lookup; 884 - } 885 - list_del_init(&zhdr->buddy); 886 - zhdr->cpu = -1; 887 - spin_unlock(&pool->lock); 888 - 889 - page = virt_to_page(zhdr); 890 - if (test_bit(NEEDS_COMPACTING, &page->private)) { 891 - z3fold_page_unlock(zhdr); 892 - zhdr = NULL; 893 - put_cpu_ptr(pool->unbuddied); 894 - if (can_sleep) 895 - cond_resched(); 896 - goto lookup; 897 - } 898 - 899 - /* 900 - * this page could not be removed from its unbuddied 901 - * list while pool lock was held, and then we've taken 902 - * page lock so kref_put could not be called before 903 - * we got here, so it's safe to just call kref_get() 904 - */ 905 - kref_get(&zhdr->refcount); 906 - break; 907 - } 908 - put_cpu_ptr(pool->unbuddied); 909 - 564 + retry: 565 + zhdr = __z3fold_alloc(pool, size, can_sleep); 910 566 if (zhdr) { 911 567 if (zhdr->first_chunks == 0) { 912 568 if (zhdr->middle_chunks != 0 && ··· 878 630 z3fold_page_unlock(zhdr); 879 631 pr_err("No free chunks in unbuddied\n"); 880 632 WARN_ON(1); 881 - goto lookup; 633 + goto retry; 882 634 } 635 + page = virt_to_page(zhdr); 883 636 goto found; 884 637 } 885 638 bud = FIRST; ··· 911 662 if (!page) 912 663 return -ENOMEM; 913 664 914 - atomic64_inc(&pool->pages_nr); 915 665 zhdr = init_z3fold_page(page, pool); 666 + if (!zhdr) { 667 + __free_page(page); 668 + return -ENOMEM; 669 + } 670 + atomic64_inc(&pool->pages_nr); 916 671 917 672 if (bud == HEADLESS) { 918 673 set_bit(PAGE_HEADLESS, &page->private); 919 674 goto headless; 920 675 } 676 + __SetPageMovable(page, pool->inode->i_mapping); 921 677 z3fold_page_lock(zhdr); 922 678 923 679 found: ··· 934 680 zhdr->middle_chunks = chunks; 935 681 zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; 936 682 } 937 - 938 - if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || 939 - zhdr->middle_chunks == 0) { 940 - struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); 941 - 942 - /* Add to unbuddied list */ 943 - freechunks = num_free_chunks(zhdr); 944 - 
spin_lock(&pool->lock); 945 - list_add(&zhdr->buddy, &unbuddied[freechunks]); 946 - spin_unlock(&pool->lock); 947 - zhdr->cpu = smp_processor_id(); 948 - put_cpu_ptr(pool->unbuddied); 949 - } 683 + add_to_unbuddied(pool, zhdr); 950 684 951 685 headless: 952 686 spin_lock(&pool->lock); ··· 981 739 spin_lock(&pool->lock); 982 740 list_del(&page->lru); 983 741 spin_unlock(&pool->lock); 984 - free_z3fold_page(page); 742 + free_z3fold_page(page, true); 985 743 atomic64_dec(&pool->pages_nr); 986 744 } 987 745 return; ··· 1008 766 return; 1009 767 } 1010 768 769 + free_handle(handle); 1011 770 if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) { 1012 771 atomic64_dec(&pool->pages_nr); 1013 772 return; ··· 1017 774 z3fold_page_unlock(zhdr); 1018 775 return; 1019 776 } 1020 - if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { 777 + if (unlikely(PageIsolated(page)) || 778 + test_and_set_bit(NEEDS_COMPACTING, &page->private)) { 1021 779 z3fold_page_unlock(zhdr); 1022 780 return; 1023 781 } ··· 1099 855 if (test_and_set_bit(PAGE_CLAIMED, &page->private)) 1100 856 continue; 1101 857 1102 - zhdr = page_address(page); 858 + if (unlikely(PageIsolated(page))) 859 + continue; 1103 860 if (test_bit(PAGE_HEADLESS, &page->private)) 1104 861 break; 1105 862 863 + zhdr = page_address(page); 1106 864 if (!z3fold_page_trylock(zhdr)) { 1107 865 zhdr = NULL; 1108 866 continue; /* can't evict at this point */ ··· 1165 919 next: 1166 920 if (test_bit(PAGE_HEADLESS, &page->private)) { 1167 921 if (ret == 0) { 1168 - free_z3fold_page(page); 922 + free_z3fold_page(page, true); 1169 923 atomic64_dec(&pool->pages_nr); 1170 924 return 0; 1171 925 } ··· 1242 996 break; 1243 997 } 1244 998 999 + if (addr) 1000 + zhdr->mapped_count++; 1245 1001 z3fold_page_unlock(zhdr); 1246 1002 out: 1247 1003 return addr; ··· 1270 1022 buddy = handle_to_buddy(handle); 1271 1023 if (buddy == MIDDLE) 1272 1024 clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); 1025 + zhdr->mapped_count--; 1273 1026 z3fold_page_unlock(zhdr); 1274 1027 } 1275 1028 ··· 1284 1035 { 1285 1036 return atomic64_read(&pool->pages_nr); 1286 1037 } 1038 + 1039 + static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) 1040 + { 1041 + struct z3fold_header *zhdr; 1042 + struct z3fold_pool *pool; 1043 + 1044 + VM_BUG_ON_PAGE(!PageMovable(page), page); 1045 + VM_BUG_ON_PAGE(PageIsolated(page), page); 1046 + 1047 + if (test_bit(PAGE_HEADLESS, &page->private)) 1048 + return false; 1049 + 1050 + zhdr = page_address(page); 1051 + z3fold_page_lock(zhdr); 1052 + if (test_bit(NEEDS_COMPACTING, &page->private) || 1053 + test_bit(PAGE_STALE, &page->private)) 1054 + goto out; 1055 + 1056 + pool = zhdr_to_pool(zhdr); 1057 + 1058 + if (zhdr->mapped_count == 0) { 1059 + kref_get(&zhdr->refcount); 1060 + if (!list_empty(&zhdr->buddy)) 1061 + list_del_init(&zhdr->buddy); 1062 + spin_lock(&pool->lock); 1063 + if (!list_empty(&page->lru)) 1064 + list_del(&page->lru); 1065 + spin_unlock(&pool->lock); 1066 + z3fold_page_unlock(zhdr); 1067 + return true; 1068 + } 1069 + out: 1070 + z3fold_page_unlock(zhdr); 1071 + return false; 1072 + } 1073 + 1074 + static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage, 1075 + struct page *page, enum migrate_mode mode) 1076 + { 1077 + struct z3fold_header *zhdr, *new_zhdr; 1078 + struct z3fold_pool *pool; 1079 + struct address_space *new_mapping; 1080 + 1081 + VM_BUG_ON_PAGE(!PageMovable(page), page); 1082 + VM_BUG_ON_PAGE(!PageIsolated(page), page); 1083 + 1084 + zhdr = page_address(page); 1085 + 
pool = zhdr_to_pool(zhdr); 1086 + 1087 + if (!trylock_page(page)) 1088 + return -EAGAIN; 1089 + 1090 + if (!z3fold_page_trylock(zhdr)) { 1091 + unlock_page(page); 1092 + return -EAGAIN; 1093 + } 1094 + if (zhdr->mapped_count != 0) { 1095 + z3fold_page_unlock(zhdr); 1096 + unlock_page(page); 1097 + return -EBUSY; 1098 + } 1099 + new_zhdr = page_address(newpage); 1100 + memcpy(new_zhdr, zhdr, PAGE_SIZE); 1101 + newpage->private = page->private; 1102 + page->private = 0; 1103 + z3fold_page_unlock(zhdr); 1104 + spin_lock_init(&new_zhdr->page_lock); 1105 + new_mapping = page_mapping(page); 1106 + __ClearPageMovable(page); 1107 + ClearPagePrivate(page); 1108 + 1109 + get_page(newpage); 1110 + z3fold_page_lock(new_zhdr); 1111 + if (new_zhdr->first_chunks) 1112 + encode_handle(new_zhdr, FIRST); 1113 + if (new_zhdr->last_chunks) 1114 + encode_handle(new_zhdr, LAST); 1115 + if (new_zhdr->middle_chunks) 1116 + encode_handle(new_zhdr, MIDDLE); 1117 + set_bit(NEEDS_COMPACTING, &newpage->private); 1118 + new_zhdr->cpu = smp_processor_id(); 1119 + spin_lock(&pool->lock); 1120 + list_add(&newpage->lru, &pool->lru); 1121 + spin_unlock(&pool->lock); 1122 + __SetPageMovable(newpage, new_mapping); 1123 + z3fold_page_unlock(new_zhdr); 1124 + 1125 + queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); 1126 + 1127 + page_mapcount_reset(page); 1128 + unlock_page(page); 1129 + put_page(page); 1130 + return 0; 1131 + } 1132 + 1133 + static void z3fold_page_putback(struct page *page) 1134 + { 1135 + struct z3fold_header *zhdr; 1136 + struct z3fold_pool *pool; 1137 + 1138 + zhdr = page_address(page); 1139 + pool = zhdr_to_pool(zhdr); 1140 + 1141 + z3fold_page_lock(zhdr); 1142 + if (!list_empty(&zhdr->buddy)) 1143 + list_del_init(&zhdr->buddy); 1144 + INIT_LIST_HEAD(&page->lru); 1145 + if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { 1146 + atomic64_dec(&pool->pages_nr); 1147 + return; 1148 + } 1149 + spin_lock(&pool->lock); 1150 + list_add(&page->lru, &pool->lru); 1151 + spin_unlock(&pool->lock); 1152 + z3fold_page_unlock(zhdr); 1153 + } 1154 + 1155 + static const struct address_space_operations z3fold_aops = { 1156 + .isolate_page = z3fold_page_isolate, 1157 + .migratepage = z3fold_page_migrate, 1158 + .putback_page = z3fold_page_putback, 1159 + }; 1287 1160 1288 1161 /***************** 1289 1162 * zpool ··· 1504 1133 1505 1134 static int __init init_z3fold(void) 1506 1135 { 1136 + int ret; 1137 + 1507 1138 /* Make sure the z3fold header is not larger than the page size */ 1508 1139 BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE); 1140 + ret = z3fold_mount(); 1141 + if (ret) 1142 + return ret; 1143 + 1509 1144 zpool_register_driver(&z3fold_zpool_driver); 1510 1145 1511 1146 return 0; ··· 1519 1142 1520 1143 static void __exit exit_z3fold(void) 1521 1144 { 1145 + z3fold_unmount(); 1522 1146 zpool_unregister_driver(&z3fold_zpool_driver); 1523 1147 } 1524 1148
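Not part of the diff above, just an illustrative sketch: the z3fold changes wire the pool's pages into the non-LRU movable page machinery by providing isolate/migrate/putback callbacks in an address_space_operations and tagging eligible pages with __SetPageMovable(). The mydrv_* names below are hypothetical stand-ins for the real z3fold_page_isolate/migrate/putback callbacks; this only sketches the general pattern under the kernel APIs of this series.

/*
 * Minimal sketch of the non-LRU movable page pattern followed by the
 * z3fold changes above. mydrv_* names are illustrative only.
 */
#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/string.h>

static bool mydrv_isolate(struct page *page, isolate_mode_t mode)
{
	/* lock driver metadata, take a reference, unhook the page from
	 * the driver's own lists; return false if it cannot move now */
	return true;
}

static int mydrv_migrate(struct address_space *mapping, struct page *newpage,
			 struct page *page, enum migrate_mode mode)
{
	/* copy payload and metadata, then hand "movable" status over */
	memcpy(page_address(newpage), page_address(page), PAGE_SIZE);
	__SetPageMovable(newpage, page_mapping(page));
	__ClearPageMovable(page);
	return MIGRATEPAGE_SUCCESS;
}

static void mydrv_putback(struct page *page)
{
	/* migration failed: restore the page to the driver's lists */
}

static const struct address_space_operations mydrv_aops = {
	.isolate_page	= mydrv_isolate,
	.migratepage	= mydrv_migrate,
	.putback_page	= mydrv_putback,
};

/*
 * When a page becomes eligible, the driver (like z3fold here) marks it:
 *	__SetPageMovable(page, inode->i_mapping);
 * where inode comes from a driver-mounted pseudo filesystem whose
 * mapping->a_ops points at the table above.
 */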
+1 -1
net/ceph/pagevec.c
··· 27 27 while (got < num_pages) { 28 28 rc = get_user_pages_fast( 29 29 (unsigned long)data + ((unsigned long)got * PAGE_SIZE), 30 - num_pages - got, write_page, pages + got); 30 + num_pages - got, write_page ? FOLL_WRITE : 0, pages + got); 31 31 if (rc < 0) 32 32 break; 33 33 BUG_ON(rc == 0);
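For context (not part of the diff): after this series, get_user_pages_fast() takes a gup_flags mask as its third argument instead of a write boolean, so callers that need writable pins pass FOLL_WRITE explicitly, as the ceph conversion above does. A hedged sketch of a converted caller, with illustrative names:

#include <linux/mm.h>

static int pin_user_buffer(unsigned long uaddr, int nr_pages,
			   bool writable, struct page **pages)
{
	unsigned int gup_flags = writable ? FOLL_WRITE : 0;

	/* third argument is now a FOLL_* mask, not an int write flag */
	return get_user_pages_fast(uaddr, nr_pages, gup_flags, pages);
}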
+1 -1
net/rds/info.c
··· 193 193 ret = -ENOMEM; 194 194 goto out; 195 195 } 196 - ret = get_user_pages_fast(start, nr_pages, 1, pages); 196 + ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); 197 197 if (ret != nr_pages) { 198 198 if (ret > 0) 199 199 nr_pages = ret;
+2 -1
net/rds/rdma.c
··· 158 158 { 159 159 int ret; 160 160 161 - ret = get_user_pages_fast(user_addr, nr_pages, write, pages); 161 + ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0, 162 + pages); 162 163 163 164 if (ret >= 0 && ret < nr_pages) { 164 165 while (ret--)
+2 -2
net/xdp/xdp_umem.c
··· 253 253 return -ENOMEM; 254 254 255 255 down_read(&current->mm->mmap_sem); 256 - npgs = get_user_pages_longterm(umem->address, umem->npgs, 257 - gup_flags, &umem->pgs[0], NULL); 256 + npgs = get_user_pages(umem->address, umem->npgs, 257 + gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); 258 258 up_read(&current->mm->mmap_sem); 259 259 260 260 if (npgs != umem->npgs) {
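Again for context only: the xdp_umem hunk drops get_user_pages_longterm() in favour of plain get_user_pages() with FOLL_LONGTERM or'd into gup_flags, called under mmap_sem. A rough sketch of that pattern, with illustrative names and a writable, long-term pin assumed:

#include <linux/mm.h>
#include <linux/sched.h>

static long pin_longterm(unsigned long uaddr, unsigned long npages,
			 struct page **pages)
{
	long got;

	down_read(&current->mm->mmap_sem);
	/* FOLL_LONGTERM marks the pin as long-lived (e.g. for DMA) */
	got = get_user_pages(uaddr, npages, FOLL_WRITE | FOLL_LONGTERM,
			     pages, NULL);
	up_read(&current->mm->mmap_sem);

	return got;
}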
+2 -1
virt/kvm/kvm_main.c
··· 391 391 spin_unlock(&kvm->mmu_lock); 392 392 393 393 ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start, 394 - range->end, range->blockable); 394 + range->end, 395 + mmu_notifier_range_blockable(range)); 395 396 396 397 srcu_read_unlock(&kvm->srcu, idx); 397 398
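The kvm hunk switches from reading range->blockable directly to the mmu_notifier_range_blockable() helper. A hedged sketch of an invalidate_range_start() callback using that helper; mydrv_* is an illustrative name, not code from this merge:

#include <linux/mmu_notifier.h>

static int mydrv_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	/* non-blockable contexts must not sleep; ask the core to retry */
	if (!mmu_notifier_range_blockable(range))
		return -EAGAIN;

	/* ... flush secondary mappings for [range->start, range->end) ... */
	return 0;
}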