Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm/hugetlbfs: update hugetlbfs to use mmap_prepare

Since we can now perform actions after the VMA is established via
mmap_prepare, use desc->action.success_hook to set up the hugetlb lock
once the VMA is set up.

We also make changes throughout hugetlbfs to make this possible.

Note that we must hide newly established hugetlb VMAs from the rmap until
the operation is entirely complete as we establish a hugetlb lock during
VMA setup that can be raced by rmap users.

Link: https://lkml.kernel.org/r/b1afa16d3cfa585a03df9ae215ae9f905b3f0ed7.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Lorenzo Stoakes and committed by
Andrew Morton
ea52cb24 da003453

+97 -54
+33 -13
fs/hugetlbfs/inode.c
··· 96 96 #define PGOFF_LOFFT_MAX \ 97 97 (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) 98 98 99 - static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) 99 + static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma) 100 100 { 101 + /* Unfortunate we have to reassign vma->vm_private_data. */ 102 + return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma); 103 + } 104 + 105 + static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) 106 + { 107 + struct file *file = desc->file; 101 108 struct inode *inode = file_inode(file); 102 109 loff_t len, vma_len; 103 110 int ret; ··· 119 112 * way when do_mmap unwinds (may be important on powerpc 120 113 * and ia64). 121 114 */ 122 - vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); 123 - vma->vm_ops = &hugetlb_vm_ops; 115 + desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; 116 + desc->vm_ops = &hugetlb_vm_ops; 124 117 125 118 /* 126 119 * page based offset in vm_pgoff could be sufficiently large to ··· 129 122 * sizeof(unsigned long). So, only check in those instances. 
130 123 */ 131 124 if (sizeof(unsigned long) == sizeof(loff_t)) { 132 - if (vma->vm_pgoff & PGOFF_LOFFT_MAX) 125 + if (desc->pgoff & PGOFF_LOFFT_MAX) 133 126 return -EINVAL; 134 127 } 135 128 136 129 /* must be huge page aligned */ 137 - if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) 130 + if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) 138 131 return -EINVAL; 139 132 140 - vma_len = (loff_t)(vma->vm_end - vma->vm_start); 141 - len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 133 + vma_len = (loff_t)vma_desc_size(desc); 134 + len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT); 142 135 /* check for overflow */ 143 136 if (len < vma_len) 144 137 return -EINVAL; ··· 148 141 149 142 ret = -ENOMEM; 150 143 151 - vm_flags = vma->vm_flags; 144 + vm_flags = desc->vm_flags; 152 145 /* 153 146 * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip 154 147 * reserving here. Note: only for SHM hugetlbfs file, the inode ··· 158 151 vm_flags |= VM_NORESERVE; 159 152 160 153 if (hugetlb_reserve_pages(inode, 161 - vma->vm_pgoff >> huge_page_order(h), 162 - len >> huge_page_shift(h), vma, 163 - vm_flags) < 0) 154 + desc->pgoff >> huge_page_order(h), 155 + len >> huge_page_shift(h), desc, 156 + vm_flags) < 0) 164 157 goto out; 165 158 166 159 ret = 0; 167 - if (vma->vm_flags & VM_WRITE && inode->i_size < len) 160 + if ((desc->vm_flags & VM_WRITE) && inode->i_size < len) 168 161 i_size_write(inode, len); 169 162 out: 170 163 inode_unlock(inode); 171 164 165 + if (!ret) { 166 + /* Allocate the VMA lock after we set it up. */ 167 + desc->action.success_hook = hugetlb_file_mmap_prepare_success; 168 + /* 169 + * We cannot permit the rmap finding this VMA in the time 170 + * between the VMA being inserted into the VMA tree and the 171 + * completion/success hook being invoked. 172 + * 173 + * This is because we establish a per-VMA hugetlb lock which can 174 + * be raced by rmap. 
175 + */ 176 + desc->action.hide_from_rmap_until_complete = true; 177 + } 172 178 return ret; 173 179 } 174 180 ··· 1240 1220 1241 1221 static const struct file_operations hugetlbfs_file_operations = { 1242 1222 .read_iter = hugetlbfs_read_iter, 1243 - .mmap = hugetlbfs_file_mmap, 1223 + .mmap_prepare = hugetlbfs_file_mmap_prepare, 1244 1224 .fsync = noop_fsync, 1245 1225 .get_unmapped_area = hugetlb_get_unmapped_area, 1246 1226 .llseek = default_llseek,
+7 -2
include/linux/hugetlb.h
··· 150 150 struct folio **foliop); 151 151 #endif /* CONFIG_USERFAULTFD */ 152 152 long hugetlb_reserve_pages(struct inode *inode, long from, long to, 153 - struct vm_area_struct *vma, 154 - vm_flags_t vm_flags); 153 + struct vm_area_desc *desc, vm_flags_t vm_flags); 155 154 long hugetlb_unreserve_pages(struct inode *inode, long start, long end, 156 155 long freed); 157 156 bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); ··· 279 280 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); 280 281 void fixup_hugetlb_reservations(struct vm_area_struct *vma); 281 282 void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); 283 + int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); 282 284 283 285 #else /* !CONFIG_HUGETLB_PAGE */ 284 286 ··· 465 465 } 466 466 467 467 static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} 468 + 469 + static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) 470 + { 471 + return 0; 472 + } 468 473 469 474 #endif /* !CONFIG_HUGETLB_PAGE */ 470 475
+10 -5
include/linux/hugetlb_inline.h
··· 2 2 #ifndef _LINUX_HUGETLB_INLINE_H 3 3 #define _LINUX_HUGETLB_INLINE_H 4 4 5 - #ifdef CONFIG_HUGETLB_PAGE 6 - 7 5 #include <linux/mm.h> 8 6 9 - static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) 7 + #ifdef CONFIG_HUGETLB_PAGE 8 + 9 + static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) 10 10 { 11 - return !!(vma->vm_flags & VM_HUGETLB); 11 + return !!(vm_flags & VM_HUGETLB); 12 12 } 13 13 14 14 #else 15 15 16 - static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) 16 + static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) 17 17 { 18 18 return false; 19 19 } 20 20 21 21 #endif 22 + 23 + static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) 24 + { 25 + return is_vm_hugetlb_flags(vma->vm_flags); 26 + } 22 27 23 28 #endif
+47 -34
mm/hugetlb.c
··· 119 119 /* Forward declaration */ 120 120 static int hugetlb_acct_memory(struct hstate *h, long delta); 121 121 static void hugetlb_vma_lock_free(struct vm_area_struct *vma); 122 - static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); 123 122 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); 124 123 static void hugetlb_unshare_pmds(struct vm_area_struct *vma, 125 124 unsigned long start, unsigned long end, bool take_locks); ··· 437 438 } 438 439 } 439 440 440 - static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) 441 + /* 442 + * vma specific semaphore used for pmd sharing and fault/truncation 443 + * synchronization 444 + */ 445 + int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) 441 446 { 442 447 struct hugetlb_vma_lock *vma_lock; 443 448 444 449 /* Only establish in (flags) sharable vmas */ 445 450 if (!vma || !(vma->vm_flags & VM_MAYSHARE)) 446 - return; 451 + return 0; 447 452 448 453 /* Should never get here with non-NULL vm_private_data */ 449 454 if (vma->vm_private_data) 450 - return; 455 + return -EINVAL; 451 456 452 457 vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); 453 458 if (!vma_lock) { ··· 466 463 * allocation failure. 
467 464 */ 468 465 pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); 469 - return; 466 + return -EINVAL; 470 467 } 471 468 472 469 kref_init(&vma_lock->refs); 473 470 init_rwsem(&vma_lock->rw_sema); 474 471 vma_lock->vma = vma; 475 472 vma->vm_private_data = vma_lock; 473 + 474 + return 0; 476 475 } 477 476 478 477 /* Helper that removes a struct file_region from the resv_map cache and returns ··· 1206 1201 } 1207 1202 } 1208 1203 1209 - static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 1210 - { 1211 - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 1212 - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 1213 - 1214 - set_vma_private_data(vma, (unsigned long)map); 1215 - } 1216 - 1217 1204 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 1218 1205 { 1219 - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 1220 - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 1206 + VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma); 1207 + VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma); 1221 1208 1222 1209 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 1210 + } 1211 + 1212 + static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) 1213 + { 1214 + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); 1215 + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); 1216 + 1217 + desc->private_data = map; 1218 + } 1219 + 1220 + static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) 1221 + { 1222 + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); 1223 + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); 1224 + 1225 + desc->private_data = (void *)((unsigned long)desc->private_data | flags); 1223 1226 } 1224 1227 1225 1228 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) ··· 1235 1222 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 1236 1223 1237 1224 return (get_vma_private_data(vma) & flag) != 0; 1225 + } 1226 + 1227 + 
static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag) 1228 + { 1229 + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); 1230 + 1231 + return ((unsigned long)desc->private_data) & flag; 1238 1232 } 1239 1233 1240 1234 bool __vma_private_lock(struct vm_area_struct *vma) ··· 7290 7270 */ 7291 7271 7292 7272 long hugetlb_reserve_pages(struct inode *inode, 7293 - long from, long to, 7294 - struct vm_area_struct *vma, 7295 - vm_flags_t vm_flags) 7273 + long from, long to, 7274 + struct vm_area_desc *desc, 7275 + vm_flags_t vm_flags) 7296 7276 { 7297 7277 long chg = -1, add = -1, spool_resv, gbl_resv; 7298 7278 struct hstate *h = hstate_inode(inode); ··· 7308 7288 } 7309 7289 7310 7290 /* 7311 - * vma specific semaphore used for pmd sharing and fault/truncation 7312 - * synchronization 7313 - */ 7314 - hugetlb_vma_lock_alloc(vma); 7315 - 7316 - /* 7317 7291 * Only apply hugepage reservation if asked. At fault time, an 7318 7292 * attempt will be made for VM_NORESERVE to allocate a page 7319 7293 * without using reserves ··· 7319 7305 * Shared mappings base their reservation on the number of pages that 7320 7306 * are already allocated on behalf of the file. Private mappings need 7321 7307 * to reserve the full area even if read-only as mprotect() may be 7322 - * called to make the mapping read-write. Assume !vma is a shm mapping 7308 + * called to make the mapping read-write. 
Assume !desc is a shm mapping 7323 7309 */ 7324 - if (!vma || vma->vm_flags & VM_MAYSHARE) { 7310 + if (!desc || desc->vm_flags & VM_MAYSHARE) { 7325 7311 /* 7326 7312 * resv_map can not be NULL as hugetlb_reserve_pages is only 7327 7313 * called for inodes for which resv_maps were created (see ··· 7338 7324 7339 7325 chg = to - from; 7340 7326 7341 - set_vma_resv_map(vma, resv_map); 7342 - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 7327 + set_vma_desc_resv_map(desc, resv_map); 7328 + set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER); 7343 7329 } 7344 7330 7345 7331 if (chg < 0) ··· 7349 7335 chg * pages_per_huge_page(h), &h_cg) < 0) 7350 7336 goto out_err; 7351 7337 7352 - if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { 7338 + if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) { 7353 7339 /* For private mappings, the hugetlb_cgroup uncharge info hangs 7354 7340 * of the resv_map. 7355 7341 */ ··· 7383 7369 * consumed reservations are stored in the map. Hence, nothing 7384 7370 * else has to be done for private mappings here 7385 7371 */ 7386 - if (!vma || vma->vm_flags & VM_MAYSHARE) { 7372 + if (!desc || desc->vm_flags & VM_MAYSHARE) { 7387 7373 add = region_add(resv_map, from, to, regions_needed, h, h_cg); 7388 7374 7389 7375 if (unlikely(add < 0)) { ··· 7437 7423 hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), 7438 7424 chg * pages_per_huge_page(h), h_cg); 7439 7425 out_err: 7440 - hugetlb_vma_lock_free(vma); 7441 - if (!vma || vma->vm_flags & VM_MAYSHARE) 7426 + if (!desc || desc->vm_flags & VM_MAYSHARE) 7442 7427 /* Only call region_abort if the region_chg succeeded but the 7443 7428 * region_add failed or didn't run. 
7444 7429 */ 7445 7430 if (chg >= 0 && add < 0) 7446 7431 region_abort(resv_map, from, to, regions_needed); 7447 - if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 7432 + if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) { 7448 7433 kref_put(&resv_map->refs, resv_map_release); 7449 - set_vma_resv_map(vma, NULL); 7434 + set_vma_desc_resv_map(desc, NULL); 7450 7435 } 7451 7436 return chg < 0 ? chg : add < 0 ? add : -EINVAL; 7452 7437 }