Merge tag 'fsnotify_for_v6.14-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs

Pull fsnotify reverts from Jan Kara:
"Syzbot has found out that fsnotify HSM events generated on page fault
can be generated while we already hold freeze protection for the
filesystem (when you do buffered write from a buffer which is mmapped
file on the same filesystem) which violates expectations for HSM
events and could lead to deadlocks of HSM clients with filesystem
freezing.

Since it's quite late in the cycle we've decided to revert the changes
implementing HSM events on page fault for now and instead just
generate one event for the whole range on mmap(2), so that the HSM
client can fetch the data at that moment"

* tag 'fsnotify_for_v6.14-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs:
Revert "fanotify: disable readahead if we have pre-content watches"
Revert "mm: don't allow huge faults for files with pre content watches"
Revert "fsnotify: generate pre-content permission event on page fault"
Revert "xfs: add pre-content fsnotify hook for DAX faults"
Revert "ext4: add pre-content fsnotify hook for DAX faults"
fsnotify: add pre-content hooks on mmap()
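
To make the problematic ordering concrete, below is a minimal userspace
sketch of the pattern syzbot exercised (hypothetical paths; both files are
assumed to live on the same HSM-watched filesystem, and error handling is
omitted). The copy inside write(2) happens under freeze protection, so a
page fault on the mmapped source buffer must not wait on an HSM client:

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical file covered by an HSM pre-content watch. */
	int src = open("/mnt/fs/watched", O_RDONLY);
	/* Destination file on the same filesystem. */
	int dst = open("/mnt/fs/out", O_WRONLY | O_CREAT, 0644);
	char *buf = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, src, 0);

	/*
	 * write(2) takes freeze protection (sb_start_write()) and then
	 * copies from 'buf'.  If that page is not resident, the copy
	 * faults; with the reverted patches the fault would emit an HSM
	 * pre-content event while freeze protection was already held.
	 */
	write(dst, buf, 4096);

	munmap(buf, 4096);
	close(src);
	close(dst);
	return 0;
}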

9 files changed, 24 insertions(+), 143 deletions(-)

fs/ext4/file.c | -3

···
 			return VM_FAULT_SIGBUS;
 		}
 	} else {
-		result = filemap_fsnotify_fault(vmf);
-		if (unlikely(result))
-			return result;
 		filemap_invalidate_lock_shared(mapping);
 	}
 	result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);

fs/xfs/xfs_file.c | -13

···
 
 	trace_xfs_read_fault(ip, order);
 
-	ret = filemap_fsnotify_fault(vmf);
-	if (unlikely(ret))
-		return ret;
 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
 	ret = xfs_dax_fault_locked(vmf, order, false);
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
···
 	vm_fault_t ret;
 
 	trace_xfs_write_fault(ip, order);
-	/*
-	 * Usually we get here from ->page_mkwrite callback but in case of DAX
-	 * we will get here also for ordinary write fault. Handle HSM
-	 * notifications for that case.
-	 */
-	if (IS_DAX(inode)) {
-		ret = filemap_fsnotify_fault(vmf);
-		if (unlikely(ret))
-			return ret;
-	}
 
 	sb_start_pagefault(inode->i_sb);
 	file_update_time(vmf->vma->vm_file);

include/linux/fsnotify.h | +21

···
 }
 
 /*
+ * fsnotify_mmap_perm - permission hook before mmap of file range
+ */
+static inline int fsnotify_mmap_perm(struct file *file, int prot,
+				     const loff_t off, size_t len)
+{
+	/*
+	 * mmap() generates only pre-content events.
+	 */
+	if (!file || likely(!FMODE_FSNOTIFY_HSM(file->f_mode)))
+		return 0;
+
+	return fsnotify_pre_content(&file->f_path, &off, len);
+}
+
+/*
  * fsnotify_truncate_perm - permission hook before file truncate
  */
 static inline int fsnotify_truncate_perm(const struct path *path, loff_t length)
···
 
 static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
 					  const loff_t *ppos, size_t count)
+{
+	return 0;
+}
+
+static inline int fsnotify_mmap_perm(struct file *file, int prot,
+				     const loff_t off, size_t len)
 {
 	return 0;
 }

include/linux/mm.h | -1

···
 extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		pgoff_t start_pgoff, pgoff_t end_pgoff);
 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
-extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf);
 
 extern unsigned long stack_guard_gap;
 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */

mm/filemap.c | -86

···
 #include <linux/splice.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
···
 	unsigned long vm_flags = vmf->vma->vm_flags;
 	unsigned int mmap_miss;
 
-	/*
-	 * If we have pre-content watches we need to disable readahead to make
-	 * sure that we don't populate our mapping with 0 filled pages that we
-	 * never emitted an event for.
-	 */
-	if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
-		return fpin;
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	/* Use the readahead code, even if readahead is disabled */
 	if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
···
 	struct file *fpin = NULL;
 	unsigned int mmap_miss;
 
-	/* See comment in do_sync_mmap_readahead. */
-	if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
-		return fpin;
-
 	/* If we don't want any read-ahead, don't bother */
 	if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
 		return fpin;
···
 	pte_unmap(ptep);
 	return ret;
 }
-
-/**
- * filemap_fsnotify_fault - maybe emit a pre-content event.
- * @vmf: struct vm_fault containing details of the fault.
- *
- * If we have a pre-content watch on this file we will emit an event for this
- * range. If we return anything the fault caller should return immediately, we
- * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the
- * fault again and then the fault handler will run the second time through.
- *
- * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
- */
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
-	struct file *fpin = NULL;
-	int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
-	loff_t pos = vmf->pgoff >> PAGE_SHIFT;
-	size_t count = PAGE_SIZE;
-	int err;
-
-	/*
-	 * We already did this and now we're retrying with everything locked,
-	 * don't emit the event and continue.
-	 */
-	if (vmf->flags & FAULT_FLAG_TRIED)
-		return 0;
-
-	/* No watches, we're done. */
-	if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode)))
-		return 0;
-
-	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-	if (!fpin)
-		return VM_FAULT_SIGBUS;
-
-	err = fsnotify_file_area_perm(fpin, mask, &pos, count);
-	fput(fpin);
-	if (err)
-		return VM_FAULT_SIGBUS;
-	return VM_FAULT_RETRY;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
 
 /**
  * filemap_fault - read in file data for page fault handling
···
 	 * or because readahead was otherwise unable to retrieve it.
 	 */
 	if (unlikely(!folio_test_uptodate(folio))) {
-		/*
-		 * If this is a precontent file we have can now emit an event to
-		 * try and populate the folio.
-		 */
-		if (!(vmf->flags & FAULT_FLAG_TRIED) &&
-		    unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
-			loff_t pos = folio_pos(folio);
-			size_t count = folio_size(folio);
-
-			/* We're NOWAIT, we have to retry. */
-			if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
-				folio_unlock(folio);
-				goto out_retry;
-			}
-
-			if (mapping_locked)
-				filemap_invalidate_unlock_shared(mapping);
-			mapping_locked = false;
-
-			folio_unlock(folio);
-			fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-			if (!fpin)
-				goto out_retry;
-
-			error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos,
-							count);
-			if (error)
-				ret = VM_FAULT_SIGBUS;
-			goto out_retry;
-		}
-
 		/*
 		 * If the invalidate lock is not held, the folio was in cache
 		 * and uptodate and now it is not. Strange but possible since we

mm/memory.c | -19

···
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
 #include <linux/sched/sysctl.h>
-#include <linux/fsnotify.h>
 
 #include <trace/events/kmem.h>
 
···
 static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
-
 	if (vma_is_anonymous(vma))
 		return do_huge_pmd_anonymous_page(vmf);
-	/*
-	 * Currently we just emit PAGE_SIZE for our fault events, so don't allow
-	 * a huge fault if we have a pre content watch on this file. This would
-	 * be trivial to support, but there would need to be tests to ensure
-	 * this works properly and those don't exist currently.
-	 */
-	if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
-		return VM_FAULT_FALLBACK;
 	if (vma->vm_ops->huge_fault)
 		return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
 	return VM_FAULT_FALLBACK;
···
 	}
 
 	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
-		/* See comment in create_huge_pmd. */
-		if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
-			goto split;
 		if (vma->vm_ops->huge_fault) {
 			ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
 			if (!(ret & VM_FAULT_FALLBACK))
···
 	/* No support for anonymous transparent PUD pages yet */
 	if (vma_is_anonymous(vma))
 		return VM_FAULT_FALLBACK;
-	/* See comment in create_huge_pmd. */
-	if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
-		return VM_FAULT_FALLBACK;
 	if (vma->vm_ops->huge_fault)
 		return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
···
 	if (vma_is_anonymous(vma))
 		goto split;
 	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
-		/* See comment in create_huge_pmd. */
-		if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
-			goto split;
 		if (vma->vm_ops->huge_fault) {
 			ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
 			if (!(ret & VM_FAULT_FALLBACK))

mm/nommu.c | -7

···
 }
 EXPORT_SYMBOL(remap_vmalloc_range);
 
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
-	BUG();
-	return 0;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
-
 vm_fault_t filemap_fault(struct vm_fault *vmf)
 {
 	BUG();

mm/readahead.c | -14

···
 #include <linux/blk-cgroup.h>
 #include <linux/fadvise.h>
 #include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
 
 #include "internal.h"
 
···
 	pgoff_t prev_index, miss;
 
 	/*
-	 * If we have pre-content watches we need to disable readahead to make
-	 * sure that we don't find 0 filled pages in cache that we never emitted
-	 * events for. Filesystems supporting HSM must make sure to not call
-	 * this function with ractl->file unset for files handled by HSM.
-	 */
-	if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
-		return;
-
-	/*
 	 * Even if readahead is disabled, issue this request as readahead
 	 * as we'll need it to satisfy the requested range. The forced
 	 * readahead will do the right thing and limit the read to just the
···
 
 	/* no readahead */
 	if (!ra->ra_pages)
-		return;
-
-	/* See the comment in page_cache_sync_ra. */
-	if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
 		return;
 
 	/*

mm/util.c | +3

···
 #include <linux/processor.h>
 #include <linux/sizes.h>
 #include <linux/compat.h>
+#include <linux/fsnotify.h>
 
 #include <linux/uaccess.h>
 
···
 	LIST_HEAD(uf);
 
 	ret = security_mmap_file(file, prot, flag);
+	if (!ret)
+		ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len);
 	if (!ret) {
 		if (mmap_write_lock_killable(mm))
 			return -EINTR;
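
With this change an HSM client receives a single pre-content event covering
the whole mapped range at mmap(2) time, rather than per-page events at fault
time. For illustration only, here is a rough sketch of the consumer side,
assuming the fanotify pre-content uapi from this cycle (FAN_CLASS_PRE_CONTENT,
FAN_PRE_ACCESS, permission responses); the exact info-record layout and the
fetch-from-secondary-storage step are elided, and none of this code is part
of this merge:

#include <fcntl.h>
#include <unistd.h>
#include <sys/fanotify.h>

/* Sketch of an HSM daemon loop; error handling is omitted. */
static void handle_events(int fan)
{
	char buf[4096];
	ssize_t len = read(fan, buf, sizeof(buf));

	for (struct fanotify_event_metadata *md = (void *)buf;
	     FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
		if (md->mask & FAN_PRE_ACCESS) {
			/*
			 * Assumed: an info record after the metadata names
			 * the file range being accessed; the daemon would
			 * pwrite() the real content into md->fd for that
			 * range before allowing the access to proceed.
			 */
			struct fanotify_response resp = {
				.fd = md->fd,
				.response = FAN_ALLOW,
			};
			write(fan, &resp, sizeof(resp));
		}
		close(md->fd);
	}
}

int main(void)
{
	int fan = fanotify_init(FAN_CLASS_PRE_CONTENT | FAN_CLOEXEC,
				O_RDWR | O_LARGEFILE);

	/* Watch one populated-on-demand file (hypothetical path). */
	fanotify_mark(fan, FAN_MARK_ADD, FAN_PRE_ACCESS,
		      AT_FDCWD, "/mnt/hsm/file");

	for (;;)
		handle_events(fan);
}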