vfs: get rid of old '->iterate' directory operation

All users now just use '->iterate_shared()', which only takes the
directory inode lock for reading.

Filesystems that never got converted to shared mode now instead use a
wrapper that drops the lock, re-takes it in write mode, calls the old
function, and then downgrades the lock back to read mode.

This way the VFS layer and other callers no longer need to care about
filesystems that never got converted to the modern era.

The filesystems that use the new wrapper are ceph, coda, exfat, jfs,
ntfs, ocfs2, overlayfs, and vboxsf.

Honestly, several of them look like they really could just iterate their
directories in shared mode and skip the wrapper entirely, but the point
of this change is to not change semantics or fix filesystems that
haven't been fixed in the last 7+ years, but to finally get rid of the
dual iterators.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>

authored by Linus Torvalds and committed by Christian Brauner 3e327154 0a2c2baa

Changed files
+95 -58
Documentation
filesystems
fs
ceph
coda
exfat
exportfs
jfs
ntfs
ocfs2
overlayfs
vboxsf
include
linux
+2 -3
Documentation/filesystems/locking.rst
··· 551 551 Note: this does not protect the file->f_pos against concurrent modifications 552 552 since this is something the userspace has to take care about. 553 553 554 - ->iterate() is called with i_rwsem exclusive. 555 - 556 - ->iterate_shared() is called with i_rwsem at least shared. 554 + ->iterate_shared() is called with i_rwsem held for reading, and with the 555 + file f_pos_lock held exclusively 557 556 558 557 ->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags. 559 558 Most instances call fasync_helper(), which does that maintenance, so it's
+10 -15
Documentation/filesystems/porting.rst
··· 537 537 538 538 **mandatory** 539 539 540 - ->readdir() is gone now; switch to ->iterate() 540 + ->readdir() is gone now; switch to ->iterate_shared() 541 541 542 542 **mandatory** 543 543 ··· 693 693 694 694 --- 695 695 696 - **recommended** 696 + **mandatory** 697 697 698 - ->iterate_shared() is added; it's a parallel variant of ->iterate(). 698 + ->iterate_shared() is added. 699 699 Exclusion on struct file level is still provided (as well as that 700 700 between it and lseek on the same struct file), but if your directory 701 701 has been opened several times, you can get these called in parallel. 702 702 Exclusion between that method and all directory-modifying ones is 703 703 still provided, of course. 704 704 705 - Often enough ->iterate() can serve as ->iterate_shared() without any 706 - changes - it is a read-only operation, after all. If you have any 707 - per-inode or per-dentry in-core data structures modified by ->iterate(), 708 - you might need something to serialize the access to them. If you 709 - do dcache pre-seeding, you'll need to switch to d_alloc_parallel() for 710 - that; look for in-tree examples. 711 - 712 - Old method is only used if the new one is absent; eventually it will 713 - be removed. Switch while you still can; the old one won't stay. 705 + If you have any per-inode or per-dentry in-core data structures modified 706 + by ->iterate_shared(), you might need something to serialize the access 707 + to them. If you do dcache pre-seeding, you'll need to switch to 708 + d_alloc_parallel() for that; look for in-tree examples. 714 709 715 710 --- 716 711 ··· 925 930 filldir_t (readdir callbacks) calling conventions have changed. Instead of 926 931 returning 0 or -E... it returns bool now. false means "no more" (as -E... used 927 932 to) and true - "keep going" (as 0 in old calling conventions). Rationale: 928 - callers never looked at specific -E... values anyway. 
->iterate() and 929 - ->iterate_shared() instance require no changes at all, all filldir_t ones in 930 - the tree converted. 933 + callers never looked at specific -E... values anyway. ->iterate_shared() 934 + instances require no changes at all, all filldir_t ones in the tree 935 + converted. 931 936 932 937 --- 933 938
+3 -2
fs/ceph/dir.c
··· 2019 2019 } 2020 2020 } 2021 2021 2022 + WRAP_DIR_ITER(ceph_readdir) // FIXME! 2022 2023 const struct file_operations ceph_dir_fops = { 2023 2024 .read = ceph_read_dir, 2024 - .iterate = ceph_readdir, 2025 + .iterate_shared = shared_ceph_readdir, 2025 2026 .llseek = ceph_dir_llseek, 2026 2027 .open = ceph_open, 2027 2028 .release = ceph_release, ··· 2034 2033 }; 2035 2034 2036 2035 const struct file_operations ceph_snapdir_fops = { 2037 - .iterate = ceph_readdir, 2036 + .iterate_shared = shared_ceph_readdir, 2038 2037 .llseek = ceph_dir_llseek, 2039 2038 .open = ceph_open, 2040 2039 .release = ceph_release,
+7 -13
fs/coda/dir.c
··· 429 429 cfi = coda_ftoc(coda_file); 430 430 host_file = cfi->cfi_container; 431 431 432 - if (host_file->f_op->iterate || host_file->f_op->iterate_shared) { 432 + if (host_file->f_op->iterate_shared) { 433 433 struct inode *host_inode = file_inode(host_file); 434 434 ret = -ENOENT; 435 435 if (!IS_DEADDIR(host_inode)) { 436 - if (host_file->f_op->iterate_shared) { 437 - inode_lock_shared(host_inode); 438 - ret = host_file->f_op->iterate_shared(host_file, ctx); 439 - file_accessed(host_file); 440 - inode_unlock_shared(host_inode); 441 - } else { 442 - inode_lock(host_inode); 443 - ret = host_file->f_op->iterate(host_file, ctx); 444 - file_accessed(host_file); 445 - inode_unlock(host_inode); 446 - } 436 + inode_lock_shared(host_inode); 437 + ret = host_file->f_op->iterate_shared(host_file, ctx); 438 + file_accessed(host_file); 439 + inode_unlock_shared(host_inode); 447 440 } 448 441 return ret; 449 442 } ··· 578 585 .setattr = coda_setattr, 579 586 }; 580 587 588 + WRAP_DIR_ITER(coda_readdir) // FIXME! 581 589 const struct file_operations coda_dir_operations = { 582 590 .llseek = generic_file_llseek, 583 591 .read = generic_read_dir, 584 - .iterate = coda_readdir, 592 + .iterate_shared = shared_coda_readdir, 585 593 .open = coda_open, 586 594 .release = coda_release, 587 595 .fsync = coda_fsync,
+2 -1
fs/exfat/dir.c
··· 306 306 return err; 307 307 } 308 308 309 + WRAP_DIR_ITER(exfat_iterate) // FIXME! 309 310 const struct file_operations exfat_dir_operations = { 310 311 .llseek = generic_file_llseek, 311 312 .read = generic_read_dir, 312 - .iterate = exfat_iterate, 313 + .iterate_shared = shared_exfat_iterate, 313 314 .unlocked_ioctl = exfat_ioctl, 314 315 #ifdef CONFIG_COMPAT 315 316 .compat_ioctl = exfat_compat_ioctl,
+1 -1
fs/exportfs/expfs.c
··· 315 315 goto out; 316 316 317 317 error = -EINVAL; 318 - if (!file->f_op->iterate && !file->f_op->iterate_shared) 318 + if (!file->f_op->iterate_shared) 319 319 goto out_close; 320 320 321 321 buffer.sequence = 0;
+2 -1
fs/jfs/namei.c
··· 1535 1535 #endif 1536 1536 }; 1537 1537 1538 + WRAP_DIR_ITER(jfs_readdir) // FIXME! 1538 1539 const struct file_operations jfs_dir_operations = { 1539 1540 .read = generic_read_dir, 1540 - .iterate = jfs_readdir, 1541 + .iterate_shared = shared_jfs_readdir, 1541 1542 .fsync = jfs_fsync, 1542 1543 .unlocked_ioctl = jfs_ioctl, 1543 1544 .compat_ioctl = compat_ptr_ioctl,
+2 -1
fs/ntfs/dir.c
··· 1525 1525 1526 1526 #endif /* NTFS_RW */ 1527 1527 1528 + WRAP_DIR_ITER(ntfs_readdir) // FIXME! 1528 1529 const struct file_operations ntfs_dir_ops = { 1529 1530 .llseek = generic_file_llseek, /* Seek inside directory. */ 1530 1531 .read = generic_read_dir, /* Return -EISDIR. */ 1531 - .iterate = ntfs_readdir, /* Read directory contents. */ 1532 + .iterate_shared = shared_ntfs_readdir, /* Read directory contents. */ 1532 1533 #ifdef NTFS_RW 1533 1534 .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ 1534 1535 #endif /* NTFS_RW */
+3 -2
fs/ocfs2/file.c
··· 2793 2793 .remap_file_range = ocfs2_remap_file_range, 2794 2794 }; 2795 2795 2796 + WRAP_DIR_ITER(ocfs2_readdir) // FIXME! 2796 2797 const struct file_operations ocfs2_dops = { 2797 2798 .llseek = generic_file_llseek, 2798 2799 .read = generic_read_dir, 2799 - .iterate = ocfs2_readdir, 2800 + .iterate_shared = shared_ocfs2_readdir, 2800 2801 .fsync = ocfs2_sync_file, 2801 2802 .release = ocfs2_dir_release, 2802 2803 .open = ocfs2_dir_open, ··· 2843 2842 const struct file_operations ocfs2_dops_no_plocks = { 2844 2843 .llseek = generic_file_llseek, 2845 2844 .read = generic_read_dir, 2846 - .iterate = ocfs2_readdir, 2845 + .iterate_shared = shared_ocfs2_readdir, 2847 2846 .fsync = ocfs2_sync_file, 2848 2847 .release = ocfs2_dir_release, 2849 2848 .open = ocfs2_dir_open,
+2 -1
fs/overlayfs/readdir.c
··· 954 954 return 0; 955 955 } 956 956 957 + WRAP_DIR_ITER(ovl_iterate) // FIXME! 957 958 const struct file_operations ovl_dir_operations = { 958 959 .read = generic_read_dir, 959 960 .open = ovl_dir_open, 960 - .iterate = ovl_iterate, 961 + .iterate_shared = shared_ovl_iterate, 961 962 .llseek = ovl_dir_llseek, 962 963 .fsync = ovl_dir_fsync, 963 964 .release = ovl_dir_release,
+52 -16
fs/readdir.c
··· 25 25 #include <asm/unaligned.h> 26 26 27 27 /* 28 + * Some filesystems were never converted to '->iterate_shared()' 29 + * and their directory iterators want the inode lock held for 30 + * writing. This wrapper allows for converting from the shared 31 + * semantics to the exclusive inode use. 32 + */ 33 + int wrap_directory_iterator(struct file *file, 34 + struct dir_context *ctx, 35 + int (*iter)(struct file *, struct dir_context *)) 36 + { 37 + struct inode *inode = file_inode(file); 38 + int ret; 39 + 40 + /* 41 + * We'd love to have an 'inode_upgrade_trylock()' operation, 42 + * see the comment in mmap_upgrade_trylock() in mm/memory.c. 43 + * 44 + * But considering this is for "filesystems that never got 45 + * converted", it really doesn't matter. 46 + * 47 + * Also note that since we have to return with the lock held 48 + * for reading, we can't use the "killable()" locking here, 49 + * since we do need to get the lock even if we're dying. 50 + * 51 + * We could do the write part killably and then get the read 52 + * lock unconditionally if it mattered, but see above on why 53 + * this does the very simplistic conversion. 54 + */ 55 + up_read(&inode->i_rwsem); 56 + down_write(&inode->i_rwsem); 57 + 58 + /* 59 + * Since we dropped the inode lock, we should do the 60 + * DEADDIR test again. See 'iterate_dir()' below. 61 + * 62 + * Note that we don't need to re-do the f_pos games, 63 + * since the file must be locked wrt f_pos anyway. 64 + */ 65 + ret = -ENOENT; 66 + if (!IS_DEADDIR(inode)) 67 + ret = iter(file, ctx); 68 + 69 + downgrade_write(&inode->i_rwsem); 70 + return ret; 71 + } 72 + EXPORT_SYMBOL(wrap_directory_iterator); 73 + 74 + /* 28 75 * Note the "unsafe_put_user() semantics: we goto a 29 76 * label for errors. 
30 77 */ ··· 87 40 int iterate_dir(struct file *file, struct dir_context *ctx) 88 41 { 89 42 struct inode *inode = file_inode(file); 90 - bool shared = false; 91 43 int res = -ENOTDIR; 92 - if (file->f_op->iterate_shared) 93 - shared = true; 94 - else if (!file->f_op->iterate) 44 + 45 + if (!file->f_op->iterate_shared) 95 46 goto out; 96 47 97 48 res = security_file_permission(file, MAY_READ); 98 49 if (res) 99 50 goto out; 100 51 101 - if (shared) 102 - res = down_read_killable(&inode->i_rwsem); 103 - else 104 - res = down_write_killable(&inode->i_rwsem); 52 + res = down_read_killable(&inode->i_rwsem); 105 53 if (res) 106 54 goto out; 107 55 108 56 res = -ENOENT; 109 57 if (!IS_DEADDIR(inode)) { 110 58 ctx->pos = file->f_pos; 111 - if (shared) 112 - res = file->f_op->iterate_shared(file, ctx); 113 - else 114 - res = file->f_op->iterate(file, ctx); 59 + res = file->f_op->iterate_shared(file, ctx); 115 60 file->f_pos = ctx->pos; 116 61 fsnotify_access(file); 117 62 file_accessed(file); 118 63 } 119 - if (shared) 120 - inode_unlock_shared(inode); 121 - else 122 - inode_unlock(inode); 64 + inode_unlock_shared(inode); 123 65 out: 124 66 return res; 125 67 }
+2 -1
fs/vboxsf/dir.c
··· 179 179 return 0; 180 180 } 181 181 182 + WRAP_DIR_ITER(vboxsf_dir_iterate) // FIXME! 182 183 const struct file_operations vboxsf_dir_fops = { 183 184 .open = vboxsf_dir_open, 184 - .iterate = vboxsf_dir_iterate, 185 + .iterate_shared = shared_vboxsf_dir_iterate, 185 186 .release = vboxsf_dir_release, 186 187 .read = generic_read_dir, 187 188 .llseek = generic_file_llseek,
+7 -1
include/linux/fs.h
··· 1780 1780 ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); 1781 1781 int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, 1782 1782 unsigned int flags); 1783 - int (*iterate) (struct file *, struct dir_context *); 1784 1783 int (*iterate_shared) (struct file *, struct dir_context *); 1785 1784 __poll_t (*poll) (struct file *, struct poll_table_struct *); 1786 1785 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); ··· 1815 1816 int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, 1816 1817 unsigned int poll_flags); 1817 1818 } __randomize_layout; 1819 + 1820 + /* Wrap a directory iterator that needs exclusive inode access */ 1821 + int wrap_directory_iterator(struct file *, struct dir_context *, 1822 + int (*) (struct file *, struct dir_context *)); 1823 + #define WRAP_DIR_ITER(x) \ 1824 + static int shared_##x(struct file *file , struct dir_context *ctx) \ 1825 + { return wrap_directory_iterator(file, ctx, x); } 1818 1826 1819 1827 struct inode_operations { 1820 1828 struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);