Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge patch series "cheaper MAY_EXEC handling for path lookup"

Mateusz Guzik <mjguzik@gmail.com> says:

In short, MAY_WRITE checks are elided during path lookup, which only ever
asks for MAY_EXEC on directories.

This obsoletes the idea of pre-computing whether perm checks are necessary,
as that turned out to be too hairy. The new code has 2 more branches per
path component compared to that idea, but the perf difference for
typical paths (< 6 components) was basically within noise. To be
revisited if someone(tm) removes other slowdowns.

Instead of pre-computing, I added IOP_FASTPERM_MAY_EXEC so that
filesystems which have their own ->permission hook (and thus miss out on
IOP_FASTPERM), like btrfs, can still avoid the hard work.

* patches from https://patch.msgid.link/20251107142149.989998-1-mjguzik@gmail.com:
fs: retire now stale MAY_WRITE predicts in inode_permission()
btrfs: utilize IOP_FASTPERM_MAY_EXEC
fs: speed up path lookup with cheaper handling of MAY_EXEC

Link: https://patch.msgid.link/20251107142149.989998-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>

+61 -11
+11 -1
fs/btrfs/inode.c
··· 5837 5837 if (ret) 5838 5838 return ERR_PTR(ret); 5839 5839 5840 + if (S_ISDIR(inode->vfs_inode.i_mode)) 5841 + inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC; 5840 5842 unlock_new_inode(&inode->vfs_inode); 5841 5843 return inode; 5842 5844 } ··· 6790 6788 } 6791 6789 6792 6790 ret = btrfs_create_new_inode(trans, &new_inode_args); 6793 - if (!ret) 6791 + if (!ret) { 6792 + if (S_ISDIR(inode->i_mode)) 6793 + inode->i_opflags |= IOP_FASTPERM_MAY_EXEC; 6794 6794 d_instantiate_new(dentry, inode); 6795 + } 6795 6796 6796 6797 btrfs_end_transaction(trans); 6797 6798 btrfs_btree_balance_dirty(fs_info); ··· 9174 9169 min_size, actual_len, alloc_hint, trans); 9175 9170 } 9176 9171 9172 + /* 9173 + * NOTE: in case you are adding MAY_EXEC check for directories: 9174 + * we are marking them with IOP_FASTPERM_MAY_EXEC, allowing path lookup to 9175 + * elide calls here. 9176 + */ 9177 9177 static int btrfs_permission(struct mnt_idmap *idmap, 9178 9178 struct inode *inode, int mask) 9179 9179 {
+43 -4
fs/namei.c
··· 540 540 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 541 541 * 542 542 * Separate out file-system wide checks from inode-specific permission checks. 543 + * 544 + * Note: lookup_inode_permission_may_exec() does not call here. If you add 545 + * MAY_EXEC checks, adjust it. 543 546 */ 544 547 static int sb_permission(struct super_block *sb, struct inode *inode, int mask) 545 548 { 546 - if (unlikely(mask & MAY_WRITE)) { 549 + if (mask & MAY_WRITE) { 547 550 umode_t mode = inode->i_mode; 548 551 549 552 /* Nobody gets write access to a read-only fs. */ ··· 577 574 if (unlikely(retval)) 578 575 return retval; 579 576 580 - if (unlikely(mask & MAY_WRITE)) { 577 + if (mask & MAY_WRITE) { 581 578 /* 582 579 * Nobody gets write access to an immutable file. 583 580 */ ··· 604 601 return security_inode_permission(inode, mask); 605 602 } 606 603 EXPORT_SYMBOL(inode_permission); 604 + 605 + /* 606 + * lookup_inode_permission_may_exec - Check traversal right for given inode 607 + * 608 + * This is a special case routine for may_lookup() making assumptions specific 609 + * to path traversal. Use inode_permission() if you are doing something else. 610 + * 611 + * Work is shaved off compared to inode_permission() as follows: 612 + * - we know for a fact there is no MAY_WRITE to worry about 613 + * - it is an invariant the inode is a directory 614 + * 615 + * Since majority of real-world traversal happens on inodes which grant it for 616 + * everyone, we check it upfront and only resort to more expensive work if it 617 + * fails. 618 + * 619 + * Filesystems which have their own ->permission hook and consequently miss out 620 + * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC 621 + * on their directory inodes. 
622 + */ 623 + static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap, 624 + struct inode *inode, int mask) 625 + { 626 + /* Lookup already checked this to return -ENOTDIR */ 627 + VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode); 628 + VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0); 629 + 630 + mask |= MAY_EXEC; 631 + 632 + if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC)))) 633 + return inode_permission(idmap, inode, mask); 634 + 635 + if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode))) 636 + return inode_permission(idmap, inode, mask); 637 + 638 + return security_inode_permission(inode, mask); 639 + } 607 640 608 641 /** 609 642 * path_get - get a reference to a path ··· 1894 1855 int err, mask; 1895 1856 1896 1857 mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; 1897 - err = inode_permission(idmap, nd->inode, mask | MAY_EXEC); 1858 + err = lookup_inode_permission_may_exec(idmap, nd->inode, mask); 1898 1859 if (likely(!err)) 1899 1860 return 0; 1900 1861 ··· 1909 1870 if (err != -ECHILD) // hard error 1910 1871 return err; 1911 1872 1912 - return inode_permission(idmap, nd->inode, MAY_EXEC); 1873 + return lookup_inode_permission_may_exec(idmap, nd->inode, 0); 1913 1874 } 1914 1875 1915 1876 static int reserve_stack(struct nameidata *nd, struct path *link)
+7 -6
include/linux/fs.h
··· 659 659 return (long)acl & 1; 660 660 } 661 661 662 - #define IOP_FASTPERM 0x0001 663 - #define IOP_LOOKUP 0x0002 664 - #define IOP_NOFOLLOW 0x0004 665 - #define IOP_XATTR 0x0008 662 + #define IOP_FASTPERM 0x0001 663 + #define IOP_LOOKUP 0x0002 664 + #define IOP_NOFOLLOW 0x0004 665 + #define IOP_XATTR 0x0008 666 666 #define IOP_DEFAULT_READLINK 0x0010 667 - #define IOP_MGTIME 0x0020 668 - #define IOP_CACHED_LINK 0x0040 667 + #define IOP_MGTIME 0x0020 668 + #define IOP_CACHED_LINK 0x0040 669 + #define IOP_FASTPERM_MAY_EXEC 0x0080 669 670 670 671 /* 671 672 * Inode state bits. Protected by inode->i_lock