Merge patch series "cheaper MAY_EXEC handling for path lookup"

Mateusz Guzik <mjguzik@gmail.com> says:

In short, MAY_WRITE checks are elided.

This obsoletes the idea of pre-computing if perm checks are necessary as
that turned out to be too hairy. The new code has 2 more branches per
path component compared to that idea, but the perf difference for
typical paths (< 6 components) was basically within noise. To be
revisited if someone(tm) removes other slowdowns.

Instead of the pre-computing thing I added IOP_FASTPERM_MAY_EXEC so that
filesystems like btrfs can still avoid the hard work.

* patches from https://patch.msgid.link/20251107142149.989998-1-mjguzik@gmail.com:
  fs: retire now stale MAY_WRITE predicts in inode_permission()
  btrfs: utilize IOP_FASTPERM_MAY_EXEC
  fs: speed up path lookup with cheaper handling of MAY_EXEC

Link: https://patch.msgid.link/20251107142149.989998-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
Christian Brauner 2025-11-11 10:38:44 +01:00
commit 04f0955b60
No known key found for this signature in database
GPG Key ID: 91C61BC06578DCA2
3 changed files with 61 additions and 11 deletions

View File

@ -5837,6 +5837,8 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
if (ret) if (ret)
return ERR_PTR(ret); return ERR_PTR(ret);
if (S_ISDIR(inode->vfs_inode.i_mode))
inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC;
unlock_new_inode(&inode->vfs_inode); unlock_new_inode(&inode->vfs_inode);
return inode; return inode;
} }
@ -6788,8 +6790,11 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
} }
ret = btrfs_create_new_inode(trans, &new_inode_args); ret = btrfs_create_new_inode(trans, &new_inode_args);
if (!ret) if (!ret) {
if (S_ISDIR(inode->i_mode))
inode->i_opflags |= IOP_FASTPERM_MAY_EXEC;
d_instantiate_new(dentry, inode); d_instantiate_new(dentry, inode);
}
btrfs_end_transaction(trans); btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info); btrfs_btree_balance_dirty(fs_info);
@ -9169,6 +9174,11 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
min_size, actual_len, alloc_hint, trans); min_size, actual_len, alloc_hint, trans);
} }
/*
* NOTE: in case you are adding MAY_EXEC check for directories:
* we are marking them with IOP_FASTPERM_MAY_EXEC, allowing path lookup to
* elide calls here.
*/
static int btrfs_permission(struct mnt_idmap *idmap, static int btrfs_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask) struct inode *inode, int mask)
{ {

View File

@ -540,10 +540,13 @@ static inline int do_inode_permission(struct mnt_idmap *idmap,
* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
* *
* Separate out file-system wide checks from inode-specific permission checks. * Separate out file-system wide checks from inode-specific permission checks.
*
* Note: lookup_inode_permission_may_exec() does not call here. If you add
* MAY_EXEC checks, adjust it.
*/ */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask) static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{ {
if (unlikely(mask & MAY_WRITE)) { if (mask & MAY_WRITE) {
umode_t mode = inode->i_mode; umode_t mode = inode->i_mode;
/* Nobody gets write access to a read-only fs. */ /* Nobody gets write access to a read-only fs. */
@ -574,7 +577,7 @@ int inode_permission(struct mnt_idmap *idmap,
if (unlikely(retval)) if (unlikely(retval))
return retval; return retval;
if (unlikely(mask & MAY_WRITE)) { if (mask & MAY_WRITE) {
/* /*
* Nobody gets write access to an immutable file. * Nobody gets write access to an immutable file.
*/ */
@ -602,6 +605,42 @@ int inode_permission(struct mnt_idmap *idmap,
} }
EXPORT_SYMBOL(inode_permission); EXPORT_SYMBOL(inode_permission);
/*
* lookup_inode_permission_may_exec - Check traversal right for given inode
*
* idmap: idmap passed through to inode_permission() on the slow path
* inode: inode to check; must be a directory (asserted below)
* mask: either 0 or MAY_NOT_BLOCK, anything else trips the assertion below;
*       MAY_EXEC is added internally
*
* This is a special case routine for may_lookup() making assumptions specific
* to path traversal. Use inode_permission() if you are doing something else.
*
* Work is shaved off compared to inode_permission() as follows:
* - we know for a fact there is no MAY_WRITE to worry about
* - it is an invariant the inode is a directory
*
* Since the majority of real-world traversal happens on inodes which grant it
* for everyone, we check it upfront and only resort to more expensive work if
* it fails.
*
* Filesystems which have their own ->permission hook and consequently miss out
* on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC
* on their directory inodes.
*
* Returns the result of inode_permission() when the fast path does not apply,
* otherwise the result of security_inode_permission().
*/
static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap,
struct inode *inode, int mask)
{
/* Lookup already checked this to return -ENOTDIR */
VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode);
/* Callers may only pass MAY_NOT_BLOCK; MAY_EXEC is implied and added here */
VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0);
mask |= MAY_EXEC;
/* Neither fast-path flag is set on this inode: do the full check */
if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC))))
return inode_permission(idmap, inode, mask);
/*
 * The shortcut only applies when exec is granted to owner, group and other
 * (all of 0111) and no_acl_inode() holds; otherwise do the full check.
 */
if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode)))
return inode_permission(idmap, inode, mask);
/* Mode bits grant traversal to everyone; the security hook is still consulted */
return security_inode_permission(inode, mask);
}
/** /**
* path_get - get a reference to a path * path_get - get a reference to a path
* @path: path to get the reference to * @path: path to get the reference to
@ -1855,7 +1894,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
int err, mask; int err, mask;
mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
err = inode_permission(idmap, nd->inode, mask | MAY_EXEC); err = lookup_inode_permission_may_exec(idmap, nd->inode, mask);
if (likely(!err)) if (likely(!err))
return 0; return 0;
@ -1870,7 +1909,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
if (err != -ECHILD) // hard error if (err != -ECHILD) // hard error
return err; return err;
return inode_permission(idmap, nd->inode, MAY_EXEC); return lookup_inode_permission_may_exec(idmap, nd->inode, 0);
} }
static int reserve_stack(struct nameidata *nd, struct path *link) static int reserve_stack(struct nameidata *nd, struct path *link)

View File

@ -666,6 +666,7 @@ is_uncached_acl(struct posix_acl *acl)
#define IOP_DEFAULT_READLINK 0x0010 #define IOP_DEFAULT_READLINK 0x0010
#define IOP_MGTIME 0x0020 #define IOP_MGTIME 0x0020
#define IOP_CACHED_LINK 0x0040 #define IOP_CACHED_LINK 0x0040
#define IOP_FASTPERM_MAY_EXEC 0x0080
/* /*
* Inode state bits. Protected by inode->i_lock * Inode state bits. Protected by inode->i_lock