Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge patch series "fhandle, pidfs: allow open_by_handle_at() purely based on file handle"

Christian Brauner <brauner@kernel.org> says:

Various filesystems such as pidfs and drm support opening file handles
without having to require a file descriptor to identify the filesystem.
The filesystems are global single instances and can be trivially
identified solely on the information encoded in the file handle.

This makes it possible to not have to keep or acquire a sentinel file
descriptor just to pass it to open_by_handle_at() to identify the
filesystem. That's especially useful when such sentinel file descriptor
cannot or should not be acquired.

For pidfs this means a file handle can function as a full replacement for
storing a pid in a file. Instead a file handle can be stored and
reopened purely based on the file handle.

Such autonomous file handles can be opened with or without specifying a
file descriptor. If no proper file descriptor is used the
FD_PIDFS_ROOT sentinel must be passed. This allows us to define further
special negative fd sentinels in the future.

Userspace can trivially test for support by trying to open the file
handle with an invalid file descriptor.

* patches from https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-0-d02a04858fe3@kernel.org:
selftests/pidfd: decode pidfd file handles without having to specify an fd
fhandle, pidfs: support open_by_handle_at() purely based on file handle
uapi/fcntl: add FD_PIDFS_ROOT
uapi/fcntl: add FD_INVALID
uapi/fcntl: mark range as reserved
fhandle: reflow get_path_anchor()
pidfs: add pidfs_root_path() helper
fhandle: rename to get_path_anchor()
fhandle: hoist copy_from_user() above get_path_from_fd()
fhandle: raise FILEID_IS_DIR in handle_type

Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-0-d02a04858fe3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>

+129 -48
+33 -31
fs/fhandle.c
··· 88 88 if (fh_flags & EXPORT_FH_CONNECTABLE) { 89 89 handle->handle_type |= FILEID_IS_CONNECTABLE; 90 90 if (d_is_dir(path->dentry)) 91 - fh_flags |= FILEID_IS_DIR; 91 + handle->handle_type |= FILEID_IS_DIR; 92 92 } 93 93 retval = 0; 94 94 } ··· 168 168 return err; 169 169 } 170 170 171 - static int get_path_from_fd(int fd, struct path *root) 171 + static int get_path_anchor(int fd, struct path *root) 172 172 { 173 + if (fd >= 0) { 174 + CLASS(fd, f)(fd); 175 + if (fd_empty(f)) 176 + return -EBADF; 177 + *root = fd_file(f)->f_path; 178 + path_get(root); 179 + return 0; 180 + } 181 + 173 182 if (fd == AT_FDCWD) { 174 183 struct fs_struct *fs = current->fs; 175 184 spin_lock(&fs->lock); 176 185 *root = fs->pwd; 177 186 path_get(root); 178 187 spin_unlock(&fs->lock); 179 - } else { 180 - CLASS(fd, f)(fd); 181 - if (fd_empty(f)) 182 - return -EBADF; 183 - *root = fd_file(f)->f_path; 184 - path_get(root); 188 + return 0; 185 189 } 186 190 187 - return 0; 191 + if (fd == FD_PIDFS_ROOT) { 192 + pidfs_get_root(root); 193 + return 0; 194 + } 195 + 196 + return -EBADF; 188 197 } 189 198 190 199 static int vfs_dentry_acceptable(void *context, struct dentry *dentry) ··· 332 323 { 333 324 int retval = 0; 334 325 struct file_handle f_handle; 335 - struct file_handle *handle = NULL; 326 + struct file_handle *handle __free(kfree) = NULL; 336 327 struct handle_to_path_ctx ctx = {}; 337 328 const struct export_operations *eops; 338 329 339 - retval = get_path_from_fd(mountdirfd, &ctx.root); 330 + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) 331 + return -EFAULT; 332 + 333 + if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || 334 + (f_handle.handle_bytes == 0)) 335 + return -EINVAL; 336 + 337 + if (f_handle.handle_type < 0 || 338 + FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) 339 + return -EINVAL; 340 + 341 + retval = get_path_anchor(mountdirfd, &ctx.root); 340 342 if (retval) 341 - goto out_err; 343 + return retval; 342 344 343 345 eops = 
ctx.root.mnt->mnt_sb->s_export_op; 344 346 if (eops && eops->permission) ··· 358 338 retval = may_decode_fh(&ctx, o_flags); 359 339 if (retval) 360 340 goto out_path; 361 - 362 - if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { 363 - retval = -EFAULT; 364 - goto out_path; 365 - } 366 - if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || 367 - (f_handle.handle_bytes == 0)) { 368 - retval = -EINVAL; 369 - goto out_path; 370 - } 371 - if (f_handle.handle_type < 0 || 372 - FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) { 373 - retval = -EINVAL; 374 - goto out_path; 375 - } 376 341 377 342 handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes), 378 343 GFP_KERNEL); ··· 371 366 &ufh->f_handle, 372 367 f_handle.handle_bytes)) { 373 368 retval = -EFAULT; 374 - goto out_handle; 369 + goto out_path; 375 370 } 376 371 377 372 /* ··· 389 384 handle->handle_type &= ~FILEID_USER_FLAGS_MASK; 390 385 retval = do_handle_to_path(handle, path, &ctx); 391 386 392 - out_handle: 393 - kfree(handle); 394 387 out_path: 395 388 path_put(&ctx.root); 396 - out_err: 397 389 return retval; 398 390 } 399 391
+1
fs/internal.h
··· 353 353 unsigned int query_flags); 354 354 int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 355 355 struct iattr *attr); 356 + void pidfs_get_root(struct path *path);
+11
fs/pidfs.c
··· 31 31 static struct kmem_cache *pidfs_attr_cachep __ro_after_init; 32 32 static struct kmem_cache *pidfs_xattr_cachep __ro_after_init; 33 33 34 + static struct path pidfs_root_path = {}; 35 + 36 + void pidfs_get_root(struct path *path) 37 + { 38 + *path = pidfs_root_path; 39 + path_get(path); 40 + } 41 + 34 42 /* 35 43 * Stashes information that userspace needs to access even after the 36 44 * process has been reaped. ··· 1076 1068 pidfs_mnt = kern_mount(&pidfs_type); 1077 1069 if (IS_ERR(pidfs_mnt)) 1078 1070 panic("Failed to mount pidfs pseudo filesystem"); 1071 + 1072 + pidfs_root_path.mnt = pidfs_mnt; 1073 + pidfs_root_path.dentry = pidfs_mnt->mnt_root; 1079 1074 }
+18
include/uapi/linux/fcntl.h
··· 90 90 #define DN_ATTRIB 0x00000020 /* File changed attibutes */ 91 91 #define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ 92 92 93 + /* Reserved kernel ranges [-100], [-10000, -40000]. */ 93 94 #define AT_FDCWD -100 /* Special value for dirfd used to 94 95 indicate openat should use the 95 96 current working directory. */ 96 97 98 + /* 99 + * The concept of process and threads in userland and the kernel is a confusing 100 + * one - within the kernel every thread is a 'task' with its own individual PID, 101 + * however from userland's point of view threads are grouped by a single PID, 102 + * which is that of the 'thread group leader', typically the first thread 103 + * spawned. 104 + * 105 + * To cut the Gideon knot, for internal kernel usage, we refer to 106 + * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel 107 + * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread 108 + * group leader... 109 + */ 110 + #define PIDFD_SELF_THREAD -10000 /* Current thread. */ 111 + #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ 112 + 113 + #define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ 114 + #define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ 97 115 98 116 /* Generic flags for the *at(2) family of syscalls. */ 99 117
-15
include/uapi/linux/pidfd.h
··· 43 43 #define PIDFD_COREDUMP_ROOT (1U << 3) /* coredump was done as root. */ 44 44 45 45 /* 46 - * The concept of process and threads in userland and the kernel is a confusing 47 - * one - within the kernel every thread is a 'task' with its own individual PID, 48 - * however from userland's point of view threads are grouped by a single PID, 49 - * which is that of the 'thread group leader', typically the first thread 50 - * spawned. 51 - * 52 - * To cut the Gideon knot, for internal kernel usage, we refer to 53 - * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel 54 - * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread 55 - * group leader... 56 - */ 57 - #define PIDFD_SELF_THREAD -10000 /* Current thread. */ 58 - #define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ 59 - 60 - /* 61 46 * ...and for userland we make life simpler - PIDFD_SELF refers to the current 62 47 * thread, PIDFD_SELF_PROCESS refers to the process thread group leader. 63 48 *
+1 -1
tools/testing/selftests/pidfd/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 - CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall 2 + CFLAGS += -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -pthread -Wall 3 3 4 4 TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ 5 5 pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \
+5 -1
tools/testing/selftests/pidfd/pidfd.h
··· 19 19 #include "../kselftest.h" 20 20 #include "../clone3/clone3_selftests.h" 21 21 22 + #ifndef FD_PIDFS_ROOT 23 + #define FD_PIDFS_ROOT -10002 24 + #endif 25 + 22 26 #ifndef P_PIDFD 23 27 #define P_PIDFD 3 24 28 #endif ··· 60 56 #endif 61 57 62 58 #ifndef PIDFD_SELF_THREAD_GROUP 63 - #define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ 59 + #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ 64 60 #endif 65 61 66 62 #ifndef PIDFD_SELF
+60
tools/testing/selftests/pidfd/pidfd_file_handle_test.c
··· 500 500 ASSERT_EQ(close(pidfd), 0); 501 501 } 502 502 503 + /* 504 + * That we decode a file handle without having to pass a pidfd. 505 + */ 506 + TEST_F(file_handle, decode_purely_based_on_file_handle) 507 + { 508 + int mnt_id; 509 + struct file_handle *fh; 510 + int pidfd = -EBADF; 511 + struct stat st1, st2; 512 + 513 + fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); 514 + ASSERT_NE(fh, NULL); 515 + memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ); 516 + fh->handle_bytes = MAX_HANDLE_SZ; 517 + 518 + ASSERT_EQ(name_to_handle_at(self->child_pidfd1, "", fh, &mnt_id, AT_EMPTY_PATH), 0); 519 + 520 + ASSERT_EQ(fstat(self->child_pidfd1, &st1), 0); 521 + 522 + pidfd = open_by_handle_at(FD_PIDFS_ROOT, fh, 0); 523 + ASSERT_GE(pidfd, 0); 524 + 525 + ASSERT_EQ(fstat(pidfd, &st2), 0); 526 + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); 527 + 528 + ASSERT_EQ(close(pidfd), 0); 529 + 530 + pidfd = open_by_handle_at(FD_PIDFS_ROOT, fh, O_CLOEXEC); 531 + ASSERT_GE(pidfd, 0); 532 + 533 + ASSERT_EQ(fstat(pidfd, &st2), 0); 534 + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); 535 + 536 + ASSERT_EQ(close(pidfd), 0); 537 + 538 + pidfd = open_by_handle_at(FD_PIDFS_ROOT, fh, O_NONBLOCK); 539 + ASSERT_GE(pidfd, 0); 540 + 541 + ASSERT_EQ(fstat(pidfd, &st2), 0); 542 + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); 543 + 544 + ASSERT_EQ(close(pidfd), 0); 545 + 546 + pidfd = open_by_handle_at(self->pidfd, fh, 0); 547 + ASSERT_GE(pidfd, 0); 548 + 549 + ASSERT_EQ(fstat(pidfd, &st2), 0); 550 + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); 551 + 552 + ASSERT_EQ(close(pidfd), 0); 553 + 554 + pidfd = open_by_handle_at(-EBADF, fh, 0); 555 + ASSERT_LT(pidfd, 0); 556 + 557 + pidfd = open_by_handle_at(AT_FDCWD, fh, 0); 558 + ASSERT_LT(pidfd, 0); 559 + 560 + free(fh); 561 + } 562 + 503 563 TEST_HARNESS_MAIN