fs/pidfs.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / pidfs.c
at master 30 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2#include <linux/anon_inodes.h>
   3#include <linux/exportfs.h>
   4#include <linux/file.h>
   5#include <linux/fs.h>
   6#include <linux/cgroup.h>
   7#include <linux/magic.h>
   8#include <linux/mount.h>
   9#include <linux/pid.h>
  10#include <linux/pidfs.h>
  11#include <linux/pid_namespace.h>
  12#include <linux/poll.h>
  13#include <linux/proc_fs.h>
  14#include <linux/proc_ns.h>
  15#include <linux/pseudo_fs.h>
  16#include <linux/ptrace.h>
  17#include <linux/seq_file.h>
  18#include <uapi/linux/pidfd.h>
  19#include <linux/ipc_namespace.h>
  20#include <linux/time_namespace.h>
  21#include <linux/utsname.h>
  22#include <net/net_namespace.h>
  23#include <linux/coredump.h>
  24#include <linux/xattr.h>
  25
  26#include "internal.h"
  27#include "mount.h"
  28
/*
 * Sentinel stored in pid->attr by pidfs_exit() when a task exits without
 * ever having been referred to by a pidfd. pidfs_register_pid() refuses to
 * register such a struct pid and returns the encoded error (-ESRCH).
 */
#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)

/* Slab caches backing struct pidfs_attr and struct simple_xattrs; created in pidfs_init(). */
static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;

/* Mount and root dentry of the internal pidfs instance; filled in by pidfs_init(). */
static struct path pidfs_root_path = {};
  35
  36void pidfs_get_root(struct path *path)
  37{
  38	*path = pidfs_root_path;
  39	path_get(path);
  40}
  41
/* Bit numbers used in pidfs_attr::attr_mask to flag which information is valid. */
enum pidfs_attr_mask_bits {
	PIDFS_ATTR_BIT_EXIT	= 0,	/* exit info recorded by pidfs_exit() */
	PIDFS_ATTR_BIT_COREDUMP	= 1,	/* coredump info recorded by pidfs_coredump() */
};
  46
/*
 * Per struct pid state kept by pidfs. It is allocated in
 * pidfs_register_pid() and released in pidfs_free_pid().
 */
struct pidfs_attr {
	unsigned long attr_mask;	/* PIDFS_ATTR_BIT_* flags */
	struct simple_xattrs *xattrs;	/* allocated on first pidfs_xattr_set() */
	struct /* exit info */ {
		__u64 cgroupid;
		__s32 exit_code;
	};
	__u32 coredump_mask;	/* PIDFD_COREDUMP_* flags, see pidfs_coredump() */
	__u32 coredump_signal;	/* si_signo of the signal that caused the coredump */
};
  57
/* Tree of struct pid keyed by pid->ino; see pidfs_add_pid() and pidfs_ino_get_pid(). */
static struct rb_root pidfs_ino_tree = RB_ROOT;
  59
  60#if BITS_PER_LONG == 32
  61static inline unsigned long pidfs_ino(u64 ino)
  62{
  63	return lower_32_bits(ino);
  64}
  65
  66/* On 32 bit the generation number are the upper 32 bits. */
  67static inline u32 pidfs_gen(u64 ino)
  68{
  69	return upper_32_bits(ino);
  70}
  71
  72#else
  73
  74/* On 64 bit simply return ino. */
  75static inline unsigned long pidfs_ino(u64 ino)
  76{
  77	return ino;
  78}
  79
  80/* On 64 bit the generation number is 0. */
  81static inline u32 pidfs_gen(u64 ino)
  82{
  83	return 0;
  84}
  85#endif
  86
  87static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
  88{
  89	struct pid *pid_a = rb_entry(a, struct pid, pidfs_node);
  90	struct pid *pid_b = rb_entry(b, struct pid, pidfs_node);
  91	u64 pid_ino_a = pid_a->ino;
  92	u64 pid_ino_b = pid_b->ino;
  93
  94	if (pid_ino_a < pid_ino_b)
  95		return -1;
  96	if (pid_ino_a > pid_ino_b)
  97		return 1;
  98	return 0;
  99}
 100
 101void pidfs_add_pid(struct pid *pid)
 102{
 103	static u64 pidfs_ino_nr = 2;
 104
 105	/*
 106	 * On 64 bit nothing special happens. The 64bit number assigned
 107	 * to struct pid is the inode number.
 108	 *
 109	 * On 32 bit the 64 bit number assigned to struct pid is split
 110	 * into two 32 bit numbers. The lower 32 bits are used as the
 111	 * inode number and the upper 32 bits are used as the inode
 112	 * generation number.
 113	 *
 114	 * On 32 bit pidfs_ino() will return the lower 32 bit. When
 115	 * pidfs_ino() returns zero a wrap around happened. When a
 116	 * wraparound happens the 64 bit number will be incremented by 2
 117	 * so inode numbering starts at 2 again.
 118	 *
 119	 * On 64 bit comparing two pidfds is as simple as comparing
 120	 * inode numbers.
 121	 *
 122	 * When a wraparound happens on 32 bit multiple pidfds with the
 123	 * same inode number are likely to exist (This isn't a problem
 124	 * since before pidfs pidfds used the anonymous inode meaning
 125	 * all pidfds had the same inode number.). Userspace can
 126	 * reconstruct the 64 bit identifier by retrieving both the
 127	 * inode number and the inode generation number to compare or
 128	 * use file handles.
 129	 */
 130	if (pidfs_ino(pidfs_ino_nr) == 0)
 131		pidfs_ino_nr += 2;
 132
 133	pid->ino = pidfs_ino_nr;
 134	pid->stashed = NULL;
 135	pid->attr = NULL;
 136	pidfs_ino_nr++;
 137
 138	write_seqcount_begin(&pidmap_lock_seq);
 139	rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
 140	write_seqcount_end(&pidmap_lock_seq);
 141}
 142
/*
 * pidfs_remove_pid - unlink @pid from the inode number tree
 * @pid: struct pid to remove
 *
 * The erase is bracketed by a pidmap_lock_seq write section so that the
 * seqcount based lookup in pidfs_ino_get_pid() can notice the modification.
 */
void pidfs_remove_pid(struct pid *pid)
{
	write_seqcount_begin(&pidmap_lock_seq);
	rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
	write_seqcount_end(&pidmap_lock_seq);
}
 149
/*
 * pidfs_free_pid - release the pidfs state attached to @pid
 * @pid: struct pid that is being freed
 *
 * Takes over pid->attr and, if present, the xattrs hanging off it. Both
 * locals are annotated with __free(kfree), so whatever they hold is handed
 * to the kfree cleanup helper when the function returns.
 * NOTE(review): presumably that helper ignores NULL and ERR_PTR values such
 * as PIDFS_PID_DEAD - confirm against the cleanup definition.
 */
void pidfs_free_pid(struct pid *pid)
{
	struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
	struct simple_xattrs *xattrs __free(kfree) = NULL;

	/*
	 * Any dentry must've been wiped from the pid by now.
	 * Otherwise there's a reference count bug.
	 */
	VFS_WARN_ON_ONCE(pid->stashed);

	/*
	 * This happens if an error occurred during e.g., task creation
	 * that causes us to never go through the exit path.
	 */
	if (unlikely(!attr))
		return;

	/* This never had a pidfd created. */
	if (IS_ERR(attr))
		return;

	/* Drop the individual xattr entries; the container itself is freed on return. */
	xattrs = no_free_ptr(attr->xattrs);
	if (xattrs)
		simple_xattrs_free(xattrs, NULL);
}
 176
 177#ifdef CONFIG_PROC_FS
 178/**
 179 * pidfd_show_fdinfo - print information about a pidfd
 180 * @m: proc fdinfo file
 181 * @f: file referencing a pidfd
 182 *
 183 * Pid:
 184 * This function will print the pid that a given pidfd refers to in the
 185 * pid namespace of the procfs instance.
 186 * If the pid namespace of the process is not a descendant of the pid
 187 * namespace of the procfs instance 0 will be shown as its pid. This is
 188 * similar to calling getppid() on a process whose parent is outside of
 189 * its pid namespace.
 190 *
 191 * NSpid:
 192 * If pid namespaces are supported then this function will also print
 193 * the pid of a given pidfd refers to for all descendant pid namespaces
 194 * starting from the current pid namespace of the instance, i.e. the
 195 * Pid field and the first entry in the NSpid field will be identical.
 196 * If the pid namespace of the process is not a descendant of the pid
 197 * namespace of the procfs instance 0 will be shown as its first NSpid
 198 * entry and no others will be shown.
 199 * Note that this differs from the Pid and NSpid fields in
 200 * /proc/<pid>/status where Pid and NSpid are always shown relative to
 201 * the  pid namespace of the procfs instance. The difference becomes
 202 * obvious when sending around a pidfd between pid namespaces from a
 203 * different branch of the tree, i.e. where no ancestral relation is
 204 * present between the pid namespaces:
 205 * - create two new pid namespaces ns1 and ns2 in the initial pid
 206 *   namespace (also take care to create new mount namespaces in the
 207 *   new pid namespace and mount procfs)
 208 * - create a process with a pidfd in ns1
 209 * - send pidfd from ns1 to ns2
 210 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
 211 *   have exactly one entry, which is 0
 212 */
 213static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 214{
 215	struct pid *pid = pidfd_pid(f);
 216	struct pid_namespace *ns;
 217	pid_t nr = -1;
 218
 219	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
 220		ns = proc_pid_ns(file_inode(m->file)->i_sb);
 221		nr = pid_nr_ns(pid, ns);
 222	}
 223
 224	seq_put_decimal_ll(m, "Pid:\t", nr);
 225
 226#ifdef CONFIG_PID_NS
 227	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
 228	if (nr > 0) {
 229		int i;
 230
 231		/* If nr is non-zero it means that 'pid' is valid and that
 232		 * ns, i.e. the pid namespace associated with the procfs
 233		 * instance, is in the pid namespace hierarchy of pid.
 234		 * Start at one below the already printed level.
 235		 */
 236		for (i = ns->level + 1; i <= pid->level; i++)
 237			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
 238	}
 239#endif
 240	seq_putc(m, '\n');
 241}
 242#endif
 243
 244/*
 245 * Poll support for process exit notification.
 246 */
 247static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 248{
 249	struct pid *pid = pidfd_pid(file);
 250	struct task_struct *task;
 251	__poll_t poll_flags = 0;
 252
 253	poll_wait(file, &pid->wait_pidfd, pts);
 254	/*
 255	 * Don't wake waiters if the thread-group leader exited
 256	 * prematurely. They either get notified when the last subthread
 257	 * exits or not at all if one of the remaining subthreads execs
 258	 * and assumes the struct pid of the old thread-group leader.
 259	 */
 260	guard(rcu)();
 261	task = pid_task(pid, PIDTYPE_PID);
 262	if (!task)
 263		poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
 264	else if (task->exit_state && !delay_group_leader(task))
 265		poll_flags = EPOLLIN | EPOLLRDNORM;
 266
 267	return poll_flags;
 268}
 269
 270static inline bool pid_in_current_pidns(const struct pid *pid)
 271{
 272	const struct pid_namespace *ns = task_active_pid_ns(current);
 273
 274	if (ns->level <= pid->level)
 275		return pid->numbers[ns->level].ns == ns;
 276
 277	return false;
 278}
 279
 280static __u32 pidfs_coredump_mask(unsigned long mm_flags)
 281{
 282	switch (__get_dumpable(mm_flags)) {
 283	case SUID_DUMP_USER:
 284		return PIDFD_COREDUMP_USER;
 285	case SUID_DUMP_ROOT:
 286		return PIDFD_COREDUMP_ROOT;
 287	case SUID_DUMP_DISABLE:
 288		return PIDFD_COREDUMP_SKIP;
 289	default:
 290		WARN_ON_ONCE(true);
 291	}
 292
 293	return 0;
 294}
 295
/*
 * All PIDFD_INFO_* flags pidfd_info() knows about. It is reported to
 * userspace as pidfd_info::supported_mask and used to sanity check the
 * returned mask.
 *
 * This must be updated whenever a new flag is added.
 */
#define PIDFD_INFO_SUPPORTED (PIDFD_INFO_PID | \
			      PIDFD_INFO_CREDS | \
			      PIDFD_INFO_CGROUPID | \
			      PIDFD_INFO_EXIT | \
			      PIDFD_INFO_COREDUMP | \
			      PIDFD_INFO_SUPPORTED_MASK | \
			      PIDFD_INFO_COREDUMP_SIGNAL)
 304
/*
 * pidfd_info - handle the extensible PIDFD_GET_INFO ioctl
 * @file: the pidfd
 * @cmd: ioctl command; the size of the userspace struct is encoded in it
 * @arg: userspace pointer to a struct pidfd_info
 *
 * Reads the request mask from userspace, collects the recorded exit and
 * coredump information and, if the task still exists, its credentials,
 * cgroup id and pid numbers, then copies the result back.
 *
 * Return: the result of copy_struct_to_user() on the regular path. -EINVAL
 * if @arg is NULL or the userspace struct is smaller than the first version,
 * -EFAULT if the mask cannot be read, -ESRCH if the pid is outside of the
 * caller's pid namespace hierarchy, if the task has been reaped and no exit
 * information was requested, or if pid/tgid cannot be translated.
 */
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
	struct task_struct *task __free(put_task) = NULL;
	struct pid *pid = pidfd_pid(file);
	size_t usize = _IOC_SIZE(cmd);
	struct pidfd_info kinfo = {};
	struct user_namespace *user_ns;
	struct pidfs_attr *attr;
	const struct cred *c;
	__u64 mask;

	BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER2);

	if (!uinfo)
		return -EINVAL;
	if (usize < PIDFD_INFO_SIZE_VER0)
		return -EINVAL; /* First version, no smaller struct possible */

	if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
		return -EFAULT;

	/*
	 * Restrict information retrieval to tasks within the caller's pid
	 * namespace hierarchy.
	 */
	if (!pid_in_current_pidns(pid))
		return -ESRCH;

	attr = READ_ONCE(pid->attr);
	if (mask & PIDFD_INFO_EXIT) {
		if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) {
			/* Pairs with the smp_wmb() in pidfs_exit(). */
			smp_rmb();
			kinfo.mask |= PIDFD_INFO_EXIT;
#ifdef CONFIG_CGROUPS
			kinfo.cgroupid = attr->cgroupid;
			kinfo.mask |= PIDFD_INFO_CGROUPID;
#endif
			kinfo.exit_code = attr->exit_code;
		}
	}

	if (mask & PIDFD_INFO_COREDUMP) {
		if (test_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask)) {
			/* Pairs with the smp_wmb() in pidfs_coredump(). */
			smp_rmb();
			kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL;
			kinfo.coredump_mask = attr->coredump_mask;
			kinfo.coredump_signal = attr->coredump_signal;
		}
	}

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task) {
		/*
		 * If the task has already been reaped, only exit
		 * information is available
		 */
		if (!(mask & PIDFD_INFO_EXIT))
			return -ESRCH;

		goto copy_out;
	}

	c = get_task_cred(task);
	if (!c)
		return -ESRCH;

	/* No coredump was recorded above: report the current dumpable setting instead. */
	if ((mask & PIDFD_INFO_COREDUMP) && !kinfo.coredump_mask) {
		guard(task_lock)(task);
		if (task->mm) {
			unsigned long flags = __mm_flags_get_dumpable(task->mm);

			kinfo.coredump_mask = pidfs_coredump_mask(flags);
			kinfo.mask |= PIDFD_INFO_COREDUMP;
			/* No coredump actually took place, so no coredump signal. */
		}
	}

	/* Unconditionally return identifiers and credentials, the rest only on request */

	user_ns = current_user_ns();
	kinfo.ruid = from_kuid_munged(user_ns, c->uid);
	kinfo.rgid = from_kgid_munged(user_ns, c->gid);
	kinfo.euid = from_kuid_munged(user_ns, c->euid);
	kinfo.egid = from_kgid_munged(user_ns, c->egid);
	kinfo.suid = from_kuid_munged(user_ns, c->suid);
	kinfo.sgid = from_kgid_munged(user_ns, c->sgid);
	kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid);
	kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid);
	kinfo.mask |= PIDFD_INFO_CREDS;
	put_cred(c);

#ifdef CONFIG_CGROUPS
	/* Only look up the live cgroup if the exit path did not record one. */
	if (!kinfo.cgroupid) {
		struct cgroup *cgrp;

		rcu_read_lock();
		cgrp = task_dfl_cgroup(task);
		kinfo.cgroupid = cgroup_id(cgrp);
		kinfo.mask |= PIDFD_INFO_CGROUPID;
		rcu_read_unlock();
	}
#endif

	/*
	 * Copy pid/tgid last, to reduce the chances the information might be
	 * stale. Note that it is not possible to ensure it will be valid as the
	 * task might return as soon as the copy_to_user finishes, but that's ok
	 * and userspace expects that might happen and can act accordingly, so
	 * this is just best-effort. What we can do however is checking that all
	 * the fields are set correctly, or return ESRCH to avoid providing
	 * incomplete information. */

	kinfo.ppid = task_ppid_nr_ns(task, NULL);
	kinfo.tgid = task_tgid_vnr(task);
	kinfo.pid = task_pid_vnr(task);
	kinfo.mask |= PIDFD_INFO_PID;

	if (kinfo.pid == 0 || kinfo.tgid == 0)
		return -ESRCH;

copy_out:
	/* Tell userspace which flags this kernel supports, if asked. */
	if (mask & PIDFD_INFO_SUPPORTED_MASK) {
		kinfo.mask |= PIDFD_INFO_SUPPORTED_MASK;
		kinfo.supported_mask = PIDFD_INFO_SUPPORTED;
	}

	/* Are there bits in the return mask not present in PIDFD_INFO_SUPPORTED? */
	WARN_ON_ONCE(~PIDFD_INFO_SUPPORTED & kinfo.mask);
	/*
	 * If userspace and the kernel have the same struct size it can just
	 * be copied. If userspace provides an older struct, only the bits that
	 * userspace knows about will be copied. If userspace provides a new
	 * struct, only the bits that the kernel knows about will be copied.
	 */
	return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL);
}
 442
 443static bool pidfs_ioctl_valid(unsigned int cmd)
 444{
 445	switch (cmd) {
 446	case FS_IOC_GETVERSION:
 447	case PIDFD_GET_CGROUP_NAMESPACE:
 448	case PIDFD_GET_IPC_NAMESPACE:
 449	case PIDFD_GET_MNT_NAMESPACE:
 450	case PIDFD_GET_NET_NAMESPACE:
 451	case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
 452	case PIDFD_GET_TIME_NAMESPACE:
 453	case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
 454	case PIDFD_GET_UTS_NAMESPACE:
 455	case PIDFD_GET_USER_NAMESPACE:
 456	case PIDFD_GET_PID_NAMESPACE:
 457		return true;
 458	}
 459
 460	/* Extensible ioctls require some more careful checks. */
 461	switch (_IOC_NR(cmd)) {
 462	case _IOC_NR(PIDFD_GET_INFO):
 463		/*
 464		 * Try to prevent performing a pidfd ioctl when someone
 465		 * erronously mistook the file descriptor for a pidfd.
 466		 * This is not perfect but will catch most cases.
 467		 */
 468		return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0);
 469	}
 470
 471	return false;
 472}
 473
/*
 * pidfd_ioctl - ioctl entry point for pidfds
 * @file: the pidfd
 * @cmd: ioctl command
 * @arg: ioctl argument; must be 0 for the namespace commands
 *
 * Handles FS_IOC_GETVERSION and PIDFD_GET_INFO directly. All remaining
 * commands open a file descriptor for one of the namespaces of the task the
 * pidfd refers to.
 *
 * Return: the result of open_namespace() for namespace commands, the result
 * of put_user()/pidfd_info() for the other two. -ENOIOCTLCMD for unknown
 * commands, -EINVAL for an invalid @arg, -ESRCH if the task or its nsproxy
 * is gone, -EACCES if the ptrace access check fails and -EOPNOTSUPP if no
 * namespace reference could be obtained (e.g. the namespace type is not
 * configured).
 */
static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct task_struct *task __free(put_task) = NULL;
	struct nsproxy *nsp __free(put_nsproxy) = NULL;
	struct ns_common *ns_common = NULL;

	if (!pidfs_ioctl_valid(cmd))
		return -ENOIOCTLCMD;

	if (cmd == FS_IOC_GETVERSION) {
		if (!arg)
			return -EINVAL;

		__u32 __user *argp = (__u32 __user *)arg;
		return put_user(file_inode(file)->i_generation, argp);
	}

	/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
	if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
		return pidfd_info(file, cmd, arg);

	task = get_pid_task(pidfd_pid(file), PIDTYPE_PID);
	if (!task)
		return -ESRCH;

	if (arg)
		return -EINVAL;

	/* Pin the task's nsproxy while holding the task lock. */
	scoped_guard(task_lock, task) {
		nsp = task->nsproxy;
		if (nsp)
			get_nsproxy(nsp);
	}
	if (!nsp)
		return -ESRCH; /* just pretend it didn't exist */

	/*
	 * We're trying to open a file descriptor to the namespace so perform a
	 * filesystem cred ptrace check. Also, we mirror nsfs behavior.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
		return -EACCES;

	/*
	 * In every case below ns_common stays NULL when the namespace type
	 * is compiled out or no reference could be taken.
	 */
	switch (cmd) {
	/* Namespaces that hang off nsproxy. */
	case PIDFD_GET_CGROUP_NAMESPACE:
#ifdef CONFIG_CGROUPS
		if (!ns_ref_get(nsp->cgroup_ns))
			break;
		ns_common = to_ns_common(nsp->cgroup_ns);
#endif
		break;
	case PIDFD_GET_IPC_NAMESPACE:
#ifdef CONFIG_IPC_NS
		if (!ns_ref_get(nsp->ipc_ns))
			break;
		ns_common = to_ns_common(nsp->ipc_ns);
#endif
		break;
	case PIDFD_GET_MNT_NAMESPACE:
		if (!ns_ref_get(nsp->mnt_ns))
			break;
		ns_common = to_ns_common(nsp->mnt_ns);
		break;
	case PIDFD_GET_NET_NAMESPACE:
#ifdef CONFIG_NET_NS
		if (!ns_ref_get(nsp->net_ns))
			break;
		ns_common = to_ns_common(nsp->net_ns);
#endif
		break;
	case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
#ifdef CONFIG_PID_NS
		if (!ns_ref_get(nsp->pid_ns_for_children))
			break;
		ns_common = to_ns_common(nsp->pid_ns_for_children);
#endif
		break;
	case PIDFD_GET_TIME_NAMESPACE:
#ifdef CONFIG_TIME_NS
		if (!ns_ref_get(nsp->time_ns))
			break;
		ns_common = to_ns_common(nsp->time_ns);
#endif
		break;
	case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
#ifdef CONFIG_TIME_NS
		if (!ns_ref_get(nsp->time_ns_for_children))
			break;
		ns_common = to_ns_common(nsp->time_ns_for_children);
#endif
		break;
	case PIDFD_GET_UTS_NAMESPACE:
#ifdef CONFIG_UTS_NS
		if (!ns_ref_get(nsp->uts_ns))
			break;
		ns_common = to_ns_common(nsp->uts_ns);
#endif
		break;
	/* Namespaces that don't hang off nsproxy. */
	case PIDFD_GET_USER_NAMESPACE:
#ifdef CONFIG_USER_NS
		scoped_guard(rcu) {
			struct user_namespace *user_ns;

			user_ns = task_cred_xxx(task, user_ns);
			if (!ns_ref_get(user_ns))
				break;
			ns_common = to_ns_common(user_ns);
		}
#endif
		break;
	case PIDFD_GET_PID_NAMESPACE:
#ifdef CONFIG_PID_NS
		scoped_guard(rcu) {
			struct pid_namespace *pid_ns;

			pid_ns = task_active_pid_ns(task);
			if (!ns_ref_get(pid_ns))
				break;
			ns_common = to_ns_common(pid_ns);
		}
#endif
		break;
	default:
		return -ENOIOCTLCMD;
	}

	if (!ns_common)
		return -EOPNOTSUPP;

	/* open_namespace() unconditionally consumes the reference */
	return open_namespace(ns_common);
}
 608
/*
 * File operations of a pidfd. The address of this table is also what
 * pidfd_pid() uses to recognize a file as a pidfd.
 */
static const struct file_operations pidfs_file_operations = {
	.poll		= pidfd_poll,
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= pidfd_show_fdinfo,
#endif
	.unlocked_ioctl	= pidfd_ioctl,
	.compat_ioctl   = compat_ptr_ioctl,
};
 617
 618struct pid *pidfd_pid(const struct file *file)
 619{
 620	if (file->f_op != &pidfs_file_operations)
 621		return ERR_PTR(-EBADF);
 622	return file_inode(file)->i_private;
 623}
 624
/*
 * pidfs_exit - record exit information for an exiting task
 * @tsk: the exiting task
 *
 * We're called from release_task(). We know there's at least one
 * reference to struct pid being held that won't be released until the
 * task has been reaped which cannot happen until we're out of
 * release_task().
 *
 * If this struct pid has at least once been referred to by a pidfd then
 * pid->attr will be allocated. If not we mark the struct pid as dead so
 * anyone who is trying to register it with pidfs will fail to do so.
 * Otherwise we would hand out pidfs for reaped tasks without having
 * exit information available.
 *
 * Worst case is that we've filled in the info and the pid gets freed
 * right away in free_pid() when no one holds a pidfd anymore. Since
 * pidfs_exit() currently is placed after exit_task_work() we know that
 * it cannot be us aka the exiting task holding a pidfd to itself.
 */
void pidfs_exit(struct task_struct *tsk)
{
	struct pid *pid = task_pid(tsk);
	struct pidfs_attr *attr;
#ifdef CONFIG_CGROUPS
	struct cgroup *cgrp;
#endif

	might_sleep();

	/* Synchronize with pidfs_register_pid(). */
	scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) {
		attr = pid->attr;
		if (!attr) {
			/*
			 * No one ever held a pidfd for this struct pid.
			 * Mark it as dead so no one can add a pidfs
			 * entry anymore. We're about to be reaped and
			 * so no exit information would be available.
			 */
			pid->attr = PIDFS_PID_DEAD;
			return;
		}
	}

	/*
	 * If @pid->attr is set someone might still legitimately hold a
	 * pidfd to @pid or someone might concurrently still be getting
	 * a reference to an already stashed dentry from @pid->stashed.
	 * So defer cleaning @pid->attr until the last reference to @pid
	 * is put
	 */

#ifdef CONFIG_CGROUPS
	rcu_read_lock();
	cgrp = task_dfl_cgroup(tsk);
	attr->cgroupid = cgroup_id(cgrp);
	rcu_read_unlock();
#endif
	attr->exit_code = tsk->exit_code;

	/*
	 * Ensure that PIDFD_GET_INFO sees either all or nothing: the data
	 * is written before the bit is set, pairing with the smp_rmb()
	 * after the test_bit() in pidfd_info().
	 */
	smp_wmb();
	set_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask);
}
 687
#ifdef CONFIG_COREDUMP
/*
 * pidfs_coredump - record coredump information for a struct pid
 * @cprm: coredump parameters; ->pid, ->mm_flags and ->siginfo are used
 *
 * Stores how the task was coredumped and which signal caused it in
 * pid->attr and then publishes the data by setting
 * PIDFS_ATTR_BIT_COREDUMP.
 */
void pidfs_coredump(const struct coredump_params *cprm)
{
	struct pid *pid = cprm->pid;
	struct pidfs_attr *attr;

	attr = READ_ONCE(pid->attr);

	/* A valid attr is expected here; anything else only triggers a warning. */
	VFS_WARN_ON_ONCE(!attr);
	VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);

	/* Note how we were coredumped and that we coredumped. */
	attr->coredump_mask = pidfs_coredump_mask(cprm->mm_flags) |
			      PIDFD_COREDUMPED;
	/* If coredumping is set to skip we should never end up here. */
	VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP);
	/* Expose the signal number that caused the coredump. */
	attr->coredump_signal = cprm->siginfo->si_signo;
	/* Publish the data before the bit; pairs with the smp_rmb() in pidfd_info(). */
	smp_wmb();
	set_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask);
}
#endif
 710
/* Internal kernel mount of pidfs, set up once in pidfs_init(). */
static struct vfsmount *pidfs_mnt __ro_after_init;
 712
/*
 * The vfs falls back to simple_setattr() if i_op->setattr() isn't
 * implemented. Let's reject it completely until we have a clean
 * permission concept for pidfds.
 *
 * NOTE(review): the request is forwarded unchanged to anon_inode_setattr();
 * presumably that helper performs the rejection - confirm, it is not
 * visible in this file.
 */
static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			 struct iattr *attr)
{
	return anon_inode_setattr(idmap, dentry, attr);
}
 723
/* ->getattr() for pidfs inodes: forwards all arguments to anon_inode_getattr(). */
static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
			 struct kstat *stat, u32 request_mask,
			 unsigned int query_flags)
{
	return anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
}
 730
 731static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size)
 732{
 733	struct inode *inode = d_inode(dentry);
 734	struct pid *pid = inode->i_private;
 735	struct pidfs_attr *attr = pid->attr;
 736	struct simple_xattrs *xattrs;
 737
 738	xattrs = READ_ONCE(attr->xattrs);
 739	if (!xattrs)
 740		return 0;
 741
 742	return simple_xattr_list(inode, xattrs, buf, size);
 743}
 744
/* Inode operations installed on every pidfs inode by pidfs_init_inode(). */
static const struct inode_operations pidfs_inode_operations = {
	.getattr	= pidfs_getattr,
	.setattr	= pidfs_setattr,
	.listxattr	= pidfs_listxattr,
};
 750
/*
 * ->evict_inode(): tear down the inode and drop the struct pid reference
 * that was stored in inode->i_private.
 */
static void pidfs_evict_inode(struct inode *inode)
{
	/* Fetch the pid before the inode is cleared. */
	struct pid *pid = inode->i_private;

	clear_inode(inode);
	put_pid(pid);
}
 758
/* Superblock operations of pidfs; installed via pidfs_init_fs_context(). */
static const struct super_operations pidfs_sops = {
	.drop_inode	= inode_just_drop,
	.evict_inode	= pidfs_evict_inode,
	.statfs		= simple_statfs,
};
 764
/*
 * 'lsof' has knowledge of our historical anon_inode use, and expects
 * the pidfs dentry name to start with 'anon_inode'.
 */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
}
 773
/* Dentry operations of pidfs; installed via pidfs_init_fs_context(). */
const struct dentry_operations pidfs_dentry_operations = {
	.d_dname	= pidfs_dname,
	.d_prune	= stashed_dentry_prune,
};
 778
 779static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
 780			   struct inode *parent)
 781{
 782	const struct pid *pid = inode->i_private;
 783
 784	if (*max_len < 2) {
 785		*max_len = 2;
 786		return FILEID_INVALID;
 787	}
 788
 789	*max_len = 2;
 790	*(u64 *)fh = pid->ino;
 791	return FILEID_KERNFS;
 792}
 793
 794static int pidfs_ino_find(const void *key, const struct rb_node *node)
 795{
 796	const u64 pid_ino = *(u64 *)key;
 797	const struct pid *pid = rb_entry(node, struct pid, pidfs_node);
 798
 799	if (pid_ino < pid->ino)
 800		return -1;
 801	if (pid_ino > pid->ino)
 802		return 1;
 803	return 0;
 804}
 805
 806/* Find a struct pid based on the inode number. */
 807static struct pid *pidfs_ino_get_pid(u64 ino)
 808{
 809	struct pid *pid;
 810	struct rb_node *node;
 811	unsigned int seq;
 812
 813	guard(rcu)();
 814	do {
 815		seq = read_seqcount_begin(&pidmap_lock_seq);
 816		node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find);
 817		if (node)
 818			break;
 819	} while (read_seqcount_retry(&pidmap_lock_seq, seq));
 820
 821	if (!node)
 822		return NULL;
 823
 824	pid = rb_entry(node, struct pid, pidfs_node);
 825
 826	/* Within our pid namespace hierarchy? */
 827	if (pid_vnr(pid) == 0)
 828		return NULL;
 829
 830	return get_pid(pid);
 831}
 832
 833static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
 834					 struct fid *fid, int fh_len,
 835					 int fh_type)
 836{
 837	int ret;
 838	u64 pid_ino;
 839	struct path path;
 840	struct pid *pid;
 841
 842	if (fh_len < 2)
 843		return NULL;
 844
 845	switch (fh_type) {
 846	case FILEID_KERNFS:
 847		pid_ino = *(u64 *)fid;
 848		break;
 849	default:
 850		return NULL;
 851	}
 852
 853	pid = pidfs_ino_get_pid(pid_ino);
 854	if (!pid)
 855		return NULL;
 856
 857	ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
 858	if (ret < 0)
 859		return ERR_PTR(ret);
 860
 861	VFS_WARN_ON_ONCE(!pid->attr);
 862
 863	mntput(path.mnt);
 864	return path.dentry;
 865}
 866
 867/*
 868 * Make sure that we reject any nonsensical flags that users pass via
 869 * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and
 870 * PIDFD_NONBLOCK as O_NONBLOCK.
 871 */
 872#define VALID_FILE_HANDLE_OPEN_FLAGS \
 873	(O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL)
 874
 875static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
 876				   unsigned int oflags)
 877{
 878	if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE))
 879		return -EINVAL;
 880
 881	/*
 882	 * pidfd_ino_get_pid() will verify that the struct pid is part
 883	 * of the caller's pid namespace hierarchy. No further
 884	 * permission checks are needed.
 885	 */
 886	return 0;
 887}
 888
 889static struct file *pidfs_export_open(const struct path *path, unsigned int oflags)
 890{
 891	/*
 892	 * Clear O_LARGEFILE as open_by_handle_at() forces it and raise
 893	 * O_RDWR as pidfds always are.
 894	 */
 895	oflags &= ~O_LARGEFILE;
 896	return dentry_open(path, oflags | O_RDWR, current_cred());
 897}
 898
/* File handle (exportfs) operations of pidfs; installed via pidfs_init_fs_context(). */
static const struct export_operations pidfs_export_operations = {
	.encode_fh	= pidfs_encode_fh,
	.fh_to_dentry	= pidfs_fh_to_dentry,
	.open		= pidfs_export_open,
	.permission	= pidfs_export_permission,
};
 905
 906static int pidfs_init_inode(struct inode *inode, void *data)
 907{
 908	const struct pid *pid = data;
 909
 910	inode->i_private = data;
 911	inode->i_flags |= S_PRIVATE | S_ANON_INODE;
 912	/* We allow to set xattrs. */
 913	inode->i_flags &= ~S_IMMUTABLE;
 914	inode->i_mode |= S_IRWXU;
 915	inode->i_op = &pidfs_inode_operations;
 916	inode->i_fop = &pidfs_file_operations;
 917	inode->i_ino = pidfs_ino(pid->ino);
 918	inode->i_generation = pidfs_gen(pid->ino);
 919	return 0;
 920}
 921
 922static void pidfs_put_data(void *data)
 923{
 924	struct pid *pid = data;
 925	put_pid(pid);
 926}
 927
/**
 * pidfs_register_pid - register a struct pid in pidfs
 * @pid: pid to pin
 *
 * Register a struct pid in pidfs by making sure that pid->attr is
 * allocated. A NULL @pid and an already registered @pid are both treated
 * as success. A struct pid that pidfs_exit() marked as PIDFS_PID_DEAD
 * cannot be registered anymore.
 *
 * Return: On success zero, on error a negative error code is returned:
 * -ESRCH if the pid was marked dead, -ENOMEM if the allocation failed.
 */
int pidfs_register_pid(struct pid *pid)
{
	struct pidfs_attr *new_attr __free(kfree) = NULL;
	struct pidfs_attr *attr;

	might_sleep();

	if (!pid)
		return 0;

	/* Lockless fast path: nothing to do if the state is already settled. */
	attr = READ_ONCE(pid->attr);
	if (unlikely(attr == PIDFS_PID_DEAD))
		return PTR_ERR(PIDFS_PID_DEAD);
	if (attr)
		return 0;

	/* Allocate before taking the spinlock; GFP_KERNEL may sleep. */
	new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
	if (!new_attr)
		return -ENOMEM;

	/* Synchronize with pidfs_exit(). */
	guard(spinlock_irq)(&pid->wait_pidfd.lock);

	/*
	 * Recheck under the lock. If we lost a race the unused allocation
	 * is still owned by new_attr and released by its __free() cleanup.
	 */
	attr = pid->attr;
	if (unlikely(attr == PIDFS_PID_DEAD))
		return PTR_ERR(PIDFS_PID_DEAD);
	if (unlikely(attr))
		return 0;

	/* Hand ownership of the allocation over to the struct pid. */
	pid->attr = no_free_ptr(new_attr);
	return 0;
}
 968
 969static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
 970					 struct dentry *dentry)
 971{
 972	int ret;
 973	struct pid *pid = d_inode(dentry)->i_private;
 974
 975	VFS_WARN_ON_ONCE(stashed != &pid->stashed);
 976
 977	ret = pidfs_register_pid(pid);
 978	if (ret)
 979		return ERR_PTR(ret);
 980
 981	return stash_dentry(stashed, dentry);
 982}
 983
/* Stashed dentry callbacks; handed to the superblock via fc->s_fs_info. */
static const struct stashed_operations pidfs_stashed_ops = {
	.stash_dentry	= pidfs_stash_dentry,
	.init_inode	= pidfs_init_inode,
	.put_data	= pidfs_put_data,
};
 989
 990static int pidfs_xattr_get(const struct xattr_handler *handler,
 991			   struct dentry *unused, struct inode *inode,
 992			   const char *suffix, void *value, size_t size)
 993{
 994	struct pid *pid = inode->i_private;
 995	struct pidfs_attr *attr = pid->attr;
 996	const char *name;
 997	struct simple_xattrs *xattrs;
 998
 999	xattrs = READ_ONCE(attr->xattrs);
1000	if (!xattrs)
1001		return 0;
1002
1003	name = xattr_full_name(handler, suffix);
1004	return simple_xattr_get(xattrs, name, value, size);
1005}
1006
1007static int pidfs_xattr_set(const struct xattr_handler *handler,
1008			   struct mnt_idmap *idmap, struct dentry *unused,
1009			   struct inode *inode, const char *suffix,
1010			   const void *value, size_t size, int flags)
1011{
1012	struct pid *pid = inode->i_private;
1013	struct pidfs_attr *attr = pid->attr;
1014	const char *name;
1015	struct simple_xattrs *xattrs;
1016	struct simple_xattr *old_xattr;
1017
1018	/* Ensure we're the only one to set @attr->xattrs. */
1019	WARN_ON_ONCE(!inode_is_locked(inode));
1020
1021	xattrs = READ_ONCE(attr->xattrs);
1022	if (!xattrs) {
1023		xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
1024		if (!xattrs)
1025			return -ENOMEM;
1026
1027		simple_xattrs_init(xattrs);
1028		smp_store_release(&pid->attr->xattrs, xattrs);
1029	}
1030
1031	name = xattr_full_name(handler, suffix);
1032	old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
1033	if (IS_ERR(old_xattr))
1034		return PTR_ERR(old_xattr);
1035
1036	simple_xattr_free(old_xattr);
1037	return 0;
1038}
1039
/* Handler for xattrs carrying the XATTR_TRUSTED_PREFIX prefix. */
static const struct xattr_handler pidfs_trusted_xattr_handler = {
	.prefix = XATTR_TRUSTED_PREFIX,
	.get	= pidfs_xattr_get,
	.set	= pidfs_xattr_set,
};
1045
/* NULL terminated list of xattr handlers; installed via pidfs_init_fs_context(). */
static const struct xattr_handler *const pidfs_xattr_handlers[] = {
	&pidfs_trusted_xattr_handler,
	NULL
};
1050
1051static int pidfs_init_fs_context(struct fs_context *fc)
1052{
1053	struct pseudo_fs_context *ctx;
1054
1055	ctx = init_pseudo(fc, PID_FS_MAGIC);
1056	if (!ctx)
1057		return -ENOMEM;
1058
1059	fc->s_iflags |= SB_I_NOEXEC;
1060	fc->s_iflags |= SB_I_NODEV;
1061	ctx->s_d_flags |= DCACHE_DONTCACHE;
1062	ctx->ops = &pidfs_sops;
1063	ctx->eops = &pidfs_export_operations;
1064	ctx->dops = &pidfs_dentry_operations;
1065	ctx->xattr = pidfs_xattr_handlers;
1066	fc->s_fs_info = (void *)&pidfs_stashed_ops;
1067	return 0;
1068}
1069
/* The pidfs filesystem type; mounted internally by pidfs_init(). */
static struct file_system_type pidfs_type = {
	.name			= "pidfs",
	.init_fs_context	= pidfs_init_fs_context,
	.kill_sb		= kill_anon_super,
};
1075
1076struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
1077{
1078	struct file *pidfd_file;
1079	struct path path __free(path_put) = {};
1080	int ret;
1081
1082	/*
1083	 * Ensure that PIDFD_STALE can be passed as a flag without
1084	 * overloading other uapi pidfd flags.
1085	 */
1086	BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
1087	BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
1088
1089	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
1090	if (ret < 0)
1091		return ERR_PTR(ret);
1092
1093	VFS_WARN_ON_ONCE(!pid->attr);
1094
1095	flags &= ~PIDFD_STALE;
1096	flags |= O_RDWR;
1097	pidfd_file = dentry_open(&path, flags, current_cred());
1098	/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
1099	if (!IS_ERR(pidfd_file))
1100		pidfd_file->f_flags |= (flags & PIDFD_THREAD);
1101
1102	return pidfd_file;
1103}
1104
/*
 * pidfs_init - boot time setup of pidfs
 *
 * Creates the slab caches for struct pidfs_attr and struct simple_xattrs,
 * mounts the internal pidfs instance and records its root path. Both caches
 * are created with SLAB_PANIC and a failed mount panics explicitly.
 */
void __init pidfs_init(void)
{
	pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
					 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
					  SLAB_ACCOUNT | SLAB_PANIC), NULL);

	pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
					       sizeof(struct simple_xattrs), 0,
					       (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
						SLAB_ACCOUNT | SLAB_PANIC), NULL);

	pidfs_mnt = kern_mount(&pidfs_type);
	if (IS_ERR(pidfs_mnt))
		panic("Failed to mount pidfs pseudo filesystem");

	/* Backs pidfs_get_root(). */
	pidfs_root_path.mnt = pidfs_mnt;
	pidfs_root_path.dentry = pidfs_mnt->mnt_root;
}