fs/proc/base.c at v5.8-rc1 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / proc / base.c
at v5.8-rc1 3813 lines 93 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/proc/base.c
   4 *
   5 *  Copyright (C) 1991, 1992 Linus Torvalds
   6 *
   7 *  proc base directory handling functions
   8 *
   9 *  1999, Al Viro. Rewritten. Now it covers the whole per-process part.
  10 *  Instead of using magical inumbers to determine the kind of object
  11 *  we allocate and fill in-core inodes upon lookup. They don't even
  12 *  go into icache. We cache the reference to task_struct upon lookup too.
  13 *  Eventually it should become a filesystem in its own. We don't use the
  14 *  rest of procfs anymore.
  15 *
  16 *
  17 *  Changelog:
  18 *  17-Jan-2005
  19 *  Allan Bezerra
  20 *  Bruna Moreira <bruna.moreira@indt.org.br>
  21 *  Edjard Mota <edjard.mota@indt.org.br>
  22 *  Ilias Biris <ilias.biris@indt.org.br>
  23 *  Mauricio Lin <mauricio.lin@indt.org.br>
  24 *
  25 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
  26 *
  27 *  A new process specific entry (smaps) included in /proc. It shows the
  28 *  size of rss for each memory area. The maps entry lacks information
  29 *  about physical memory size (rss) for each mapped file, i.e.,
  30 *  rss information for executables and library files.
  31 *  This additional information is useful for any tools that need to know
  32 *  about physical memory consumption for a process specific library.
  33 *
  34 *  Changelog:
  35 *  21-Feb-2005
  36 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
  37 *  Pud inclusion in the page table walking.
  38 *
  39 *  ChangeLog:
  40 *  10-Mar-2005
  41 *  10LE Instituto Nokia de Tecnologia - INdT:
 *  A better way to walk through the page table, as suggested by Hugh Dickins.
  43 *
  44 *  Simo Piiroinen <simo.piiroinen@nokia.com>:
  45 *  Smaps information related to shared, private, clean and dirty pages.
  46 *
  47 *  Paul Mundt <paul.mundt@nokia.com>:
  48 *  Overall revision about smaps.
  49 */
  50
  51#include <linux/uaccess.h>
  52
  53#include <linux/errno.h>
  54#include <linux/time.h>
  55#include <linux/proc_fs.h>
  56#include <linux/stat.h>
  57#include <linux/task_io_accounting_ops.h>
  58#include <linux/init.h>
  59#include <linux/capability.h>
  60#include <linux/file.h>
  61#include <linux/fdtable.h>
  62#include <linux/generic-radix-tree.h>
  63#include <linux/string.h>
  64#include <linux/seq_file.h>
  65#include <linux/namei.h>
  66#include <linux/mnt_namespace.h>
  67#include <linux/mm.h>
  68#include <linux/swap.h>
  69#include <linux/rcupdate.h>
  70#include <linux/kallsyms.h>
  71#include <linux/stacktrace.h>
  72#include <linux/resource.h>
  73#include <linux/module.h>
  74#include <linux/mount.h>
  75#include <linux/security.h>
  76#include <linux/ptrace.h>
  77#include <linux/tracehook.h>
  78#include <linux/printk.h>
  79#include <linux/cache.h>
  80#include <linux/cgroup.h>
  81#include <linux/cpuset.h>
  82#include <linux/audit.h>
  83#include <linux/poll.h>
  84#include <linux/nsproxy.h>
  85#include <linux/oom.h>
  86#include <linux/elf.h>
  87#include <linux/pid_namespace.h>
  88#include <linux/user_namespace.h>
  89#include <linux/fs_struct.h>
  90#include <linux/slab.h>
  91#include <linux/sched/autogroup.h>
  92#include <linux/sched/mm.h>
  93#include <linux/sched/coredump.h>
  94#include <linux/sched/debug.h>
  95#include <linux/sched/stat.h>
  96#include <linux/posix-timers.h>
  97#include <linux/time_namespace.h>
  98#include <linux/resctrl.h>
  99#include <trace/events/oom.h>
 100#include "internal.h"
 101#include "fd.h"
 102
 103#include "../../lib/kstrtox.h"
 104
 105/* NOTE:
 106 *	Implementing inode permission operations in /proc is almost
 107 *	certainly an error.  Permission checks need to happen during
 108 *	each system call not at open time.  The reason is that most of
 109 *	what we wish to check for permissions in /proc varies at runtime.
 110 *
 111 *	The classic example of a problem is opening file descriptors
 112 *	in /proc for a task before it execs a suid executable.
 113 */
 114
/*
 * Link counts cached in one byte each.  __ro_after_init means they are
 * only written during init; the assignment is not in this part of the file
 * (presumably computed with pid_entry_nlink() for the per-thread and
 * per-process entry tables - TODO confirm).
 */
static u8 nlink_tid __ro_after_init;
static u8 nlink_tgid __ro_after_init;
 117
/*
 * Description of one entry in a per-pid directory table.  Instances are
 * normally produced by the NOD() macro and its DIR/LNK/REG/ONE/ATTR wrappers.
 */
struct pid_entry {
	const char *name;	/* entry name */
	unsigned int len;	/* length of @name without the trailing NUL */
	umode_t mode;		/* file type (S_IF*) combined with permission bits */
	const struct inode_operations *iop;	/* inode ops; NULL for REG/ONE/ATTR entries */
	const struct file_operations *fop;	/* file ops; NULL for LNK entries */
	union proc_op op;	/* per-entry callback or data (proc_get_link, proc_show, lsm) */
};
 126
/*
 * Generic struct pid_entry initializer.  NAME has to be a string literal
 * (or char array): .len is derived with sizeof(), not strlen().
 */
#define NOD(NAME, MODE, IOP, FOP, OP) {			\
	.name = (NAME),					\
	.len  = sizeof(NAME) - 1,			\
	.mode = MODE,					\
	.iop  = IOP,					\
	.fop  = FOP,					\
	.op   = OP,					\
}
 135
/* Directory entry: S_IFDIR is OR-ed into MODE, both op tables are given by name. */
#define DIR(NAME, MODE, iops, fops)	\
	NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
/* Symlink entry, mode rwxrwxrwx; get_link is stored as the proc_get_link callback. */
#define LNK(NAME, get_link)					\
	NOD(NAME, (S_IFLNK|S_IRWXUGO),				\
		&proc_pid_link_inode_operations, NULL,		\
		{ .proc_get_link = get_link } )
/* Regular file entry with its own file_operations and no inode ops. */
#define REG(NAME, MODE, fops)				\
	NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
/* Regular file served by proc_single_file_operations; show is the proc_show callback. */
#define ONE(NAME, MODE, show)				\
	NOD(NAME, (S_IFREG|(MODE)),			\
		NULL, &proc_single_file_operations,	\
		{ .proc_show = show } )
/* Regular file served by proc_pid_attr_operations; LSM is stored in op.lsm. */
#define ATTR(LSM, NAME, MODE)				\
	NOD(NAME, (S_IFREG|(MODE)),			\
		NULL, &proc_pid_attr_operations,	\
		{ .lsm = LSM })
 152
 153/*
 154 * Count the number of hardlinks for the pid_entry table, excluding the .
 155 * and .. links.
 156 */
 157static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
 158	unsigned int n)
 159{
 160	unsigned int i;
 161	unsigned int count;
 162
 163	count = 2;
 164	for (i = 0; i < n; ++i) {
 165		if (S_ISDIR(entries[i].mode))
 166			++count;
 167	}
 168
 169	return count;
 170}
 171
 172static int get_task_root(struct task_struct *task, struct path *root)
 173{
 174	int result = -ENOENT;
 175
 176	task_lock(task);
 177	if (task->fs) {
 178		get_fs_root(task->fs, root);
 179		result = 0;
 180	}
 181	task_unlock(task);
 182	return result;
 183}
 184
 185static int proc_cwd_link(struct dentry *dentry, struct path *path)
 186{
 187	struct task_struct *task = get_proc_task(d_inode(dentry));
 188	int result = -ENOENT;
 189
 190	if (task) {
 191		task_lock(task);
 192		if (task->fs) {
 193			get_fs_pwd(task->fs, path);
 194			result = 0;
 195		}
 196		task_unlock(task);
 197		put_task_struct(task);
 198	}
 199	return result;
 200}
 201
 202static int proc_root_link(struct dentry *dentry, struct path *path)
 203{
 204	struct task_struct *task = get_proc_task(d_inode(dentry));
 205	int result = -ENOENT;
 206
 207	if (task) {
 208		result = get_task_root(task, path);
 209		put_task_struct(task);
 210	}
 211	return result;
 212}
 213
 214/*
 215 * If the user used setproctitle(), we just get the string from
 216 * user space at arg_start, and limit it to a maximum of one page.
 217 */
 218static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
 219				size_t count, unsigned long pos,
 220				unsigned long arg_start)
 221{
 222	char *page;
 223	int ret, got;
 224
 225	if (pos >= PAGE_SIZE)
 226		return 0;
 227
 228	page = (char *)__get_free_page(GFP_KERNEL);
 229	if (!page)
 230		return -ENOMEM;
 231
 232	ret = 0;
 233	got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
 234	if (got > 0) {
 235		int len = strnlen(page, got);
 236
 237		/* Include the NUL character if it was found */
 238		if (len < got)
 239			len++;
 240
 241		if (len > pos) {
 242			len -= pos;
 243			if (len > count)
 244				len = count;
 245			len -= copy_to_user(buf, page+pos, len);
 246			if (!len)
 247				len = -EFAULT;
 248			ret = len;
 249		}
 250	}
 251	free_page((unsigned long)page);
 252	return ret;
 253}
 254
 255static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
 256			      size_t count, loff_t *ppos)
 257{
 258	unsigned long arg_start, arg_end, env_start, env_end;
 259	unsigned long pos, len;
 260	char *page, c;
 261
 262	/* Check if process spawned far enough to have cmdline. */
 263	if (!mm->env_end)
 264		return 0;
 265
 266	spin_lock(&mm->arg_lock);
 267	arg_start = mm->arg_start;
 268	arg_end = mm->arg_end;
 269	env_start = mm->env_start;
 270	env_end = mm->env_end;
 271	spin_unlock(&mm->arg_lock);
 272
 273	if (arg_start >= arg_end)
 274		return 0;
 275
 276	/*
 277	 * We allow setproctitle() to overwrite the argument
 278	 * strings, and overflow past the original end. But
 279	 * only when it overflows into the environment area.
 280	 */
 281	if (env_start != arg_end || env_end < env_start)
 282		env_start = env_end = arg_end;
 283	len = env_end - arg_start;
 284
 285	/* We're not going to care if "*ppos" has high bits set */
 286	pos = *ppos;
 287	if (pos >= len)
 288		return 0;
 289	if (count > len - pos)
 290		count = len - pos;
 291	if (!count)
 292		return 0;
 293
 294	/*
 295	 * Magical special case: if the argv[] end byte is not
 296	 * zero, the user has overwritten it with setproctitle(3).
 297	 *
 298	 * Possible future enhancement: do this only once when
 299	 * pos is 0, and set a flag in the 'struct file'.
 300	 */
 301	if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
 302		return get_mm_proctitle(mm, buf, count, pos, arg_start);
 303
 304	/*
 305	 * For the non-setproctitle() case we limit things strictly
 306	 * to the [arg_start, arg_end[ range.
 307	 */
 308	pos += arg_start;
 309	if (pos < arg_start || pos >= arg_end)
 310		return 0;
 311	if (count > arg_end - pos)
 312		count = arg_end - pos;
 313
 314	page = (char *)__get_free_page(GFP_KERNEL);
 315	if (!page)
 316		return -ENOMEM;
 317
 318	len = 0;
 319	while (count) {
 320		int got;
 321		size_t size = min_t(size_t, PAGE_SIZE, count);
 322
 323		got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
 324		if (got <= 0)
 325			break;
 326		got -= copy_to_user(buf, page, got);
 327		if (unlikely(!got)) {
 328			if (!len)
 329				len = -EFAULT;
 330			break;
 331		}
 332		pos += got;
 333		buf += got;
 334		len += got;
 335		count -= got;
 336	}
 337
 338	free_page((unsigned long)page);
 339	return len;
 340}
 341
 342static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf,
 343				size_t count, loff_t *pos)
 344{
 345	struct mm_struct *mm;
 346	ssize_t ret;
 347
 348	mm = get_task_mm(tsk);
 349	if (!mm)
 350		return 0;
 351
 352	ret = get_mm_cmdline(mm, buf, count, pos);
 353	mmput(mm);
 354	return ret;
 355}
 356
 357static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
 358				     size_t count, loff_t *pos)
 359{
 360	struct task_struct *tsk;
 361	ssize_t ret;
 362
 363	BUG_ON(*pos < 0);
 364
 365	tsk = get_proc_task(file_inode(file));
 366	if (!tsk)
 367		return -ESRCH;
 368	ret = get_task_cmdline(tsk, buf, count, pos);
 369	put_task_struct(tsk);
 370	if (ret > 0)
 371		*pos += ret;
 372	return ret;
 373}
 374
 375static const struct file_operations proc_pid_cmdline_ops = {
 376	.read	= proc_pid_cmdline_read,
 377	.llseek	= generic_file_llseek,
 378};
 379
 380#ifdef CONFIG_KALLSYMS
 381/*
 382 * Provides a wchan file via kallsyms in a proper one-value-per-file format.
 383 * Returns the resolved symbol.  If that fails, simply return the address.
 384 */
 385static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
 386			  struct pid *pid, struct task_struct *task)
 387{
 388	unsigned long wchan;
 389	char symname[KSYM_NAME_LEN];
 390
 391	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
 392		goto print0;
 393
 394	wchan = get_wchan(task);
 395	if (wchan && !lookup_symbol_name(wchan, symname)) {
 396		seq_puts(m, symname);
 397		return 0;
 398	}
 399
 400print0:
 401	seq_putc(m, '0');
 402	return 0;
 403}
 404#endif /* CONFIG_KALLSYMS */
 405
 406static int lock_trace(struct task_struct *task)
 407{
 408	int err = mutex_lock_killable(&task->signal->exec_update_mutex);
 409	if (err)
 410		return err;
 411	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
 412		mutex_unlock(&task->signal->exec_update_mutex);
 413		return -EPERM;
 414	}
 415	return 0;
 416}
 417
 418static void unlock_trace(struct task_struct *task)
 419{
 420	mutex_unlock(&task->signal->exec_update_mutex);
 421}
 422
 423#ifdef CONFIG_STACKTRACE
 424
 425#define MAX_STACK_TRACE_DEPTH	64
 426
 427static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
 428			  struct pid *pid, struct task_struct *task)
 429{
 430	unsigned long *entries;
 431	int err;
 432
 433	/*
 434	 * The ability to racily run the kernel stack unwinder on a running task
 435	 * and then observe the unwinder output is scary; while it is useful for
 436	 * debugging kernel issues, it can also allow an attacker to leak kernel
 437	 * stack contents.
 438	 * Doing this in a manner that is at least safe from races would require
 439	 * some work to ensure that the remote task can not be scheduled; and
 440	 * even then, this would still expose the unwinder as local attack
 441	 * surface.
 442	 * Therefore, this interface is restricted to root.
 443	 */
 444	if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
 445		return -EACCES;
 446
 447	entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
 448				GFP_KERNEL);
 449	if (!entries)
 450		return -ENOMEM;
 451
 452	err = lock_trace(task);
 453	if (!err) {
 454		unsigned int i, nr_entries;
 455
 456		nr_entries = stack_trace_save_tsk(task, entries,
 457						  MAX_STACK_TRACE_DEPTH, 0);
 458
 459		for (i = 0; i < nr_entries; i++) {
 460			seq_printf(m, "[<0>] %pB\n", (void *)entries[i]);
 461		}
 462
 463		unlock_trace(task);
 464	}
 465	kfree(entries);
 466
 467	return err;
 468}
 469#endif
 470
 471#ifdef CONFIG_SCHED_INFO
 472/*
 473 * Provides /proc/PID/schedstat
 474 */
 475static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
 476			      struct pid *pid, struct task_struct *task)
 477{
 478	if (unlikely(!sched_info_on()))
 479		seq_puts(m, "0 0 0\n");
 480	else
 481		seq_printf(m, "%llu %llu %lu\n",
 482		   (unsigned long long)task->se.sum_exec_runtime,
 483		   (unsigned long long)task->sched_info.run_delay,
 484		   task->sched_info.pcount);
 485
 486	return 0;
 487}
 488#endif
 489
 490#ifdef CONFIG_LATENCYTOP
 491static int lstats_show_proc(struct seq_file *m, void *v)
 492{
 493	int i;
 494	struct inode *inode = m->private;
 495	struct task_struct *task = get_proc_task(inode);
 496
 497	if (!task)
 498		return -ESRCH;
 499	seq_puts(m, "Latency Top version : v0.1\n");
 500	for (i = 0; i < LT_SAVECOUNT; i++) {
 501		struct latency_record *lr = &task->latency_record[i];
 502		if (lr->backtrace[0]) {
 503			int q;
 504			seq_printf(m, "%i %li %li",
 505				   lr->count, lr->time, lr->max);
 506			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
 507				unsigned long bt = lr->backtrace[q];
 508
 509				if (!bt)
 510					break;
 511				seq_printf(m, " %ps", (void *)bt);
 512			}
 513			seq_putc(m, '\n');
 514		}
 515
 516	}
 517	put_task_struct(task);
 518	return 0;
 519}
 520
/*
 * ->open() handler: set up a single-record seq_file that renders through
 * lstats_show_proc(); the inode is handed over as the seq_file's private
 * data.  Returns whatever single_open() returns.
 */
static int lstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, lstats_show_proc, inode);
}
 525
 526static ssize_t lstats_write(struct file *file, const char __user *buf,
 527			    size_t count, loff_t *offs)
 528{
 529	struct task_struct *task = get_proc_task(file_inode(file));
 530
 531	if (!task)
 532		return -ESRCH;
 533	clear_tsk_latency_tracing(task);
 534	put_task_struct(task);
 535
 536	return count;
 537}
 538
 539static const struct file_operations proc_lstats_operations = {
 540	.open		= lstats_open,
 541	.read		= seq_read,
 542	.write		= lstats_write,
 543	.llseek		= seq_lseek,
 544	.release	= single_release,
 545};
 546
 547#endif
 548
 549static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
 550			  struct pid *pid, struct task_struct *task)
 551{
 552	unsigned long totalpages = totalram_pages() + total_swap_pages;
 553	unsigned long points = 0;
 554
 555	points = oom_badness(task, totalpages) * 1000 / totalpages;
 556	seq_printf(m, "%lu\n", points);
 557
 558	return 0;
 559}
 560
/* Display strings for one resource limit, as printed by proc_pid_limits(). */
struct limit_names {
	const char *name;	/* text for the "Limit" column */
	const char *unit;	/* text for the "Units" column; NULL prints no unit */
};
 565
/*
 * Name and unit of every resource limit, indexed by RLIMIT_* number.
 * proc_pid_limits() walks this table from 0 to RLIM_NLIMITS - 1.
 */
static const struct limit_names lnames[RLIM_NLIMITS] = {
	[RLIMIT_CPU] = {"Max cpu time", "seconds"},
	[RLIMIT_FSIZE] = {"Max file size", "bytes"},
	[RLIMIT_DATA] = {"Max data size", "bytes"},
	[RLIMIT_STACK] = {"Max stack size", "bytes"},
	[RLIMIT_CORE] = {"Max core file size", "bytes"},
	[RLIMIT_RSS] = {"Max resident set", "bytes"},
	[RLIMIT_NPROC] = {"Max processes", "processes"},
	[RLIMIT_NOFILE] = {"Max open files", "files"},
	[RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
	[RLIMIT_AS] = {"Max address space", "bytes"},
	[RLIMIT_LOCKS] = {"Max file locks", "locks"},
	[RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
	[RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
	/* the two priority limits are plain numbers without a unit */
	[RLIMIT_NICE] = {"Max nice priority", NULL},
	[RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
	[RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
};
 584
 585/* Display limits for a process */
 586static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
 587			   struct pid *pid, struct task_struct *task)
 588{
 589	unsigned int i;
 590	unsigned long flags;
 591
 592	struct rlimit rlim[RLIM_NLIMITS];
 593
 594	if (!lock_task_sighand(task, &flags))
 595		return 0;
 596	memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
 597	unlock_task_sighand(task, &flags);
 598
 599	/*
 600	 * print the file header
 601	 */
 602	seq_puts(m, "Limit                     "
 603		"Soft Limit           "
 604		"Hard Limit           "
 605		"Units     \n");
 606
 607	for (i = 0; i < RLIM_NLIMITS; i++) {
 608		if (rlim[i].rlim_cur == RLIM_INFINITY)
 609			seq_printf(m, "%-25s %-20s ",
 610				   lnames[i].name, "unlimited");
 611		else
 612			seq_printf(m, "%-25s %-20lu ",
 613				   lnames[i].name, rlim[i].rlim_cur);
 614
 615		if (rlim[i].rlim_max == RLIM_INFINITY)
 616			seq_printf(m, "%-20s ", "unlimited");
 617		else
 618			seq_printf(m, "%-20lu ", rlim[i].rlim_max);
 619
 620		if (lnames[i].unit)
 621			seq_printf(m, "%-10s\n", lnames[i].unit);
 622		else
 623			seq_putc(m, '\n');
 624	}
 625
 626	return 0;
 627}
 628
 629#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 630static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
 631			    struct pid *pid, struct task_struct *task)
 632{
 633	struct syscall_info info;
 634	u64 *args = &info.data.args[0];
 635	int res;
 636
 637	res = lock_trace(task);
 638	if (res)
 639		return res;
 640
 641	if (task_current_syscall(task, &info))
 642		seq_puts(m, "running\n");
 643	else if (info.data.nr < 0)
 644		seq_printf(m, "%d 0x%llx 0x%llx\n",
 645			   info.data.nr, info.sp, info.data.instruction_pointer);
 646	else
 647		seq_printf(m,
 648		       "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
 649		       info.data.nr,
 650		       args[0], args[1], args[2], args[3], args[4], args[5],
 651		       info.sp, info.data.instruction_pointer);
 652	unlock_trace(task);
 653
 654	return 0;
 655}
 656#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
 657
 658/************************************************************************/
 659/*                       Here the fs part begins                        */
 660/************************************************************************/
 661
 662/* permission checks */
 663static int proc_fd_access_allowed(struct inode *inode)
 664{
 665	struct task_struct *task;
 666	int allowed = 0;
 667	/* Allow access to a task's file descriptors if it is us or we
 668	 * may use ptrace attach to the process and find out that
 669	 * information.
 670	 */
 671	task = get_proc_task(inode);
 672	if (task) {
 673		allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
 674		put_task_struct(task);
 675	}
 676	return allowed;
 677}
 678
 679int proc_setattr(struct dentry *dentry, struct iattr *attr)
 680{
 681	int error;
 682	struct inode *inode = d_inode(dentry);
 683
 684	if (attr->ia_valid & ATTR_MODE)
 685		return -EPERM;
 686
 687	error = setattr_prepare(dentry, attr);
 688	if (error)
 689		return error;
 690
 691	setattr_copy(inode, attr);
 692	mark_inode_dirty(inode);
 693	return 0;
 694}
 695
 696/*
 697 * May current process learn task's sched/cmdline info (for hide_pid_min=1)
 698 * or euid/egid (for hide_pid_min=2)?
 699 */
 700static bool has_pid_permissions(struct proc_fs_info *fs_info,
 701				 struct task_struct *task,
 702				 enum proc_hidepid hide_pid_min)
 703{
 704	/*
 705	 * If 'hidpid' mount option is set force a ptrace check,
 706	 * we indicate that we are using a filesystem syscall
 707	 * by passing PTRACE_MODE_READ_FSCREDS
 708	 */
 709	if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE)
 710		return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
 711
 712	if (fs_info->hide_pid < hide_pid_min)
 713		return true;
 714	if (in_group_p(fs_info->pid_gid))
 715		return true;
 716	return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
 717}
 718
 719
 720static int proc_pid_permission(struct inode *inode, int mask)
 721{
 722	struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
 723	struct task_struct *task;
 724	bool has_perms;
 725
 726	task = get_proc_task(inode);
 727	if (!task)
 728		return -ESRCH;
 729	has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
 730	put_task_struct(task);
 731
 732	if (!has_perms) {
 733		if (fs_info->hide_pid == HIDEPID_INVISIBLE) {
 734			/*
 735			 * Let's make getdents(), stat(), and open()
 736			 * consistent with each other.  If a process
 737			 * may not stat() a file, it shouldn't be seen
 738			 * in procfs at all.
 739			 */
 740			return -ENOENT;
 741		}
 742
 743		return -EPERM;
 744	}
 745	return generic_permission(inode, mask);
 746}
 747
 748
 749
/* Default inode operations: only ->setattr is provided, via proc_setattr(). */
static const struct inode_operations proc_def_inode_operations = {
	.setattr	= proc_setattr,
};
 753
 754static int proc_single_show(struct seq_file *m, void *v)
 755{
 756	struct inode *inode = m->private;
 757	struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
 758	struct pid *pid = proc_pid(inode);
 759	struct task_struct *task;
 760	int ret;
 761
 762	task = get_pid_task(pid, PIDTYPE_PID);
 763	if (!task)
 764		return -ESRCH;
 765
 766	ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
 767
 768	put_task_struct(task);
 769	return ret;
 770}
 771
/*
 * ->open() handler for ONE() entries: set up a single-record seq_file
 * rendered by proc_single_show(), with the inode as its private data.
 * Returns whatever single_open() returns.
 */
static int proc_single_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, proc_single_show, inode);
}
 776
 777static const struct file_operations proc_single_file_operations = {
 778	.open		= proc_single_open,
 779	.read		= seq_read,
 780	.llseek		= seq_lseek,
 781	.release	= single_release,
 782};
 783
 784
 785struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
 786{
 787	struct task_struct *task = get_proc_task(inode);
 788	struct mm_struct *mm = ERR_PTR(-ESRCH);
 789
 790	if (task) {
 791		mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
 792		put_task_struct(task);
 793
 794		if (!IS_ERR_OR_NULL(mm)) {
 795			/* ensure this mm_struct can't be freed */
 796			mmgrab(mm);
 797			/* but do not pin its memory */
 798			mmput(mm);
 799		}
 800	}
 801
 802	return mm;
 803}
 804
 805static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
 806{
 807	struct mm_struct *mm = proc_mem_open(inode, mode);
 808
 809	if (IS_ERR(mm))
 810		return PTR_ERR(mm);
 811
 812	file->private_data = mm;
 813	return 0;
 814}
 815
 816static int mem_open(struct inode *inode, struct file *file)
 817{
 818	int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
 819
 820	/* OK to pass negative loff_t, we can catch out-of-range */
 821	file->f_mode |= FMODE_UNSIGNED_OFFSET;
 822
 823	return ret;
 824}
 825
 826static ssize_t mem_rw(struct file *file, char __user *buf,
 827			size_t count, loff_t *ppos, int write)
 828{
 829	struct mm_struct *mm = file->private_data;
 830	unsigned long addr = *ppos;
 831	ssize_t copied;
 832	char *page;
 833	unsigned int flags;
 834
 835	if (!mm)
 836		return 0;
 837
 838	page = (char *)__get_free_page(GFP_KERNEL);
 839	if (!page)
 840		return -ENOMEM;
 841
 842	copied = 0;
 843	if (!mmget_not_zero(mm))
 844		goto free;
 845
 846	flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
 847
 848	while (count > 0) {
 849		int this_len = min_t(int, count, PAGE_SIZE);
 850
 851		if (write && copy_from_user(page, buf, this_len)) {
 852			copied = -EFAULT;
 853			break;
 854		}
 855
 856		this_len = access_remote_vm(mm, addr, page, this_len, flags);
 857		if (!this_len) {
 858			if (!copied)
 859				copied = -EIO;
 860			break;
 861		}
 862
 863		if (!write && copy_to_user(buf, page, this_len)) {
 864			copied = -EFAULT;
 865			break;
 866		}
 867
 868		buf += this_len;
 869		addr += this_len;
 870		copied += this_len;
 871		count -= this_len;
 872	}
 873	*ppos = addr;
 874
 875	mmput(mm);
 876free:
 877	free_page((unsigned long) page);
 878	return copied;
 879}
 880
/* ->read() handler of the mem file: mem_rw() in read mode (write == 0). */
static ssize_t mem_read(struct file *file, char __user *buf,
			size_t count, loff_t *ppos)
{
	return mem_rw(file, buf, count, ppos, 0);
}
 886
/*
 * ->write() handler of the mem file: mem_rw() in write mode (write == 1).
 * The const qualifier is cast away only because mem_rw() shares one
 * buffer parameter for both directions.
 */
static ssize_t mem_write(struct file *file, const char __user *buf,
			 size_t count, loff_t *ppos)
{
	return mem_rw(file, (char __user*)buf, count, ppos, 1);
}
 892
 893loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 894{
 895	switch (orig) {
 896	case 0:
 897		file->f_pos = offset;
 898		break;
 899	case 1:
 900		file->f_pos += offset;
 901		break;
 902	default:
 903		return -EINVAL;
 904	}
 905	force_successful_syscall_return();
 906	return file->f_pos;
 907}
 908
 909static int mem_release(struct inode *inode, struct file *file)
 910{
 911	struct mm_struct *mm = file->private_data;
 912	if (mm)
 913		mmdrop(mm);
 914	return 0;
 915}
 916
 917static const struct file_operations proc_mem_operations = {
 918	.llseek		= mem_lseek,
 919	.read		= mem_read,
 920	.write		= mem_write,
 921	.open		= mem_open,
 922	.release	= mem_release,
 923};
 924
/*
 * ->open() handler of the environ file: stores the task's mm in the file,
 * requiring ptrace read access.  proc_mem_open() adds the FSCREDS flag.
 */
static int environ_open(struct inode *inode, struct file *file)
{
	return __mem_open(inode, file, PTRACE_MODE_READ);
}
 929
 930static ssize_t environ_read(struct file *file, char __user *buf,
 931			size_t count, loff_t *ppos)
 932{
 933	char *page;
 934	unsigned long src = *ppos;
 935	int ret = 0;
 936	struct mm_struct *mm = file->private_data;
 937	unsigned long env_start, env_end;
 938
 939	/* Ensure the process spawned far enough to have an environment. */
 940	if (!mm || !mm->env_end)
 941		return 0;
 942
 943	page = (char *)__get_free_page(GFP_KERNEL);
 944	if (!page)
 945		return -ENOMEM;
 946
 947	ret = 0;
 948	if (!mmget_not_zero(mm))
 949		goto free;
 950
 951	spin_lock(&mm->arg_lock);
 952	env_start = mm->env_start;
 953	env_end = mm->env_end;
 954	spin_unlock(&mm->arg_lock);
 955
 956	while (count > 0) {
 957		size_t this_len, max_len;
 958		int retval;
 959
 960		if (src >= (env_end - env_start))
 961			break;
 962
 963		this_len = env_end - (env_start + src);
 964
 965		max_len = min_t(size_t, PAGE_SIZE, count);
 966		this_len = min(max_len, this_len);
 967
 968		retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);
 969
 970		if (retval <= 0) {
 971			ret = retval;
 972			break;
 973		}
 974
 975		if (copy_to_user(buf, page, retval)) {
 976			ret = -EFAULT;
 977			break;
 978		}
 979
 980		ret += retval;
 981		src += retval;
 982		buf += retval;
 983		count -= retval;
 984	}
 985	*ppos = src;
 986	mmput(mm);
 987
 988free:
 989	free_page((unsigned long) page);
 990	return ret;
 991}
 992
 993static const struct file_operations proc_environ_operations = {
 994	.open		= environ_open,
 995	.read		= environ_read,
 996	.llseek		= generic_file_llseek,
 997	.release	= mem_release,
 998};
 999
/*
 * ->open() handler of the auxv file: stores the task's mm in the file,
 * requiring ptrace read access with filesystem credentials.
 */
static int auxv_open(struct inode *inode, struct file *file)
{
	return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
}
1004
1005static ssize_t auxv_read(struct file *file, char __user *buf,
1006			size_t count, loff_t *ppos)
1007{
1008	struct mm_struct *mm = file->private_data;
1009	unsigned int nwords = 0;
1010
1011	if (!mm)
1012		return 0;
1013	do {
1014		nwords += 2;
1015	} while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
1016	return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
1017				       nwords * sizeof(mm->saved_auxv[0]));
1018}
1019
1020static const struct file_operations proc_auxv_operations = {
1021	.open		= auxv_open,
1022	.read		= auxv_read,
1023	.llseek		= generic_file_llseek,
1024	.release	= mem_release,
1025};
1026
1027static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
1028			    loff_t *ppos)
1029{
1030	struct task_struct *task = get_proc_task(file_inode(file));
1031	char buffer[PROC_NUMBUF];
1032	int oom_adj = OOM_ADJUST_MIN;
1033	size_t len;
1034
1035	if (!task)
1036		return -ESRCH;
1037	if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
1038		oom_adj = OOM_ADJUST_MAX;
1039	else
1040		oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
1041			  OOM_SCORE_ADJ_MAX;
1042	put_task_struct(task);
1043	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
1044	return simple_read_from_buffer(buf, count, ppos, buffer, len);
1045}
1046
/*
 * Apply a new oom_score_adj to the task behind @file and to every other
 * process that shares its mm.
 *
 * @oom_adj: new value, already on the oom_score_adj scale
 * @legacy:  true when the request came in through the old oom_adj file;
 *           the permission check then compares against the current
 *           oom_score_adj, and oom_score_adj_min is left untouched
 *
 * All updates are serialized by a function-local mutex.
 *
 * Returns 0 on success, -ESRCH if the task is gone, or -EACCES if the
 * value would be lowered below the permitted bound without
 * CAP_SYS_RESOURCE.
 */
static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
{
	static DEFINE_MUTEX(oom_adj_mutex);
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	int err = 0;

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;

	mutex_lock(&oom_adj_mutex);
	if (legacy) {
		/* lowering below the current value is a privileged operation */
		if (oom_adj < task->signal->oom_score_adj &&
				!capable(CAP_SYS_RESOURCE)) {
			err = -EACCES;
			goto err_unlock;
		}
		/*
		 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
		 * /proc/pid/oom_score_adj instead.
		 */
		pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
			  current->comm, task_pid_nr(current), task_pid_nr(task),
			  task_pid_nr(task));
	} else {
		/* going below the recorded minimum is a privileged operation */
		if ((short)oom_adj < task->signal->oom_score_adj_min &&
				!capable(CAP_SYS_RESOURCE)) {
			err = -EACCES;
			goto err_unlock;
		}
	}

	/*
	 * Make sure we will check other processes sharing the mm if this is
	 * not vfork which wants its own oom_score_adj.
	 * pin the mm so it doesn't go away and get reused after task_unlock
	 */
	if (!task->vfork_done) {
		struct task_struct *p = find_lock_task_mm(task);

		if (p) {
			/* more than one user: other processes may share this mm */
			if (atomic_read(&p->mm->mm_users) > 1) {
				mm = p->mm;
				mmgrab(mm);
			}
			task_unlock(p);
		}
	}

	task->signal->oom_score_adj = oom_adj;
	/* only a privileged non-legacy writer also moves the minimum */
	if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
		task->signal->oom_score_adj_min = (short)oom_adj;
	trace_oom_score_adj_update(task);

	if (mm) {
		struct task_struct *p;

		/* propagate the new value to every process sharing the mm */
		rcu_read_lock();
		for_each_process(p) {
			/* the target's own thread group was updated above */
			if (same_thread_group(task, p))
				continue;

			/* do not touch kernel threads or the global init */
			if (p->flags & PF_KTHREAD || is_global_init(p))
				continue;

			task_lock(p);
			if (!p->vfork_done && process_shares_mm(p, mm)) {
				p->signal->oom_score_adj = oom_adj;
				if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
					p->signal->oom_score_adj_min = (short)oom_adj;
			}
			task_unlock(p);
		}
		rcu_read_unlock();
		/* drop the reference taken with mmgrab() above */
		mmdrop(mm);
	}
err_unlock:
	mutex_unlock(&oom_adj_mutex);
	put_task_struct(task);
	return err;
}
1130
1131/*
1132 * /proc/pid/oom_adj exists solely for backwards compatibility with previous
1133 * kernels.  The effective policy is defined by oom_score_adj, which has a
1134 * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
1135 * Values written to oom_adj are simply mapped linearly to oom_score_adj.
1136 * Processes that become oom disabled via oom_adj will still be oom disabled
1137 * with this implementation.
1138 *
1139 * oom_adj cannot be removed since existing userspace binaries use it.
1140 */
1141static ssize_t oom_adj_write(struct file *file, const char __user *buf,
1142			     size_t count, loff_t *ppos)
1143{
1144	char buffer[PROC_NUMBUF];
1145	int oom_adj;
1146	int err;
1147
1148	memset(buffer, 0, sizeof(buffer));
1149	if (count > sizeof(buffer) - 1)
1150		count = sizeof(buffer) - 1;
1151	if (copy_from_user(buffer, buf, count)) {
1152		err = -EFAULT;
1153		goto out;
1154	}
1155
1156	err = kstrtoint(strstrip(buffer), 0, &oom_adj);
1157	if (err)
1158		goto out;
1159	if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
1160	     oom_adj != OOM_DISABLE) {
1161		err = -EINVAL;
1162		goto out;
1163	}
1164
1165	/*
1166	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
1167	 * value is always attainable.
1168	 */
1169	if (oom_adj == OOM_ADJUST_MAX)
1170		oom_adj = OOM_SCORE_ADJ_MAX;
1171	else
1172		oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
1173
1174	err = __set_oom_adj(file, oom_adj, true);
1175out:
1176	return err < 0 ? err : count;
1177}
1178
1179static const struct file_operations proc_oom_adj_operations = {
1180	.read		= oom_adj_read,
1181	.write		= oom_adj_write,
1182	.llseek		= generic_file_llseek,
1183};
1184
1185static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
1186					size_t count, loff_t *ppos)
1187{
1188	struct task_struct *task = get_proc_task(file_inode(file));
1189	char buffer[PROC_NUMBUF];
1190	short oom_score_adj = OOM_SCORE_ADJ_MIN;
1191	size_t len;
1192
1193	if (!task)
1194		return -ESRCH;
1195	oom_score_adj = task->signal->oom_score_adj;
1196	put_task_struct(task);
1197	len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
1198	return simple_read_from_buffer(buf, count, ppos, buffer, len);
1199}
1200
1201static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1202					size_t count, loff_t *ppos)
1203{
1204	char buffer[PROC_NUMBUF];
1205	int oom_score_adj;
1206	int err;
1207
1208	memset(buffer, 0, sizeof(buffer));
1209	if (count > sizeof(buffer) - 1)
1210		count = sizeof(buffer) - 1;
1211	if (copy_from_user(buffer, buf, count)) {
1212		err = -EFAULT;
1213		goto out;
1214	}
1215
1216	err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
1217	if (err)
1218		goto out;
1219	if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
1220			oom_score_adj > OOM_SCORE_ADJ_MAX) {
1221		err = -EINVAL;
1222		goto out;
1223	}
1224
1225	err = __set_oom_adj(file, oom_score_adj, false);
1226out:
1227	return err < 0 ? err : count;
1228}
1229
1230static const struct file_operations proc_oom_score_adj_operations = {
1231	.read		= oom_score_adj_read,
1232	.write		= oom_score_adj_write,
1233	.llseek		= default_llseek,
1234};
1235
1236#ifdef CONFIG_AUDIT
1237#define TMPBUFLEN 11
1238static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
1239				  size_t count, loff_t *ppos)
1240{
1241	struct inode * inode = file_inode(file);
1242	struct task_struct *task = get_proc_task(inode);
1243	ssize_t length;
1244	char tmpbuf[TMPBUFLEN];
1245
1246	if (!task)
1247		return -ESRCH;
1248	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1249			   from_kuid(file->f_cred->user_ns,
1250				     audit_get_loginuid(task)));
1251	put_task_struct(task);
1252	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1253}
1254
1255static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1256				   size_t count, loff_t *ppos)
1257{
1258	struct inode * inode = file_inode(file);
1259	uid_t loginuid;
1260	kuid_t kloginuid;
1261	int rv;
1262
1263	rcu_read_lock();
1264	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
1265		rcu_read_unlock();
1266		return -EPERM;
1267	}
1268	rcu_read_unlock();
1269
1270	if (*ppos != 0) {
1271		/* No partial writes. */
1272		return -EINVAL;
1273	}
1274
1275	rv = kstrtou32_from_user(buf, count, 10, &loginuid);
1276	if (rv < 0)
1277		return rv;
1278
1279	/* is userspace tring to explicitly UNSET the loginuid? */
1280	if (loginuid == AUDIT_UID_UNSET) {
1281		kloginuid = INVALID_UID;
1282	} else {
1283		kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
1284		if (!uid_valid(kloginuid))
1285			return -EINVAL;
1286	}
1287
1288	rv = audit_set_loginuid(kloginuid);
1289	if (rv < 0)
1290		return rv;
1291	return count;
1292}
1293
1294static const struct file_operations proc_loginuid_operations = {
1295	.read		= proc_loginuid_read,
1296	.write		= proc_loginuid_write,
1297	.llseek		= generic_file_llseek,
1298};
1299
1300static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
1301				  size_t count, loff_t *ppos)
1302{
1303	struct inode * inode = file_inode(file);
1304	struct task_struct *task = get_proc_task(inode);
1305	ssize_t length;
1306	char tmpbuf[TMPBUFLEN];
1307
1308	if (!task)
1309		return -ESRCH;
1310	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1311				audit_get_sessionid(task));
1312	put_task_struct(task);
1313	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1314}
1315
1316static const struct file_operations proc_sessionid_operations = {
1317	.read		= proc_sessionid_read,
1318	.llseek		= generic_file_llseek,
1319};
1320#endif
1321
1322#ifdef CONFIG_FAULT_INJECTION
1323static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
1324				      size_t count, loff_t *ppos)
1325{
1326	struct task_struct *task = get_proc_task(file_inode(file));
1327	char buffer[PROC_NUMBUF];
1328	size_t len;
1329	int make_it_fail;
1330
1331	if (!task)
1332		return -ESRCH;
1333	make_it_fail = task->make_it_fail;
1334	put_task_struct(task);
1335
1336	len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
1337
1338	return simple_read_from_buffer(buf, count, ppos, buffer, len);
1339}
1340
1341static ssize_t proc_fault_inject_write(struct file * file,
1342			const char __user * buf, size_t count, loff_t *ppos)
1343{
1344	struct task_struct *task;
1345	char buffer[PROC_NUMBUF];
1346	int make_it_fail;
1347	int rv;
1348
1349	if (!capable(CAP_SYS_RESOURCE))
1350		return -EPERM;
1351	memset(buffer, 0, sizeof(buffer));
1352	if (count > sizeof(buffer) - 1)
1353		count = sizeof(buffer) - 1;
1354	if (copy_from_user(buffer, buf, count))
1355		return -EFAULT;
1356	rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
1357	if (rv < 0)
1358		return rv;
1359	if (make_it_fail < 0 || make_it_fail > 1)
1360		return -EINVAL;
1361
1362	task = get_proc_task(file_inode(file));
1363	if (!task)
1364		return -ESRCH;
1365	task->make_it_fail = make_it_fail;
1366	put_task_struct(task);
1367
1368	return count;
1369}
1370
1371static const struct file_operations proc_fault_inject_operations = {
1372	.read		= proc_fault_inject_read,
1373	.write		= proc_fault_inject_write,
1374	.llseek		= generic_file_llseek,
1375};
1376
1377static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
1378				   size_t count, loff_t *ppos)
1379{
1380	struct task_struct *task;
1381	int err;
1382	unsigned int n;
1383
1384	err = kstrtouint_from_user(buf, count, 0, &n);
1385	if (err)
1386		return err;
1387
1388	task = get_proc_task(file_inode(file));
1389	if (!task)
1390		return -ESRCH;
1391	task->fail_nth = n;
1392	put_task_struct(task);
1393
1394	return count;
1395}
1396
1397static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
1398				  size_t count, loff_t *ppos)
1399{
1400	struct task_struct *task;
1401	char numbuf[PROC_NUMBUF];
1402	ssize_t len;
1403
1404	task = get_proc_task(file_inode(file));
1405	if (!task)
1406		return -ESRCH;
1407	len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth);
1408	put_task_struct(task);
1409	return simple_read_from_buffer(buf, count, ppos, numbuf, len);
1410}
1411
1412static const struct file_operations proc_fail_nth_operations = {
1413	.read		= proc_fail_nth_read,
1414	.write		= proc_fail_nth_write,
1415};
1416#endif
1417
1418
1419#ifdef CONFIG_SCHED_DEBUG
1420/*
1421 * Print out various scheduling related per-task fields:
1422 */
1423static int sched_show(struct seq_file *m, void *v)
1424{
1425	struct inode *inode = m->private;
1426	struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
1427	struct task_struct *p;
1428
1429	p = get_proc_task(inode);
1430	if (!p)
1431		return -ESRCH;
1432	proc_sched_show_task(p, ns, m);
1433
1434	put_task_struct(p);
1435
1436	return 0;
1437}
1438
1439static ssize_t
1440sched_write(struct file *file, const char __user *buf,
1441	    size_t count, loff_t *offset)
1442{
1443	struct inode *inode = file_inode(file);
1444	struct task_struct *p;
1445
1446	p = get_proc_task(inode);
1447	if (!p)
1448		return -ESRCH;
1449	proc_sched_set_task(p);
1450
1451	put_task_struct(p);
1452
1453	return count;
1454}
1455
1456static int sched_open(struct inode *inode, struct file *filp)
1457{
1458	return single_open(filp, sched_show, inode);
1459}
1460
1461static const struct file_operations proc_pid_sched_operations = {
1462	.open		= sched_open,
1463	.read		= seq_read,
1464	.write		= sched_write,
1465	.llseek		= seq_lseek,
1466	.release	= single_release,
1467};
1468
1469#endif
1470
1471#ifdef CONFIG_SCHED_AUTOGROUP
1472/*
1473 * Print out autogroup related information:
1474 */
1475static int sched_autogroup_show(struct seq_file *m, void *v)
1476{
1477	struct inode *inode = m->private;
1478	struct task_struct *p;
1479
1480	p = get_proc_task(inode);
1481	if (!p)
1482		return -ESRCH;
1483	proc_sched_autogroup_show_task(p, m);
1484
1485	put_task_struct(p);
1486
1487	return 0;
1488}
1489
1490static ssize_t
1491sched_autogroup_write(struct file *file, const char __user *buf,
1492	    size_t count, loff_t *offset)
1493{
1494	struct inode *inode = file_inode(file);
1495	struct task_struct *p;
1496	char buffer[PROC_NUMBUF];
1497	int nice;
1498	int err;
1499
1500	memset(buffer, 0, sizeof(buffer));
1501	if (count > sizeof(buffer) - 1)
1502		count = sizeof(buffer) - 1;
1503	if (copy_from_user(buffer, buf, count))
1504		return -EFAULT;
1505
1506	err = kstrtoint(strstrip(buffer), 0, &nice);
1507	if (err < 0)
1508		return err;
1509
1510	p = get_proc_task(inode);
1511	if (!p)
1512		return -ESRCH;
1513
1514	err = proc_sched_autogroup_set_nice(p, nice);
1515	if (err)
1516		count = err;
1517
1518	put_task_struct(p);
1519
1520	return count;
1521}
1522
1523static int sched_autogroup_open(struct inode *inode, struct file *filp)
1524{
1525	int ret;
1526
1527	ret = single_open(filp, sched_autogroup_show, NULL);
1528	if (!ret) {
1529		struct seq_file *m = filp->private_data;
1530
1531		m->private = inode;
1532	}
1533	return ret;
1534}
1535
1536static const struct file_operations proc_pid_sched_autogroup_operations = {
1537	.open		= sched_autogroup_open,
1538	.read		= seq_read,
1539	.write		= sched_autogroup_write,
1540	.llseek		= seq_lseek,
1541	.release	= single_release,
1542};
1543
1544#endif /* CONFIG_SCHED_AUTOGROUP */
1545
1546#ifdef CONFIG_TIME_NS
1547static int timens_offsets_show(struct seq_file *m, void *v)
1548{
1549	struct task_struct *p;
1550
1551	p = get_proc_task(file_inode(m->file));
1552	if (!p)
1553		return -ESRCH;
1554	proc_timens_show_offsets(p, m);
1555
1556	put_task_struct(p);
1557
1558	return 0;
1559}
1560
1561static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
1562				    size_t count, loff_t *ppos)
1563{
1564	struct inode *inode = file_inode(file);
1565	struct proc_timens_offset offsets[2];
1566	char *kbuf = NULL, *pos, *next_line;
1567	struct task_struct *p;
1568	int ret, noffsets;
1569
1570	/* Only allow < page size writes at the beginning of the file */
1571	if ((*ppos != 0) || (count >= PAGE_SIZE))
1572		return -EINVAL;
1573
1574	/* Slurp in the user data */
1575	kbuf = memdup_user_nul(buf, count);
1576	if (IS_ERR(kbuf))
1577		return PTR_ERR(kbuf);
1578
1579	/* Parse the user data */
1580	ret = -EINVAL;
1581	noffsets = 0;
1582	for (pos = kbuf; pos; pos = next_line) {
1583		struct proc_timens_offset *off = &offsets[noffsets];
1584		char clock[10];
1585		int err;
1586
1587		/* Find the end of line and ensure we don't look past it */
1588		next_line = strchr(pos, '\n');
1589		if (next_line) {
1590			*next_line = '\0';
1591			next_line++;
1592			if (*next_line == '\0')
1593				next_line = NULL;
1594		}
1595
1596		err = sscanf(pos, "%9s %lld %lu", clock,
1597				&off->val.tv_sec, &off->val.tv_nsec);
1598		if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
1599			goto out;
1600
1601		clock[sizeof(clock) - 1] = 0;
1602		if (strcmp(clock, "monotonic") == 0 ||
1603		    strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
1604			off->clockid = CLOCK_MONOTONIC;
1605		else if (strcmp(clock, "boottime") == 0 ||
1606			 strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
1607			off->clockid = CLOCK_BOOTTIME;
1608		else
1609			goto out;
1610
1611		noffsets++;
1612		if (noffsets == ARRAY_SIZE(offsets)) {
1613			if (next_line)
1614				count = next_line - kbuf;
1615			break;
1616		}
1617	}
1618
1619	ret = -ESRCH;
1620	p = get_proc_task(inode);
1621	if (!p)
1622		goto out;
1623	ret = proc_timens_set_offset(file, p, offsets, noffsets);
1624	put_task_struct(p);
1625	if (ret)
1626		goto out;
1627
1628	ret = count;
1629out:
1630	kfree(kbuf);
1631	return ret;
1632}
1633
1634static int timens_offsets_open(struct inode *inode, struct file *filp)
1635{
1636	return single_open(filp, timens_offsets_show, inode);
1637}
1638
1639static const struct file_operations proc_timens_offsets_operations = {
1640	.open		= timens_offsets_open,
1641	.read		= seq_read,
1642	.write		= timens_offsets_write,
1643	.llseek		= seq_lseek,
1644	.release	= single_release,
1645};
1646#endif /* CONFIG_TIME_NS */
1647
1648static ssize_t comm_write(struct file *file, const char __user *buf,
1649				size_t count, loff_t *offset)
1650{
1651	struct inode *inode = file_inode(file);
1652	struct task_struct *p;
1653	char buffer[TASK_COMM_LEN];
1654	const size_t maxlen = sizeof(buffer) - 1;
1655
1656	memset(buffer, 0, sizeof(buffer));
1657	if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
1658		return -EFAULT;
1659
1660	p = get_proc_task(inode);
1661	if (!p)
1662		return -ESRCH;
1663
1664	if (same_thread_group(current, p))
1665		set_task_comm(p, buffer);
1666	else
1667		count = -EINVAL;
1668
1669	put_task_struct(p);
1670
1671	return count;
1672}
1673
1674static int comm_show(struct seq_file *m, void *v)
1675{
1676	struct inode *inode = m->private;
1677	struct task_struct *p;
1678
1679	p = get_proc_task(inode);
1680	if (!p)
1681		return -ESRCH;
1682
1683	proc_task_name(m, p, false);
1684	seq_putc(m, '\n');
1685
1686	put_task_struct(p);
1687
1688	return 0;
1689}
1690
1691static int comm_open(struct inode *inode, struct file *filp)
1692{
1693	return single_open(filp, comm_show, inode);
1694}
1695
1696static const struct file_operations proc_pid_set_comm_operations = {
1697	.open		= comm_open,
1698	.read		= seq_read,
1699	.write		= comm_write,
1700	.llseek		= seq_lseek,
1701	.release	= single_release,
1702};
1703
1704static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
1705{
1706	struct task_struct *task;
1707	struct file *exe_file;
1708
1709	task = get_proc_task(d_inode(dentry));
1710	if (!task)
1711		return -ENOENT;
1712	exe_file = get_task_exe_file(task);
1713	put_task_struct(task);
1714	if (exe_file) {
1715		*exe_path = exe_file->f_path;
1716		path_get(&exe_file->f_path);
1717		fput(exe_file);
1718		return 0;
1719	} else
1720		return -ENOENT;
1721}
1722
1723static const char *proc_pid_get_link(struct dentry *dentry,
1724				     struct inode *inode,
1725				     struct delayed_call *done)
1726{
1727	struct path path;
1728	int error = -EACCES;
1729
1730	if (!dentry)
1731		return ERR_PTR(-ECHILD);
1732
1733	/* Are we allowed to snoop on the tasks file descriptors? */
1734	if (!proc_fd_access_allowed(inode))
1735		goto out;
1736
1737	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
1738	if (error)
1739		goto out;
1740
1741	error = nd_jump_link(&path);
1742out:
1743	return ERR_PTR(error);
1744}
1745
1746static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1747{
1748	char *tmp = (char *)__get_free_page(GFP_KERNEL);
1749	char *pathname;
1750	int len;
1751
1752	if (!tmp)
1753		return -ENOMEM;
1754
1755	pathname = d_path(path, tmp, PAGE_SIZE);
1756	len = PTR_ERR(pathname);
1757	if (IS_ERR(pathname))
1758		goto out;
1759	len = tmp + PAGE_SIZE - 1 - pathname;
1760
1761	if (len > buflen)
1762		len = buflen;
1763	if (copy_to_user(buffer, pathname, len))
1764		len = -EFAULT;
1765 out:
1766	free_page((unsigned long)tmp);
1767	return len;
1768}
1769
1770static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
1771{
1772	int error = -EACCES;
1773	struct inode *inode = d_inode(dentry);
1774	struct path path;
1775
1776	/* Are we allowed to snoop on the tasks file descriptors? */
1777	if (!proc_fd_access_allowed(inode))
1778		goto out;
1779
1780	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
1781	if (error)
1782		goto out;
1783
1784	error = do_proc_readlink(&path, buffer, buflen);
1785	path_put(&path);
1786out:
1787	return error;
1788}
1789
1790const struct inode_operations proc_pid_link_inode_operations = {
1791	.readlink	= proc_pid_readlink,
1792	.get_link	= proc_pid_get_link,
1793	.setattr	= proc_setattr,
1794};
1795
1796
1797/* building an inode */
1798
1799void task_dump_owner(struct task_struct *task, umode_t mode,
1800		     kuid_t *ruid, kgid_t *rgid)
1801{
1802	/* Depending on the state of dumpable compute who should own a
1803	 * proc file for a task.
1804	 */
1805	const struct cred *cred;
1806	kuid_t uid;
1807	kgid_t gid;
1808
1809	if (unlikely(task->flags & PF_KTHREAD)) {
1810		*ruid = GLOBAL_ROOT_UID;
1811		*rgid = GLOBAL_ROOT_GID;
1812		return;
1813	}
1814
1815	/* Default to the tasks effective ownership */
1816	rcu_read_lock();
1817	cred = __task_cred(task);
1818	uid = cred->euid;
1819	gid = cred->egid;
1820	rcu_read_unlock();
1821
1822	/*
1823	 * Before the /proc/pid/status file was created the only way to read
1824	 * the effective uid of a /process was to stat /proc/pid.  Reading
1825	 * /proc/pid/status is slow enough that procps and other packages
1826	 * kept stating /proc/pid.  To keep the rules in /proc simple I have
1827	 * made this apply to all per process world readable and executable
1828	 * directories.
1829	 */
1830	if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
1831		struct mm_struct *mm;
1832		task_lock(task);
1833		mm = task->mm;
1834		/* Make non-dumpable tasks owned by some root */
1835		if (mm) {
1836			if (get_dumpable(mm) != SUID_DUMP_USER) {
1837				struct user_namespace *user_ns = mm->user_ns;
1838
1839				uid = make_kuid(user_ns, 0);
1840				if (!uid_valid(uid))
1841					uid = GLOBAL_ROOT_UID;
1842
1843				gid = make_kgid(user_ns, 0);
1844				if (!gid_valid(gid))
1845					gid = GLOBAL_ROOT_GID;
1846			}
1847		} else {
1848			uid = GLOBAL_ROOT_UID;
1849			gid = GLOBAL_ROOT_GID;
1850		}
1851		task_unlock(task);
1852	}
1853	*ruid = uid;
1854	*rgid = gid;
1855}
1856
1857void proc_pid_evict_inode(struct proc_inode *ei)
1858{
1859	struct pid *pid = ei->pid;
1860
1861	if (S_ISDIR(ei->vfs_inode.i_mode)) {
1862		spin_lock(&pid->lock);
1863		hlist_del_init_rcu(&ei->sibling_inodes);
1864		spin_unlock(&pid->lock);
1865	}
1866
1867	put_pid(pid);
1868}
1869
1870struct inode *proc_pid_make_inode(struct super_block * sb,
1871				  struct task_struct *task, umode_t mode)
1872{
1873	struct inode * inode;
1874	struct proc_inode *ei;
1875	struct pid *pid;
1876
1877	/* We need a new inode */
1878
1879	inode = new_inode(sb);
1880	if (!inode)
1881		goto out;
1882
1883	/* Common stuff */
1884	ei = PROC_I(inode);
1885	inode->i_mode = mode;
1886	inode->i_ino = get_next_ino();
1887	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
1888	inode->i_op = &proc_def_inode_operations;
1889
1890	/*
1891	 * grab the reference to task.
1892	 */
1893	pid = get_task_pid(task, PIDTYPE_PID);
1894	if (!pid)
1895		goto out_unlock;
1896
1897	/* Let the pid remember us for quick removal */
1898	ei->pid = pid;
1899	if (S_ISDIR(mode)) {
1900		spin_lock(&pid->lock);
1901		hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
1902		spin_unlock(&pid->lock);
1903	}
1904
1905	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1906	security_task_to_inode(task, inode);
1907
1908out:
1909	return inode;
1910
1911out_unlock:
1912	iput(inode);
1913	return NULL;
1914}
1915
1916int pid_getattr(const struct path *path, struct kstat *stat,
1917		u32 request_mask, unsigned int query_flags)
1918{
1919	struct inode *inode = d_inode(path->dentry);
1920	struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
1921	struct task_struct *task;
1922
1923	generic_fillattr(inode, stat);
1924
1925	stat->uid = GLOBAL_ROOT_UID;
1926	stat->gid = GLOBAL_ROOT_GID;
1927	rcu_read_lock();
1928	task = pid_task(proc_pid(inode), PIDTYPE_PID);
1929	if (task) {
1930		if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) {
1931			rcu_read_unlock();
1932			/*
1933			 * This doesn't prevent learning whether PID exists,
1934			 * it only makes getattr() consistent with readdir().
1935			 */
1936			return -ENOENT;
1937		}
1938		task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
1939	}
1940	rcu_read_unlock();
1941	return 0;
1942}
1943
1944/* dentry stuff */
1945
1946/*
1947 * Set <pid>/... inode ownership (can change due to setuid(), etc.)
1948 */
1949void pid_update_inode(struct task_struct *task, struct inode *inode)
1950{
1951	task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
1952
1953	inode->i_mode &= ~(S_ISUID | S_ISGID);
1954	security_task_to_inode(task, inode);
1955}
1956
1957/*
1958 * Rewrite the inode's ownerships here because the owning task may have
1959 * performed a setuid(), etc.
1960 *
1961 */
1962static int pid_revalidate(struct dentry *dentry, unsigned int flags)
1963{
1964	struct inode *inode;
1965	struct task_struct *task;
1966
1967	if (flags & LOOKUP_RCU)
1968		return -ECHILD;
1969
1970	inode = d_inode(dentry);
1971	task = get_proc_task(inode);
1972
1973	if (task) {
1974		pid_update_inode(task, inode);
1975		put_task_struct(task);
1976		return 1;
1977	}
1978	return 0;
1979}
1980
1981static inline bool proc_inode_is_dead(struct inode *inode)
1982{
1983	return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
1984}
1985
/*
 * ->d_delete(): is the task we represent dead?  If so, a non-zero return
 * asks for the dentry to be killed immediately instead of being put on the
 * lru list.
 */
int pid_delete_dentry(const struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	return proc_inode_is_dead(inode);
}
1994
1995const struct dentry_operations pid_dentry_operations =
1996{
1997	.d_revalidate	= pid_revalidate,
1998	.d_delete	= pid_delete_dentry,
1999};
2000
2001/* Lookups */
2002
2003/*
2004 * Fill a directory entry.
2005 *
2006 * If possible create the dcache entry and derive our inode number and
2007 * file type from dcache entry.
2008 *
2009 * Since all of the proc inode numbers are dynamically generated, the inode
2010 * numbers do not exist until the inode is cache.  This means creating the
2011 * the dcache entry in readdir is necessary to keep the inode numbers
2012 * reported by readdir in sync with the inode numbers reported
2013 * by stat.
2014 */
2015bool proc_fill_cache(struct file *file, struct dir_context *ctx,
2016	const char *name, unsigned int len,
2017	instantiate_t instantiate, struct task_struct *task, const void *ptr)
2018{
2019	struct dentry *child, *dir = file->f_path.dentry;
2020	struct qstr qname = QSTR_INIT(name, len);
2021	struct inode *inode;
2022	unsigned type = DT_UNKNOWN;
2023	ino_t ino = 1;
2024
2025	child = d_hash_and_lookup(dir, &qname);
2026	if (!child) {
2027		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
2028		child = d_alloc_parallel(dir, &qname, &wq);
2029		if (IS_ERR(child))
2030			goto end_instantiate;
2031		if (d_in_lookup(child)) {
2032			struct dentry *res;
2033			res = instantiate(child, task, ptr);
2034			d_lookup_done(child);
2035			if (unlikely(res)) {
2036				dput(child);
2037				child = res;
2038				if (IS_ERR(child))
2039					goto end_instantiate;
2040			}
2041		}
2042	}
2043	inode = d_inode(child);
2044	ino = inode->i_ino;
2045	type = inode->i_mode >> 12;
2046	dput(child);
2047end_instantiate:
2048	return dir_emit(ctx, name, len, ino, type);
2049}
2050
2051/*
2052 * dname_to_vma_addr - maps a dentry name into two unsigned longs
2053 * which represent vma start and end addresses.
2054 */
2055static int dname_to_vma_addr(struct dentry *dentry,
2056			     unsigned long *start, unsigned long *end)
2057{
2058	const char *str = dentry->d_name.name;
2059	unsigned long long sval, eval;
2060	unsigned int len;
2061
2062	if (str[0] == '0' && str[1] != '-')
2063		return -EINVAL;
2064	len = _parse_integer(str, 16, &sval);
2065	if (len & KSTRTOX_OVERFLOW)
2066		return -EINVAL;
2067	if (sval != (unsigned long)sval)
2068		return -EINVAL;
2069	str += len;
2070
2071	if (*str != '-')
2072		return -EINVAL;
2073	str++;
2074
2075	if (str[0] == '0' && str[1])
2076		return -EINVAL;
2077	len = _parse_integer(str, 16, &eval);
2078	if (len & KSTRTOX_OVERFLOW)
2079		return -EINVAL;
2080	if (eval != (unsigned long)eval)
2081		return -EINVAL;
2082	str += len;
2083
2084	if (*str != '\0')
2085		return -EINVAL;
2086
2087	*start = sval;
2088	*end = eval;
2089
2090	return 0;
2091}
2092
2093static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
2094{
2095	unsigned long vm_start, vm_end;
2096	bool exact_vma_exists = false;
2097	struct mm_struct *mm = NULL;
2098	struct task_struct *task;
2099	struct inode *inode;
2100	int status = 0;
2101
2102	if (flags & LOOKUP_RCU)
2103		return -ECHILD;
2104
2105	inode = d_inode(dentry);
2106	task = get_proc_task(inode);
2107	if (!task)
2108		goto out_notask;
2109
2110	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
2111	if (IS_ERR_OR_NULL(mm))
2112		goto out;
2113
2114	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
2115		status = mmap_read_lock_killable(mm);
2116		if (!status) {
2117			exact_vma_exists = !!find_exact_vma(mm, vm_start,
2118							    vm_end);
2119			mmap_read_unlock(mm);
2120		}
2121	}
2122
2123	mmput(mm);
2124
2125	if (exact_vma_exists) {
2126		task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
2127
2128		security_task_to_inode(task, inode);
2129		status = 1;
2130	}
2131
2132out:
2133	put_task_struct(task);
2134
2135out_notask:
2136	return status;
2137}
2138
2139static const struct dentry_operations tid_map_files_dentry_operations = {
2140	.d_revalidate	= map_files_d_revalidate,
2141	.d_delete	= pid_delete_dentry,
2142};
2143
2144static int map_files_get_link(struct dentry *dentry, struct path *path)
2145{
2146	unsigned long vm_start, vm_end;
2147	struct vm_area_struct *vma;
2148	struct task_struct *task;
2149	struct mm_struct *mm;
2150	int rc;
2151
2152	rc = -ENOENT;
2153	task = get_proc_task(d_inode(dentry));
2154	if (!task)
2155		goto out;
2156
2157	mm = get_task_mm(task);
2158	put_task_struct(task);
2159	if (!mm)
2160		goto out;
2161
2162	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
2163	if (rc)
2164		goto out_mmput;
2165
2166	rc = mmap_read_lock_killable(mm);
2167	if (rc)
2168		goto out_mmput;
2169
2170	rc = -ENOENT;
2171	vma = find_exact_vma(mm, vm_start, vm_end);
2172	if (vma && vma->vm_file) {
2173		*path = vma->vm_file->f_path;
2174		path_get(path);
2175		rc = 0;
2176	}
2177	mmap_read_unlock(mm);
2178
2179out_mmput:
2180	mmput(mm);
2181out:
2182	return rc;
2183}
2184
2185struct map_files_info {
2186	unsigned long	start;
2187	unsigned long	end;
2188	fmode_t		mode;
2189};
2190
2191/*
2192 * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
2193 * symlinks may be used to bypass permissions on ancestor directories in the
2194 * path to the file in question.
2195 */
2196static const char *
2197proc_map_files_get_link(struct dentry *dentry,
2198			struct inode *inode,
2199		        struct delayed_call *done)
2200{
2201	if (!capable(CAP_SYS_ADMIN))
2202		return ERR_PTR(-EPERM);
2203
2204	return proc_pid_get_link(dentry, inode, done);
2205}
2206
2207/*
2208 * Identical to proc_pid_link_inode_operations except for get_link()
2209 */
2210static const struct inode_operations proc_map_files_link_inode_operations = {
2211	.readlink	= proc_pid_readlink,
2212	.get_link	= proc_map_files_get_link,
2213	.setattr	= proc_setattr,
2214};
2215
2216static struct dentry *
2217proc_map_files_instantiate(struct dentry *dentry,
2218			   struct task_struct *task, const void *ptr)
2219{
2220	fmode_t mode = (fmode_t)(unsigned long)ptr;
2221	struct proc_inode *ei;
2222	struct inode *inode;
2223
2224	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
2225				    ((mode & FMODE_READ ) ? S_IRUSR : 0) |
2226				    ((mode & FMODE_WRITE) ? S_IWUSR : 0));
2227	if (!inode)
2228		return ERR_PTR(-ENOENT);
2229
2230	ei = PROC_I(inode);
2231	ei->op.proc_get_link = map_files_get_link;
2232
2233	inode->i_op = &proc_map_files_link_inode_operations;
2234	inode->i_size = 64;
2235
2236	d_set_d_op(dentry, &tid_map_files_dentry_operations);
2237	return d_splice_alias(inode, dentry);
2238}
2239
2240static struct dentry *proc_map_files_lookup(struct inode *dir,
2241		struct dentry *dentry, unsigned int flags)
2242{
2243	unsigned long vm_start, vm_end;
2244	struct vm_area_struct *vma;
2245	struct task_struct *task;
2246	struct dentry *result;
2247	struct mm_struct *mm;
2248
2249	result = ERR_PTR(-ENOENT);
2250	task = get_proc_task(dir);
2251	if (!task)
2252		goto out;
2253
2254	result = ERR_PTR(-EACCES);
2255	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
2256		goto out_put_task;
2257
2258	result = ERR_PTR(-ENOENT);
2259	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
2260		goto out_put_task;
2261
2262	mm = get_task_mm(task);
2263	if (!mm)
2264		goto out_put_task;
2265
2266	result = ERR_PTR(-EINTR);
2267	if (mmap_read_lock_killable(mm))
2268		goto out_put_mm;
2269
2270	result = ERR_PTR(-ENOENT);
2271	vma = find_exact_vma(mm, vm_start, vm_end);
2272	if (!vma)
2273		goto out_no_vma;
2274
2275	if (vma->vm_file)
2276		result = proc_map_files_instantiate(dentry, task,
2277				(void *)(unsigned long)vma->vm_file->f_mode);
2278
2279out_no_vma:
2280	mmap_read_unlock(mm);
2281out_put_mm:
2282	mmput(mm);
2283out_put_task:
2284	put_task_struct(task);
2285out:
2286	return result;
2287}
2288
2289static const struct inode_operations proc_map_files_inode_operations = {
2290	.lookup		= proc_map_files_lookup,
2291	.permission	= proc_fd_permission,
2292	.setattr	= proc_setattr,
2293};
2294
2295static int
2296proc_map_files_readdir(struct file *file, struct dir_context *ctx)
2297{
2298	struct vm_area_struct *vma;
2299	struct task_struct *task;
2300	struct mm_struct *mm;
2301	unsigned long nr_files, pos, i;
2302	GENRADIX(struct map_files_info) fa;
2303	struct map_files_info *p;
2304	int ret;
2305
2306	genradix_init(&fa);
2307
2308	ret = -ENOENT;
2309	task = get_proc_task(file_inode(file));
2310	if (!task)
2311		goto out;
2312
2313	ret = -EACCES;
2314	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
2315		goto out_put_task;
2316
2317	ret = 0;
2318	if (!dir_emit_dots(file, ctx))
2319		goto out_put_task;
2320
2321	mm = get_task_mm(task);
2322	if (!mm)
2323		goto out_put_task;
2324
2325	ret = mmap_read_lock_killable(mm);
2326	if (ret) {
2327		mmput(mm);
2328		goto out_put_task;
2329	}
2330
2331	nr_files = 0;
2332
2333	/*
2334	 * We need two passes here:
2335	 *
2336	 *  1) Collect vmas of mapped files with mmap_lock taken
2337	 *  2) Release mmap_lock and instantiate entries
2338	 *
2339	 * otherwise we get lockdep complained, since filldir()
2340	 * routine might require mmap_lock taken in might_fault().
2341	 */
2342
2343	for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
2344		if (!vma->vm_file)
2345			continue;
2346		if (++pos <= ctx->pos)
2347			continue;
2348
2349		p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
2350		if (!p) {
2351			ret = -ENOMEM;
2352			mmap_read_unlock(mm);
2353			mmput(mm);
2354			goto out_put_task;
2355		}
2356
2357		p->start = vma->vm_start;
2358		p->end = vma->vm_end;
2359		p->mode = vma->vm_file->f_mode;
2360	}
2361	mmap_read_unlock(mm);
2362	mmput(mm);
2363
2364	for (i = 0; i < nr_files; i++) {
2365		char buf[4 * sizeof(long) + 2];	/* max: %lx-%lx\0 */
2366		unsigned int len;
2367
2368		p = genradix_ptr(&fa, i);
2369		len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
2370		if (!proc_fill_cache(file, ctx,
2371				      buf, len,
2372				      proc_map_files_instantiate,
2373				      task,
2374				      (void *)(unsigned long)p->mode))
2375			break;
2376		ctx->pos++;
2377	}
2378
2379out_put_task:
2380	put_task_struct(task);
2381out:
2382	genradix_free(&fa);
2383	return ret;
2384}
2385
2386static const struct file_operations proc_map_files_operations = {
2387	.read		= generic_read_dir,
2388	.iterate_shared	= proc_map_files_readdir,
2389	.llseek		= generic_file_llseek,
2390};
2391
2392#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
/*
 * Per-open state of the "timers" seq_file.
 *
 * pid and ns are filled in by proc_timers_open(); task, sighand and
 * flags are only valid between timers_start() and timers_stop().
 */
struct timers_private {
	struct pid *pid;		/* from proc_pid(inode) */
	struct task_struct *task;	/* reference taken in timers_start() */
	struct sighand_struct *sighand;	/* result of lock_task_sighand() */
	struct pid_namespace *ns;	/* from proc_pid_ns(inode->i_sb) */
	unsigned long flags;		/* flags word for lock/unlock_task_sighand() */
};
2400
2401static void *timers_start(struct seq_file *m, loff_t *pos)
2402{
2403	struct timers_private *tp = m->private;
2404
2405	tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
2406	if (!tp->task)
2407		return ERR_PTR(-ESRCH);
2408
2409	tp->sighand = lock_task_sighand(tp->task, &tp->flags);
2410	if (!tp->sighand)
2411		return ERR_PTR(-ESRCH);
2412
2413	return seq_list_start(&tp->task->signal->posix_timers, *pos);
2414}
2415
2416static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
2417{
2418	struct timers_private *tp = m->private;
2419	return seq_list_next(v, &tp->task->signal->posix_timers, pos);
2420}
2421
2422static void timers_stop(struct seq_file *m, void *v)
2423{
2424	struct timers_private *tp = m->private;
2425
2426	if (tp->sighand) {
2427		unlock_task_sighand(tp->task, &tp->flags);
2428		tp->sighand = NULL;
2429	}
2430
2431	if (tp->task) {
2432		put_task_struct(tp->task);
2433		tp->task = NULL;
2434	}
2435}
2436
2437static int show_timer(struct seq_file *m, void *v)
2438{
2439	struct k_itimer *timer;
2440	struct timers_private *tp = m->private;
2441	int notify;
2442	static const char * const nstr[] = {
2443		[SIGEV_SIGNAL] = "signal",
2444		[SIGEV_NONE] = "none",
2445		[SIGEV_THREAD] = "thread",
2446	};
2447
2448	timer = list_entry((struct list_head *)v, struct k_itimer, list);
2449	notify = timer->it_sigev_notify;
2450
2451	seq_printf(m, "ID: %d\n", timer->it_id);
2452	seq_printf(m, "signal: %d/%px\n",
2453		   timer->sigq->info.si_signo,
2454		   timer->sigq->info.si_value.sival_ptr);
2455	seq_printf(m, "notify: %s/%s.%d\n",
2456		   nstr[notify & ~SIGEV_THREAD_ID],
2457		   (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
2458		   pid_nr_ns(timer->it_pid, tp->ns));
2459	seq_printf(m, "ClockID: %d\n", timer->it_clock);
2460
2461	return 0;
2462}
2463
2464static const struct seq_operations proc_timers_seq_ops = {
2465	.start	= timers_start,
2466	.next	= timers_next,
2467	.stop	= timers_stop,
2468	.show	= show_timer,
2469};
2470
2471static int proc_timers_open(struct inode *inode, struct file *file)
2472{
2473	struct timers_private *tp;
2474
2475	tp = __seq_open_private(file, &proc_timers_seq_ops,
2476			sizeof(struct timers_private));
2477	if (!tp)
2478		return -ENOMEM;
2479
2480	tp->pid = proc_pid(inode);
2481	tp->ns = proc_pid_ns(inode->i_sb);
2482	return 0;
2483}
2484
2485static const struct file_operations proc_timers_operations = {
2486	.open		= proc_timers_open,
2487	.read		= seq_read,
2488	.llseek		= seq_lseek,
2489	.release	= seq_release_private,
2490};
2491#endif
2492
2493static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
2494					size_t count, loff_t *offset)
2495{
2496	struct inode *inode = file_inode(file);
2497	struct task_struct *p;
2498	u64 slack_ns;
2499	int err;
2500
2501	err = kstrtoull_from_user(buf, count, 10, &slack_ns);
2502	if (err < 0)
2503		return err;
2504
2505	p = get_proc_task(inode);
2506	if (!p)
2507		return -ESRCH;
2508
2509	if (p != current) {
2510		rcu_read_lock();
2511		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
2512			rcu_read_unlock();
2513			count = -EPERM;
2514			goto out;
2515		}
2516		rcu_read_unlock();
2517
2518		err = security_task_setscheduler(p);
2519		if (err) {
2520			count = err;
2521			goto out;
2522		}
2523	}
2524
2525	task_lock(p);
2526	if (slack_ns == 0)
2527		p->timer_slack_ns = p->default_timer_slack_ns;
2528	else
2529		p->timer_slack_ns = slack_ns;
2530	task_unlock(p);
2531
2532out:
2533	put_task_struct(p);
2534
2535	return count;
2536}
2537
2538static int timerslack_ns_show(struct seq_file *m, void *v)
2539{
2540	struct inode *inode = m->private;
2541	struct task_struct *p;
2542	int err = 0;
2543
2544	p = get_proc_task(inode);
2545	if (!p)
2546		return -ESRCH;
2547
2548	if (p != current) {
2549		rcu_read_lock();
2550		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
2551			rcu_read_unlock();
2552			err = -EPERM;
2553			goto out;
2554		}
2555		rcu_read_unlock();
2556
2557		err = security_task_getscheduler(p);
2558		if (err)
2559			goto out;
2560	}
2561
2562	task_lock(p);
2563	seq_printf(m, "%llu\n", p->timer_slack_ns);
2564	task_unlock(p);
2565
2566out:
2567	put_task_struct(p);
2568
2569	return err;
2570}
2571
/*
 * ->open for "timerslack_ns": single_open() with the inode as private
 * data, which timerslack_ns_show() reads back from m->private.
 */
static int timerslack_ns_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, timerslack_ns_show, inode);
}
2576
2577static const struct file_operations proc_pid_set_timerslack_ns_operations = {
2578	.open		= timerslack_ns_open,
2579	.read		= seq_read,
2580	.write		= timerslack_ns_write,
2581	.llseek		= seq_lseek,
2582	.release	= single_release,
2583};
2584
2585static struct dentry *proc_pident_instantiate(struct dentry *dentry,
2586	struct task_struct *task, const void *ptr)
2587{
2588	const struct pid_entry *p = ptr;
2589	struct inode *inode;
2590	struct proc_inode *ei;
2591
2592	inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
2593	if (!inode)
2594		return ERR_PTR(-ENOENT);
2595
2596	ei = PROC_I(inode);
2597	if (S_ISDIR(inode->i_mode))
2598		set_nlink(inode, 2);	/* Use getattr to fix if necessary */
2599	if (p->iop)
2600		inode->i_op = p->iop;
2601	if (p->fop)
2602		inode->i_fop = p->fop;
2603	ei->op = p->op;
2604	pid_update_inode(task, inode);
2605	d_set_d_op(dentry, &pid_dentry_operations);
2606	return d_splice_alias(inode, dentry);
2607}
2608
2609static struct dentry *proc_pident_lookup(struct inode *dir, 
2610					 struct dentry *dentry,
2611					 const struct pid_entry *p,
2612					 const struct pid_entry *end)
2613{
2614	struct task_struct *task = get_proc_task(dir);
2615	struct dentry *res = ERR_PTR(-ENOENT);
2616
2617	if (!task)
2618		goto out_no_task;
2619
2620	/*
2621	 * Yes, it does not scale. And it should not. Don't add
2622	 * new entries into /proc/<tgid>/ without very good reasons.
2623	 */
2624	for (; p < end; p++) {
2625		if (p->len != dentry->d_name.len)
2626			continue;
2627		if (!memcmp(dentry->d_name.name, p->name, p->len)) {
2628			res = proc_pident_instantiate(dentry, task, p);
2629			break;
2630		}
2631	}
2632	put_task_struct(task);
2633out_no_task:
2634	return res;
2635}
2636
2637static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
2638		const struct pid_entry *ents, unsigned int nents)
2639{
2640	struct task_struct *task = get_proc_task(file_inode(file));
2641	const struct pid_entry *p;
2642
2643	if (!task)
2644		return -ENOENT;
2645
2646	if (!dir_emit_dots(file, ctx))
2647		goto out;
2648
2649	if (ctx->pos >= nents + 2)
2650		goto out;
2651
2652	for (p = ents + (ctx->pos - 2); p < ents + nents; p++) {
2653		if (!proc_fill_cache(file, ctx, p->name, p->len,
2654				proc_pident_instantiate, task, p))
2655			break;
2656		ctx->pos++;
2657	}
2658out:
2659	put_task_struct(task);
2660	return 0;
2661}
2662
2663#ifdef CONFIG_SECURITY
2664static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
2665				  size_t count, loff_t *ppos)
2666{
2667	struct inode * inode = file_inode(file);
2668	char *p = NULL;
2669	ssize_t length;
2670	struct task_struct *task = get_proc_task(inode);
2671
2672	if (!task)
2673		return -ESRCH;
2674
2675	length = security_getprocattr(task, PROC_I(inode)->op.lsm,
2676				      (char*)file->f_path.dentry->d_name.name,
2677				      &p);
2678	put_task_struct(task);
2679	if (length > 0)
2680		length = simple_read_from_buffer(buf, count, ppos, p, length);
2681	kfree(p);
2682	return length;
2683}
2684
2685static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2686				   size_t count, loff_t *ppos)
2687{
2688	struct inode * inode = file_inode(file);
2689	struct task_struct *task;
2690	void *page;
2691	int rv;
2692
2693	rcu_read_lock();
2694	task = pid_task(proc_pid(inode), PIDTYPE_PID);
2695	if (!task) {
2696		rcu_read_unlock();
2697		return -ESRCH;
2698	}
2699	/* A task may only write its own attributes. */
2700	if (current != task) {
2701		rcu_read_unlock();
2702		return -EACCES;
2703	}
2704	/* Prevent changes to overridden credentials. */
2705	if (current_cred() != current_real_cred()) {
2706		rcu_read_unlock();
2707		return -EBUSY;
2708	}
2709	rcu_read_unlock();
2710
2711	if (count > PAGE_SIZE)
2712		count = PAGE_SIZE;
2713
2714	/* No partial writes. */
2715	if (*ppos != 0)
2716		return -EINVAL;
2717
2718	page = memdup_user(buf, count);
2719	if (IS_ERR(page)) {
2720		rv = PTR_ERR(page);
2721		goto out;
2722	}
2723
2724	/* Guard against adverse ptrace interaction */
2725	rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
2726	if (rv < 0)
2727		goto out_free;
2728
2729	rv = security_setprocattr(PROC_I(inode)->op.lsm,
2730				  file->f_path.dentry->d_name.name, page,
2731				  count);
2732	mutex_unlock(&current->signal->cred_guard_mutex);
2733out_free:
2734	kfree(page);
2735out:
2736	return rv;
2737}
2738
2739static const struct file_operations proc_pid_attr_operations = {
2740	.read		= proc_pid_attr_read,
2741	.write		= proc_pid_attr_write,
2742	.llseek		= generic_file_llseek,
2743};
2744
/*
 * LSM_DIR_OPS(LSM) - generate directory plumbing for one LSM's
 * attribute subdirectory.
 *
 * Expects a table LSM##_attr_dir_stuff[] to be defined already and
 * emits:
 *   proc_<LSM>_attr_dir_iterate()    readdir via proc_pident_readdir()
 *   proc_<LSM>_attr_dir_ops          file operations
 *   proc_<LSM>_attr_dir_lookup()     lookup via proc_pident_lookup()
 *   proc_<LSM>_attr_dir_inode_ops    inode operations
 *
 * The iterator is wired up as ->iterate_shared, the same way the other
 * directories in this file that are served by proc_pident_readdir()
 * are, so concurrent readdir calls are not serialized on an exclusive
 * inode lock.
 *
 * The expansion ends without a semicolon; the user supplies it.
 */
#define LSM_DIR_OPS(LSM) \
static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
			     struct dir_context *ctx) \
{ \
	return proc_pident_readdir(filp, ctx, \
				   LSM##_attr_dir_stuff, \
				   ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct file_operations proc_##LSM##_attr_dir_ops = { \
	.read		= generic_read_dir, \
	.iterate_shared	= proc_##LSM##_attr_dir_iterate, \
	.llseek		= default_llseek, \
}; \
\
static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
				struct dentry *dentry, unsigned int flags) \
{ \
	return proc_pident_lookup(dir, dentry, \
				  LSM##_attr_dir_stuff, \
				  LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
	.lookup		= proc_##LSM##_attr_dir_lookup, \
	.getattr	= pid_getattr, \
	.setattr	= proc_setattr, \
}
2773
2774#ifdef CONFIG_SECURITY_SMACK
/*
 * Entries of the "smack" attribute subdirectory, plus the directory
 * operations LSM_DIR_OPS() generates for it.
 */
static const struct pid_entry smack_attr_dir_stuff[] = {
	ATTR("smack", "current",	0666),
};
LSM_DIR_OPS(smack);
2779#endif
2780
2781#ifdef CONFIG_SECURITY_APPARMOR
/*
 * Entries of the "apparmor" attribute subdirectory, plus the directory
 * operations LSM_DIR_OPS() generates for it.
 */
static const struct pid_entry apparmor_attr_dir_stuff[] = {
	ATTR("apparmor", "current",	0666),
	ATTR("apparmor", "prev",	0444),
	ATTR("apparmor", "exec",	0666),
};
LSM_DIR_OPS(apparmor);
2788#endif
2789
/*
 * Entries of the "attr" directory, in the order proc_attr_dir_readdir()
 * reports them.  The entries with a NULL first argument carry no LSM
 * name; the per-LSM subdirectories are only present when the matching
 * CONFIG_SECURITY_* option is enabled.
 */
static const struct pid_entry attr_dir_stuff[] = {
	ATTR(NULL, "current",		0666),
	ATTR(NULL, "prev",		0444),
	ATTR(NULL, "exec",		0666),
	ATTR(NULL, "fscreate",		0666),
	ATTR(NULL, "keycreate",		0666),
	ATTR(NULL, "sockcreate",	0666),
#ifdef CONFIG_SECURITY_SMACK
	DIR("smack",			0555,
	    proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
#endif
#ifdef CONFIG_SECURITY_APPARMOR
	DIR("apparmor",			0555,
	    proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops),
#endif
};
2806
2807static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
2808{
2809	return proc_pident_readdir(file, ctx, 
2810				   attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
2811}
2812
2813static const struct file_operations proc_attr_dir_operations = {
2814	.read		= generic_read_dir,
2815	.iterate_shared	= proc_attr_dir_readdir,
2816	.llseek		= generic_file_llseek,
2817};
2818
2819static struct dentry *proc_attr_dir_lookup(struct inode *dir,
2820				struct dentry *dentry, unsigned int flags)
2821{
2822	return proc_pident_lookup(dir, dentry,
2823				  attr_dir_stuff,
2824				  attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff));
2825}
2826
2827static const struct inode_operations proc_attr_dir_inode_operations = {
2828	.lookup		= proc_attr_dir_lookup,
2829	.getattr	= pid_getattr,
2830	.setattr	= proc_setattr,
2831};
2832
2833#endif
2834
2835#ifdef CONFIG_ELF_CORE
2836static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
2837					 size_t count, loff_t *ppos)
2838{
2839	struct task_struct *task = get_proc_task(file_inode(file));
2840	struct mm_struct *mm;
2841	char buffer[PROC_NUMBUF];
2842	size_t len;
2843	int ret;
2844
2845	if (!task)
2846		return -ESRCH;
2847
2848	ret = 0;
2849	mm = get_task_mm(task);
2850	if (mm) {
2851		len = snprintf(buffer, sizeof(buffer), "%08lx\n",
2852			       ((mm->flags & MMF_DUMP_FILTER_MASK) >>
2853				MMF_DUMP_FILTER_SHIFT));
2854		mmput(mm);
2855		ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
2856	}
2857
2858	put_task_struct(task);
2859
2860	return ret;
2861}
2862
2863static ssize_t proc_coredump_filter_write(struct file *file,
2864					  const char __user *buf,
2865					  size_t count,
2866					  loff_t *ppos)
2867{
2868	struct task_struct *task;
2869	struct mm_struct *mm;
2870	unsigned int val;
2871	int ret;
2872	int i;
2873	unsigned long mask;
2874
2875	ret = kstrtouint_from_user(buf, count, 0, &val);
2876	if (ret < 0)
2877		return ret;
2878
2879	ret = -ESRCH;
2880	task = get_proc_task(file_inode(file));
2881	if (!task)
2882		goto out_no_task;
2883
2884	mm = get_task_mm(task);
2885	if (!mm)
2886		goto out_no_mm;
2887	ret = 0;
2888
2889	for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
2890		if (val & mask)
2891			set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2892		else
2893			clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2894	}
2895
2896	mmput(mm);
2897 out_no_mm:
2898	put_task_struct(task);
2899 out_no_task:
2900	if (ret < 0)
2901		return ret;
2902	return count;
2903}
2904
2905static const struct file_operations proc_coredump_filter_operations = {
2906	.read		= proc_coredump_filter_read,
2907	.write		= proc_coredump_filter_write,
2908	.llseek		= generic_file_llseek,
2909};
2910#endif
2911
2912#ifdef CONFIG_TASK_IO_ACCOUNTING
2913static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
2914{
2915	struct task_io_accounting acct = task->ioac;
2916	unsigned long flags;
2917	int result;
2918
2919	result = mutex_lock_killable(&task->signal->exec_update_mutex);
2920	if (result)
2921		return result;
2922
2923	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
2924		result = -EACCES;
2925		goto out_unlock;
2926	}
2927
2928	if (whole && lock_task_sighand(task, &flags)) {
2929		struct task_struct *t = task;
2930
2931		task_io_accounting_add(&acct, &task->signal->ioac);
2932		while_each_thread(task, t)
2933			task_io_accounting_add(&acct, &t->ioac);
2934
2935		unlock_task_sighand(task, &flags);
2936	}
2937	seq_printf(m,
2938		   "rchar: %llu\n"
2939		   "wchar: %llu\n"
2940		   "syscr: %llu\n"
2941		   "syscw: %llu\n"
2942		   "read_bytes: %llu\n"
2943		   "write_bytes: %llu\n"
2944		   "cancelled_write_bytes: %llu\n",
2945		   (unsigned long long)acct.rchar,
2946		   (unsigned long long)acct.wchar,
2947		   (unsigned long long)acct.syscr,
2948		   (unsigned long long)acct.syscw,
2949		   (unsigned long long)acct.read_bytes,
2950		   (unsigned long long)acct.write_bytes,
2951		   (unsigned long long)acct.cancelled_write_bytes);
2952	result = 0;
2953
2954out_unlock:
2955	mutex_unlock(&task->signal->exec_update_mutex);
2956	return result;
2957}
2958
/* Show I/O accounting for @task alone (whole == 0). */
static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
				  struct pid *pid, struct task_struct *task)
{
	return do_io_accounting(task, m, 0);
}
2964
/* Show I/O accounting summed over @task's thread group (whole == 1). */
static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
				   struct pid *pid, struct task_struct *task)
{
	return do_io_accounting(task, m, 1);
}
2970#endif /* CONFIG_TASK_IO_ACCOUNTING */
2971
2972#ifdef CONFIG_USER_NS
2973static int proc_id_map_open(struct inode *inode, struct file *file,
2974	const struct seq_operations *seq_ops)
2975{
2976	struct user_namespace *ns = NULL;
2977	struct task_struct *task;
2978	struct seq_file *seq;
2979	int ret = -EINVAL;
2980
2981	task = get_proc_task(inode);
2982	if (task) {
2983		rcu_read_lock();
2984		ns = get_user_ns(task_cred_xxx(task, user_ns));
2985		rcu_read_unlock();
2986		put_task_struct(task);
2987	}
2988	if (!ns)
2989		goto err;
2990
2991	ret = seq_open(file, seq_ops);
2992	if (ret)
2993		goto err_put_ns;
2994
2995	seq = file->private_data;
2996	seq->private = ns;
2997
2998	return 0;
2999err_put_ns:
3000	put_user_ns(ns);
3001err:
3002	return ret;
3003}
3004
3005static int proc_id_map_release(struct inode *inode, struct file *file)
3006{
3007	struct seq_file *seq = file->private_data;
3008	struct user_namespace *ns = seq->private;
3009	put_user_ns(ns);
3010	return seq_release(inode, file);
3011}
3012
3013static int proc_uid_map_open(struct inode *inode, struct file *file)
3014{
3015	return proc_id_map_open(inode, file, &proc_uid_seq_operations);
3016}
3017
3018static int proc_gid_map_open(struct inode *inode, struct file *file)
3019{
3020	return proc_id_map_open(inode, file, &proc_gid_seq_operations);
3021}
3022
3023static int proc_projid_map_open(struct inode *inode, struct file *file)
3024{
3025	return proc_id_map_open(inode, file, &proc_projid_seq_operations);
3026}
3027
3028static const struct file_operations proc_uid_map_operations = {
3029	.open		= proc_uid_map_open,
3030	.write		= proc_uid_map_write,
3031	.read		= seq_read,
3032	.llseek		= seq_lseek,
3033	.release	= proc_id_map_release,
3034};
3035
3036static const struct file_operations proc_gid_map_operations = {
3037	.open		= proc_gid_map_open,
3038	.write		= proc_gid_map_write,
3039	.read		= seq_read,
3040	.llseek		= seq_lseek,
3041	.release	= proc_id_map_release,
3042};
3043
3044static const struct file_operations proc_projid_map_operations = {
3045	.open		= proc_projid_map_open,
3046	.write		= proc_projid_map_write,
3047	.read		= seq_read,
3048	.llseek		= seq_lseek,
3049	.release	= proc_id_map_release,
3050};
3051
3052static int proc_setgroups_open(struct inode *inode, struct file *file)
3053{
3054	struct user_namespace *ns = NULL;
3055	struct task_struct *task;
3056	int ret;
3057
3058	ret = -ESRCH;
3059	task = get_proc_task(inode);
3060	if (task) {
3061		rcu_read_lock();
3062		ns = get_user_ns(task_cred_xxx(task, user_ns));
3063		rcu_read_unlock();
3064		put_task_struct(task);
3065	}
3066	if (!ns)
3067		goto err;
3068
3069	if (file->f_mode & FMODE_WRITE) {
3070		ret = -EACCES;
3071		if (!ns_capable(ns, CAP_SYS_ADMIN))
3072			goto err_put_ns;
3073	}
3074
3075	ret = single_open(file, &proc_setgroups_show, ns);
3076	if (ret)
3077		goto err_put_ns;
3078
3079	return 0;
3080err_put_ns:
3081	put_user_ns(ns);
3082err:
3083	return ret;
3084}
3085
3086static int proc_setgroups_release(struct inode *inode, struct file *file)
3087{
3088	struct seq_file *seq = file->private_data;
3089	struct user_namespace *ns = seq->private;
3090	int ret = single_release(inode, file);
3091	put_user_ns(ns);
3092	return ret;
3093}
3094
3095static const struct file_operations proc_setgroups_operations = {
3096	.open		= proc_setgroups_open,
3097	.write		= proc_setgroups_write,
3098	.read		= seq_read,
3099	.llseek		= seq_lseek,
3100	.release	= proc_setgroups_release,
3101};
3102#endif /* CONFIG_USER_NS */
3103
3104static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
3105				struct pid *pid, struct task_struct *task)
3106{
3107	int err = lock_trace(task);
3108	if (!err) {
3109		seq_printf(m, "%08x\n", task->personality);
3110		unlock_trace(task);
3111	}
3112	return err;
3113}
3114
3115#ifdef CONFIG_LIVEPATCH
/* Show @task's patch_state field as a signed decimal number. */
static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
				struct pid *pid, struct task_struct *task)
{
	seq_printf(m, "%d\n", task->patch_state);
	return 0;
}
3122#endif /* CONFIG_LIVEPATCH */
3123
3124#ifdef CONFIG_STACKLEAK_METRICS
3125static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
3126				struct pid *pid, struct task_struct *task)
3127{
3128	unsigned long prev_depth = THREAD_SIZE -
3129				(task->prev_lowest_stack & (THREAD_SIZE - 1));
3130	unsigned long depth = THREAD_SIZE -
3131				(task->lowest_stack & (THREAD_SIZE - 1));
3132
3133	seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
3134							prev_depth, depth);
3135	return 0;
3136}
3137#endif /* CONFIG_STACKLEAK_METRICS */
3138
/*
 * Thread groups
 */

/* Defined later in this file; declared here so the table can refer to them. */
static const struct file_operations proc_task_operations;
static const struct inode_operations proc_task_inode_operations;

/*
 * Contents of a /proc/<tgid>/ directory.
 *
 * proc_tgid_base_lookup() searches this table by name and
 * proc_tgid_base_readdir() reports the entries in the order they are
 * listed here.  Entries inside #ifdef blocks exist only when the
 * matching kernel option is enabled.
 */
static const struct pid_entry tgid_base_stuff[] = {
	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
	DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
	REG("environ",    S_IRUSR, proc_environ_operations),
	REG("auxv",       S_IRUSR, proc_auxv_operations),
	ONE("status",     S_IRUGO, proc_pid_status),
	ONE("personality", S_IRUSR, proc_pid_personality),
	ONE("limits",	  S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
	REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
#ifdef CONFIG_TIME_NS
	REG("timens_offsets",  S_IRUGO|S_IWUSR, proc_timens_offsets_operations),
#endif
	REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
	ONE("syscall",    S_IRUSR, proc_pid_syscall),
#endif
	REG("cmdline",    S_IRUGO, proc_pid_cmdline_ops),
	ONE("stat",       S_IRUGO, proc_tgid_stat),
	ONE("statm",      S_IRUGO, proc_pid_statm),
	REG("maps",       S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_NUMA
	REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
#endif
	REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
	LNK("cwd",        proc_cwd_link),
	LNK("root",       proc_root_link),
	LNK("exe",        proc_exe_link),
	REG("mounts",     S_IRUGO, proc_mounts_operations),
	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
	REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
	REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
	ONE("wchan",      S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
	ONE("stack",      S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHED_INFO
	ONE("schedstat",  S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
	REG("latency",  S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
	ONE("cpuset",     S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
	ONE("cgroup",  S_IRUGO, proc_cgroup_show),
#endif
#ifdef CONFIG_PROC_CPU_RESCTRL
	ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
#endif
	ONE("oom_score",  S_IRUGO, proc_oom_score),
	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDIT
	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
	REG("fail-nth", 0644, proc_fail_nth_operations),
#endif
#ifdef CONFIG_ELF_CORE
	REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
	ONE("io",	S_IRUSR, proc_tgid_io_accounting),
#endif
#ifdef CONFIG_USER_NS
	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
	REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
	REG("timers",	  S_IRUGO, proc_timers_operations),
#endif
	REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
#ifdef CONFIG_LIVEPATCH
	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
#endif
#ifdef CONFIG_STACKLEAK_METRICS
	ONE("stack_depth", S_IRUGO, proc_stack_depth),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
};
3253
3254static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
3255{
3256	return proc_pident_readdir(file, ctx,
3257				   tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
3258}
3259
3260static const struct file_operations proc_tgid_base_operations = {
3261	.read		= generic_read_dir,
3262	.iterate_shared	= proc_tgid_base_readdir,
3263	.llseek		= generic_file_llseek,
3264};
3265
3266struct pid *tgid_pidfd_to_pid(const struct file *file)
3267{
3268	if (file->f_op != &proc_tgid_base_operations)
3269		return ERR_PTR(-EBADF);
3270
3271	return proc_pid(file_inode(file));
3272}
3273
3274static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
3275{
3276	return proc_pident_lookup(dir, dentry,
3277				  tgid_base_stuff,
3278				  tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff));
3279}
3280
3281static const struct inode_operations proc_tgid_base_inode_operations = {
3282	.lookup		= proc_tgid_base_lookup,
3283	.getattr	= pid_getattr,
3284	.setattr	= proc_setattr,
3285	.permission	= proc_pid_permission,
3286};
3287
/**
 * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache.
 * @pid: pid that should be flushed.
 *
 * This function walks the list of inodes (belonging to any proc
 * filesystem) that are attached to the pid and flushes them from
 * the dentry cache.
 *
 * It is safe and reasonable to cache /proc entries for a task until
 * that task exits.  After that they just clog up the dcache with
 * useless entries, possibly causing useful dcache entries to be
 * flushed instead.  This routine is provided to flush those useless
 * dcache entries when a process is reaped.
 *
 * NOTE: This routine is just an optimization, so it does not guarantee
 *       that no dcache entries will exist after a process is reaped;
 *       it just makes it very unlikely that any will persist.
 */

void proc_flush_pid(struct pid *pid)
{
	proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
}
3311
3312static struct dentry *proc_pid_instantiate(struct dentry * dentry,
3313				   struct task_struct *task, const void *ptr)
3314{
3315	struct inode *inode;
3316
3317	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
3318	if (!inode)
3319		return ERR_PTR(-ENOENT);
3320
3321	inode->i_op = &proc_tgid_base_inode_operations;
3322	inode->i_fop = &proc_tgid_base_operations;
3323	inode->i_flags|=S_IMMUTABLE;
3324
3325	set_nlink(inode, nlink_tgid);
3326	pid_update_inode(task, inode);
3327
3328	d_set_d_op(dentry, &pid_dentry_operations);
3329	return d_splice_alias(inode, dentry);
3330}
3331
3332struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
3333{
3334	struct task_struct *task;
3335	unsigned tgid;
3336	struct proc_fs_info *fs_info;
3337	struct pid_namespace *ns;
3338	struct dentry *result = ERR_PTR(-ENOENT);
3339
3340	tgid = name_to_int(&dentry->d_name);
3341	if (tgid == ~0U)
3342		goto out;
3343
3344	fs_info = proc_sb_info(dentry->d_sb);
3345	ns = fs_info->pid_ns;
3346	rcu_read_lock();
3347	task = find_task_by_pid_ns(tgid, ns);
3348	if (task)
3349		get_task_struct(task);
3350	rcu_read_unlock();
3351	if (!task)
3352		goto out;
3353
3354	/* Limit procfs to only ptraceable tasks */
3355	if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) {
3356		if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS))
3357			goto out_put_task;
3358	}
3359
3360	result = proc_pid_instantiate(dentry, task, NULL);
3361out_put_task:
3362	put_task_struct(task);
3363out:
3364	return result;
3365}
3366
3367/*
3368 * Find the first task with tgid >= tgid
3369 *
3370 */
3371struct tgid_iter {
3372	unsigned int tgid;
3373	struct task_struct *task;
3374};
3375static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
3376{
3377	struct pid *pid;
3378
3379	if (iter.task)
3380		put_task_struct(iter.task);
3381	rcu_read_lock();
3382retry:
3383	iter.task = NULL;
3384	pid = find_ge_pid(iter.tgid, ns);
3385	if (pid) {
3386		iter.tgid = pid_nr_ns(pid, ns);
3387		iter.task = pid_task(pid, PIDTYPE_TGID);
3388		if (!iter.task) {
3389			iter.tgid += 1;
3390			goto retry;
3391		}
3392		get_task_struct(iter.task);
3393	}
3394	rcu_read_unlock();
3395	return iter;
3396}
3397
3398#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)
3399
3400/* for the /proc/ directory itself, after non-process stuff has been done */
3401int proc_pid_readdir(struct file *file, struct dir_context *ctx)
3402{
3403	struct tgid_iter iter;
3404	struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
3405	struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb);
3406	loff_t pos = ctx->pos;
3407
3408	if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
3409		return 0;
3410
3411	if (pos == TGID_OFFSET - 2) {
3412		struct inode *inode = d_inode(fs_info->proc_self);
3413		if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
3414			return 0;
3415		ctx->pos = pos = pos + 1;
3416	}
3417	if (pos == TGID_OFFSET - 1) {
3418		struct inode *inode = d_inode(fs_info->proc_thread_self);
3419		if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
3420			return 0;
3421		ctx->pos = pos = pos + 1;
3422	}
3423	iter.tgid = pos - TGID_OFFSET;
3424	iter.task = NULL;
3425	for (iter = next_tgid(ns, iter);
3426	     iter.task;
3427	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
3428		char name[10 + 1];
3429		unsigned int len;
3430
3431		cond_resched();
3432		if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE))
3433			continue;
3434
3435		len = snprintf(name, sizeof(name), "%u", iter.tgid);
3436		ctx->pos = iter.tgid + TGID_OFFSET;
3437		if (!proc_fill_cache(file, ctx, name, len,
3438				     proc_pid_instantiate, iter.task, NULL)) {
3439			put_task_struct(iter.task);
3440			return 0;
3441		}
3442	}
3443	ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
3444	return 0;
3445}
3446
3447/*
3448 * proc_tid_comm_permission is a special permission function exclusively
3449 * used for the node /proc/<pid>/task/<tid>/comm.
3450 * It bypasses generic permission checks in the case where a task of the same
3451 * task group attempts to access the node.
3452 * The rationale behind this is that glibc and bionic access this node for
3453 * cross thread naming (pthread_set/getname_np(!self)). However, if
3454 * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
3455 * which locks out the cross thread naming implementation.
3456 * This function makes sure that the node is always accessible for members of
3457 * same thread group.
3458 */
3459static int proc_tid_comm_permission(struct inode *inode, int mask)
3460{
3461	bool is_same_tgroup;
3462	struct task_struct *task;
3463
3464	task = get_proc_task(inode);
3465	if (!task)
3466		return -ESRCH;
3467	is_same_tgroup = same_thread_group(current, task);
3468	put_task_struct(task);
3469
3470	if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
3471		/* This file (/proc/<pid>/task/<tid>/comm) can always be
3472		 * read or written by the members of the corresponding
3473		 * thread group.
3474		 */
3475		return 0;
3476	}
3477
3478	return generic_permission(inode, mask);
3479}
3480
3481static const struct inode_operations proc_tid_comm_inode_operations = {
3482		.permission = proc_tid_comm_permission,
3483};
3484
3485/*
3486 * Tasks
3487 */
3488static const struct pid_entry tid_base_stuff[] = {
3489	DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
3490	DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
3491	DIR("ns",	 S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
3492#ifdef CONFIG_NET
3493	DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
3494#endif
3495	REG("environ",   S_IRUSR, proc_environ_operations),
3496	REG("auxv",      S_IRUSR, proc_auxv_operations),
3497	ONE("status",    S_IRUGO, proc_pid_status),
3498	ONE("personality", S_IRUSR, proc_pid_personality),
3499	ONE("limits",	 S_IRUGO, proc_pid_limits),
3500#ifdef CONFIG_SCHED_DEBUG
3501	REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
3502#endif
3503	NOD("comm",      S_IFREG|S_IRUGO|S_IWUSR,
3504			 &proc_tid_comm_inode_operations,
3505			 &proc_pid_set_comm_operations, {}),
3506#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
3507	ONE("syscall",   S_IRUSR, proc_pid_syscall),
3508#endif
3509	REG("cmdline",   S_IRUGO, proc_pid_cmdline_ops),
3510	ONE("stat",      S_IRUGO, proc_tid_stat),
3511	ONE("statm",     S_IRUGO, proc_pid_statm),
3512	REG("maps",      S_IRUGO, proc_pid_maps_operations),
3513#ifdef CONFIG_PROC_CHILDREN
3514	REG("children",  S_IRUGO, proc_tid_children_operations),
3515#endif
3516#ifdef CONFIG_NUMA
3517	REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
3518#endif
3519	REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
3520	LNK("cwd",       proc_cwd_link),
3521	LNK("root",      proc_root_link),
3522	LNK("exe",       proc_exe_link),
3523	REG("mounts",    S_IRUGO, proc_mounts_operations),
3524	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
3525#ifdef CONFIG_PROC_PAGE_MONITOR
3526	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
3527	REG("smaps",     S_IRUGO, proc_pid_smaps_operations),
3528	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
3529	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
3530#endif
3531#ifdef CONFIG_SECURITY
3532	DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
3533#endif
3534#ifdef CONFIG_KALLSYMS
3535	ONE("wchan",     S_IRUGO, proc_pid_wchan),
3536#endif
3537#ifdef CONFIG_STACKTRACE
3538	ONE("stack",      S_IRUSR, proc_pid_stack),
3539#endif
3540#ifdef CONFIG_SCHED_INFO
3541	ONE("schedstat", S_IRUGO, proc_pid_schedstat),
3542#endif
3543#ifdef CONFIG_LATENCYTOP
3544	REG("latency",  S_IRUGO, proc_lstats_operations),
3545#endif
3546#ifdef CONFIG_PROC_PID_CPUSET
3547	ONE("cpuset",    S_IRUGO, proc_cpuset_show),
3548#endif
3549#ifdef CONFIG_CGROUPS
3550	ONE("cgroup",  S_IRUGO, proc_cgroup_show),
3551#endif
3552#ifdef CONFIG_PROC_CPU_RESCTRL
3553	ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
3554#endif
3555	ONE("oom_score", S_IRUGO, proc_oom_score),
3556	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
3557	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3558#ifdef CONFIG_AUDIT
3559	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
3560	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
3561#endif
3562#ifdef CONFIG_FAULT_INJECTION
3563	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
3564	REG("fail-nth", 0644, proc_fail_nth_operations),
3565#endif
3566#ifdef CONFIG_TASK_IO_ACCOUNTING
3567	ONE("io",	S_IRUSR, proc_tid_io_accounting),
3568#endif
3569#ifdef CONFIG_USER_NS
3570	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
3571	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
3572	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
3573	REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
3574#endif
3575#ifdef CONFIG_LIVEPATCH
3576	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
3577#endif
3578#ifdef CONFIG_PROC_PID_ARCH_STATUS
3579	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
3580#endif
3581};
3582
3583static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
3584{
3585	return proc_pident_readdir(file, ctx,
3586				   tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3587}
3588
3589static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
3590{
3591	return proc_pident_lookup(dir, dentry,
3592				  tid_base_stuff,
3593				  tid_base_stuff + ARRAY_SIZE(tid_base_stuff));
3594}
3595
3596static const struct file_operations proc_tid_base_operations = {
3597	.read		= generic_read_dir,
3598	.iterate_shared	= proc_tid_base_readdir,
3599	.llseek		= generic_file_llseek,
3600};
3601
3602static const struct inode_operations proc_tid_base_inode_operations = {
3603	.lookup		= proc_tid_base_lookup,
3604	.getattr	= pid_getattr,
3605	.setattr	= proc_setattr,
3606};
3607
3608static struct dentry *proc_task_instantiate(struct dentry *dentry,
3609	struct task_struct *task, const void *ptr)
3610{
3611	struct inode *inode;
3612	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
3613	if (!inode)
3614		return ERR_PTR(-ENOENT);
3615
3616	inode->i_op = &proc_tid_base_inode_operations;
3617	inode->i_fop = &proc_tid_base_operations;
3618	inode->i_flags |= S_IMMUTABLE;
3619
3620	set_nlink(inode, nlink_tid);
3621	pid_update_inode(task, inode);
3622
3623	d_set_d_op(dentry, &pid_dentry_operations);
3624	return d_splice_alias(inode, dentry);
3625}
3626
3627static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
3628{
3629	struct task_struct *task;
3630	struct task_struct *leader = get_proc_task(dir);
3631	unsigned tid;
3632	struct proc_fs_info *fs_info;
3633	struct pid_namespace *ns;
3634	struct dentry *result = ERR_PTR(-ENOENT);
3635
3636	if (!leader)
3637		goto out_no_task;
3638
3639	tid = name_to_int(&dentry->d_name);
3640	if (tid == ~0U)
3641		goto out;
3642
3643	fs_info = proc_sb_info(dentry->d_sb);
3644	ns = fs_info->pid_ns;
3645	rcu_read_lock();
3646	task = find_task_by_pid_ns(tid, ns);
3647	if (task)
3648		get_task_struct(task);
3649	rcu_read_unlock();
3650	if (!task)
3651		goto out;
3652	if (!same_thread_group(leader, task))
3653		goto out_drop_task;
3654
3655	result = proc_task_instantiate(dentry, task, NULL);
3656out_drop_task:
3657	put_task_struct(task);
3658out:
3659	put_task_struct(leader);
3660out_no_task:
3661	return result;
3662}
3663
3664/*
3665 * Find the first tid of a thread group to return to user space.
3666 *
3667 * Usually this is just the thread group leader, but if the users
3668 * buffer was too small or there was a seek into the middle of the
3669 * directory we have more work todo.
3670 *
3671 * In the case of a short read we start with find_task_by_pid.
3672 *
3673 * In the case of a seek we start with the leader and walk nr
3674 * threads past it.
3675 */
3676static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
3677					struct pid_namespace *ns)
3678{
3679	struct task_struct *pos, *task;
3680	unsigned long nr = f_pos;
3681
3682	if (nr != f_pos)	/* 32bit overflow? */
3683		return NULL;
3684
3685	rcu_read_lock();
3686	task = pid_task(pid, PIDTYPE_PID);
3687	if (!task)
3688		goto fail;
3689
3690	/* Attempt to start with the tid of a thread */
3691	if (tid && nr) {
3692		pos = find_task_by_pid_ns(tid, ns);
3693		if (pos && same_thread_group(pos, task))
3694			goto found;
3695	}
3696
3697	/* If nr exceeds the number of threads there is nothing todo */
3698	if (nr >= get_nr_threads(task))
3699		goto fail;
3700
3701	/* If we haven't found our starting place yet start
3702	 * with the leader and walk nr threads forward.
3703	 */
3704	pos = task = task->group_leader;
3705	do {
3706		if (!nr--)
3707			goto found;
3708	} while_each_thread(task, pos);
3709fail:
3710	pos = NULL;
3711	goto out;
3712found:
3713	get_task_struct(pos);
3714out:
3715	rcu_read_unlock();
3716	return pos;
3717}
3718
3719/*
3720 * Find the next thread in the thread list.
3721 * Return NULL if there is an error or no next thread.
3722 *
3723 * The reference to the input task_struct is released.
3724 */
3725static struct task_struct *next_tid(struct task_struct *start)
3726{
3727	struct task_struct *pos = NULL;
3728	rcu_read_lock();
3729	if (pid_alive(start)) {
3730		pos = next_thread(start);
3731		if (thread_group_leader(pos))
3732			pos = NULL;
3733		else
3734			get_task_struct(pos);
3735	}
3736	rcu_read_unlock();
3737	put_task_struct(start);
3738	return pos;
3739}
3740
3741/* for the /proc/TGID/task/ directories */
3742static int proc_task_readdir(struct file *file, struct dir_context *ctx)
3743{
3744	struct inode *inode = file_inode(file);
3745	struct task_struct *task;
3746	struct pid_namespace *ns;
3747	int tid;
3748
3749	if (proc_inode_is_dead(inode))
3750		return -ENOENT;
3751
3752	if (!dir_emit_dots(file, ctx))
3753		return 0;
3754
3755	/* f_version caches the tgid value that the last readdir call couldn't
3756	 * return. lseek aka telldir automagically resets f_version to 0.
3757	 */
3758	ns = proc_pid_ns(inode->i_sb);
3759	tid = (int)file->f_version;
3760	file->f_version = 0;
3761	for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
3762	     task;
3763	     task = next_tid(task), ctx->pos++) {
3764		char name[10 + 1];
3765		unsigned int len;
3766		tid = task_pid_nr_ns(task, ns);
3767		len = snprintf(name, sizeof(name), "%u", tid);
3768		if (!proc_fill_cache(file, ctx, name, len,
3769				proc_task_instantiate, task, NULL)) {
3770			/* returning this tgid failed, save it as the first
3771			 * pid for the next readir call */
3772			file->f_version = (u64)tid;
3773			put_task_struct(task);
3774			break;
3775		}
3776	}
3777
3778	return 0;
3779}
3780
3781static int proc_task_getattr(const struct path *path, struct kstat *stat,
3782			     u32 request_mask, unsigned int query_flags)
3783{
3784	struct inode *inode = d_inode(path->dentry);
3785	struct task_struct *p = get_proc_task(inode);
3786	generic_fillattr(inode, stat);
3787
3788	if (p) {
3789		stat->nlink += get_nr_threads(p);
3790		put_task_struct(p);
3791	}
3792
3793	return 0;
3794}
3795
3796static const struct inode_operations proc_task_inode_operations = {
3797	.lookup		= proc_task_lookup,
3798	.getattr	= proc_task_getattr,
3799	.setattr	= proc_setattr,
3800	.permission	= proc_pid_permission,
3801};
3802
3803static const struct file_operations proc_task_operations = {
3804	.read		= generic_read_dir,
3805	.iterate_shared	= proc_task_readdir,
3806	.llseek		= generic_file_llseek,
3807};
3808
3809void __init set_proc_pid_nlink(void)
3810{
3811	nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3812	nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
3813}