include/linux/sched.h at v6.11 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / include / linux / sched.h
at v6.11 63 kB view raw
   1/* SPDX-License-Identifier: GPL-2.0 */
   2#ifndef _LINUX_SCHED_H
   3#define _LINUX_SCHED_H
   4
   5/*
   6 * Define 'struct task_struct' and provide the main scheduler
   7 * APIs (schedule(), wakeup variants, etc.)
   8 */
   9
  10#include <uapi/linux/sched.h>
  11
  12#include <asm/current.h>
  13#include <asm/processor.h>
  14#include <linux/thread_info.h>
  15#include <linux/preempt.h>
  16#include <linux/cpumask_types.h>
  17
  18#include <linux/cache.h>
  19#include <linux/irqflags_types.h>
  20#include <linux/smp_types.h>
  21#include <linux/pid_types.h>
  22#include <linux/sem_types.h>
  23#include <linux/shm.h>
  24#include <linux/kmsan_types.h>
  25#include <linux/mutex_types.h>
  26#include <linux/plist_types.h>
  27#include <linux/hrtimer_types.h>
  28#include <linux/timer_types.h>
  29#include <linux/seccomp_types.h>
  30#include <linux/nodemask_types.h>
  31#include <linux/refcount_types.h>
  32#include <linux/resource.h>
  33#include <linux/latencytop.h>
  34#include <linux/sched/prio.h>
  35#include <linux/sched/types.h>
  36#include <linux/signal_types.h>
  37#include <linux/syscall_user_dispatch_types.h>
  38#include <linux/mm_types_task.h>
  39#include <linux/netdevice_xmit.h>
  40#include <linux/task_io_accounting.h>
  41#include <linux/posix-timers_types.h>
  42#include <linux/restart_block.h>
  43#include <uapi/linux/rseq.h>
  44#include <linux/seqlock_types.h>
  45#include <linux/kcsan.h>
  46#include <linux/rv.h>
  47#include <linux/livepatch_sched.h>
  48#include <linux/uidgid_types.h>
  49#include <asm/kmap_size.h>
  50
  51/* task_struct member predeclarations (sorted alphabetically): */
  52struct audit_context;
  53struct bio_list;
  54struct blk_plug;
  55struct bpf_local_storage;
  56struct bpf_run_ctx;
  57struct bpf_net_context;
  58struct capture_control;
  59struct cfs_rq;
  60struct fs_struct;
  61struct futex_pi_state;
  62struct io_context;
  63struct io_uring_task;
  64struct mempolicy;
  65struct nameidata;
  66struct nsproxy;
  67struct perf_event_context;
  68struct pid_namespace;
  69struct pipe_inode_info;
  70struct rcu_node;
  71struct reclaim_state;
  72struct robust_list_head;
  73struct root_domain;
  74struct rq;
  75struct sched_attr;
  76struct sched_dl_entity;
  77struct seq_file;
  78struct sighand_struct;
  79struct signal_struct;
  80struct task_delay_info;
  81struct task_group;
  82struct task_struct;
  83struct user_event_mm;
  84
  85/*
  86 * Task state bitmask. NOTE! These bits are also
  87 * encoded in fs/proc/array.c: get_task_state().
  88 *
  89 * We have two separate sets of flags: task->__state
  90 * is about runnability, while task->exit_state are
  91 * about the task exiting. Confusing, but this way
  92 * modifying one set can't modify the other one by
  93 * mistake.
  94 */
  95
/*
 * Apart from TASK_RUNNING (zero), every value below is a distinct single bit,
 * so states can be OR-ed together into masks. The EXIT_* values are meant for
 * tsk->exit_state but are numbered within the same bit sequence.
 */
/* Used in tsk->__state: */
#define TASK_RUNNING			0x00000000
#define TASK_INTERRUPTIBLE		0x00000001
#define TASK_UNINTERRUPTIBLE		0x00000002
#define __TASK_STOPPED			0x00000004
#define __TASK_TRACED			0x00000008
/* Used in tsk->exit_state: */
#define EXIT_DEAD			0x00000010
#define EXIT_ZOMBIE			0x00000020
#define EXIT_TRACE			(EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->__state again: */
#define TASK_PARKED			0x00000040
#define TASK_DEAD			0x00000080
#define TASK_WAKEKILL			0x00000100
#define TASK_WAKING			0x00000200
#define TASK_NOLOAD			0x00000400
#define TASK_NEW			0x00000800
#define TASK_RTLOCK_WAIT		0x00001000
#define TASK_FREEZABLE			0x00002000
/* Scaled by IS_ENABLED(): the bit is only non-zero with CONFIG_LOCKDEP. */
#define __TASK_FREEZABLE_UNSAFE	       (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP))
#define TASK_FROZEN			0x00008000
/* Next bit above the highest state bit defined here. */
#define TASK_STATE_MAX			0x00010000

/* Mask of every bit below TASK_STATE_MAX. */
#define TASK_ANY			(TASK_STATE_MAX-1)
 120
/*
 * DO NOT ADD ANY NEW USERS !
 */
/* TASK_FREEZABLE plus the CONFIG_LOCKDEP-only __TASK_FREEZABLE_UNSAFE bit. */
#define TASK_FREEZABLE_UNSAFE		(TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE)

/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE			(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
#define TASK_STOPPED			(TASK_WAKEKILL | __TASK_STOPPED)
#define TASK_TRACED			__TASK_TRACED

/* Uninterruptible sleep that also carries the TASK_NOLOAD bit. */
#define TASK_IDLE			(TASK_UNINTERRUPTIBLE | TASK_NOLOAD)

/* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL			(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

/* get_task_state(): */
/* With the bit values defined in this header this mask evaluates to 0x7f. */
#define TASK_REPORT			(TASK_RUNNING | TASK_INTERRUPTIBLE | \
					 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
					 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
					 TASK_PARKED)
 141
 142#define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)
 143
 144#define task_is_traced(task)		((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0)
 145#define task_is_stopped(task)		((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0)
 146#define task_is_stopped_or_traced(task)	((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0)
 147
 148/*
 149 * Special states are those that do not use the normal wait-loop pattern. See
 150 * the comment with set_special_state().
 151 */
 152#define is_special_task_state(state)				\
 153	((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
 154
 155#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 156# define debug_normal_state_change(state_value)				\
 157	do {								\
 158		WARN_ON_ONCE(is_special_task_state(state_value));	\
 159		current->task_state_change = _THIS_IP_;			\
 160	} while (0)
 161
 162# define debug_special_state_change(state_value)			\
 163	do {								\
 164		WARN_ON_ONCE(!is_special_task_state(state_value));	\
 165		current->task_state_change = _THIS_IP_;			\
 166	} while (0)
 167
 168# define debug_rtlock_wait_set_state()					\
 169	do {								 \
 170		current->saved_state_change = current->task_state_change;\
 171		current->task_state_change = _THIS_IP_;			 \
 172	} while (0)
 173
 174# define debug_rtlock_wait_restore_state()				\
 175	do {								 \
 176		current->task_state_change = current->saved_state_change;\
 177	} while (0)
 178
 179#else
 180# define debug_normal_state_change(cond)	do { } while (0)
 181# define debug_special_state_change(cond)	do { } while (0)
 182# define debug_rtlock_wait_set_state()		do { } while (0)
 183# define debug_rtlock_wait_restore_state()	do { } while (0)
 184#endif
 185
 186/*
 187 * set_current_state() includes a barrier so that the write of current->__state
 188 * is correctly serialised wrt the caller's subsequent test of whether to
 189 * actually sleep:
 190 *
 191 *   for (;;) {
 192 *	set_current_state(TASK_UNINTERRUPTIBLE);
 193 *	if (CONDITION)
 194 *	   break;
 195 *
 196 *	schedule();
 197 *   }
 198 *   __set_current_state(TASK_RUNNING);
 199 *
 200 * If the caller does not need such serialisation (because, for instance, the
 201 * CONDITION test and condition change and wakeup are under the same lock) then
 202 * use __set_current_state().
 203 *
 204 * The above is typically ordered against the wakeup, which does:
 205 *
 206 *   CONDITION = 1;
 207 *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
 208 *
 209 * where wake_up_state()/try_to_wake_up() executes a full memory barrier before
 210 * accessing p->__state.
 211 *
 212 * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is,
 213 * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
 214 * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
 215 *
 * However, with slightly different timing the wakeup TASK_RUNNING store can
 * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not
 * a problem either because that will result in one extra go around the loop
 * and our CONDITION test will save the day.
 220 *
 221 * Also see the comments of try_to_wake_up().
 222 */
 223#define __set_current_state(state_value)				\
 224	do {								\
 225		debug_normal_state_change((state_value));		\
 226		WRITE_ONCE(current->__state, (state_value));		\
 227	} while (0)
 228
 229#define set_current_state(state_value)					\
 230	do {								\
 231		debug_normal_state_change((state_value));		\
 232		smp_store_mb(current->__state, (state_value));		\
 233	} while (0)
 234
 235/*
 236 * set_special_state() should be used for those states when the blocking task
 237 * can not use the regular condition based wait-loop. In that case we must
 238 * serialize against wakeups such that any possible in-flight TASK_RUNNING
 239 * stores will not collide with our state change.
 240 */
 241#define set_special_state(state_value)					\
 242	do {								\
 243		unsigned long flags; /* may shadow */			\
 244									\
 245		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
 246		debug_special_state_change((state_value));		\
 247		WRITE_ONCE(current->__state, (state_value));		\
 248		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
 249	} while (0)
 250
 251/*
 252 * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
 253 *
 254 * RT's spin/rwlock substitutions are state preserving. The state of the
 255 * task when blocking on the lock is saved in task_struct::saved_state and
 256 * restored after the lock has been acquired.  These operations are
 257 * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
 258 * lock related wakeups while the task is blocked on the lock are
 259 * redirected to operate on task_struct::saved_state to ensure that these
 260 * are not dropped. On restore task_struct::saved_state is set to
 261 * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
 262 *
 263 * The lock operation looks like this:
 264 *
 265 *	current_save_and_set_rtlock_wait_state();
 266 *	for (;;) {
 267 *		if (try_lock())
 268 *			break;
 269 *		raw_spin_unlock_irq(&lock->wait_lock);
 270 *		schedule_rtlock();
 271 *		raw_spin_lock_irq(&lock->wait_lock);
 272 *		set_current_state(TASK_RTLOCK_WAIT);
 273 *	}
 274 *	current_restore_rtlock_saved_state();
 275 */
 276#define current_save_and_set_rtlock_wait_state()			\
 277	do {								\
 278		lockdep_assert_irqs_disabled();				\
 279		raw_spin_lock(&current->pi_lock);			\
 280		current->saved_state = current->__state;		\
 281		debug_rtlock_wait_set_state();				\
 282		WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);		\
 283		raw_spin_unlock(&current->pi_lock);			\
 284	} while (0);
 285
 286#define current_restore_rtlock_saved_state()				\
 287	do {								\
 288		lockdep_assert_irqs_disabled();				\
 289		raw_spin_lock(&current->pi_lock);			\
 290		debug_rtlock_wait_restore_state();			\
 291		WRITE_ONCE(current->__state, current->saved_state);	\
 292		current->saved_state = TASK_RUNNING;			\
 293		raw_spin_unlock(&current->pi_lock);			\
 294	} while (0);
 295
 296#define get_current_state()	READ_ONCE(current->__state)
 297
 298/*
 299 * Define the task command name length as enum, then it can be visible to
 300 * BPF programs.
 301 */
 302enum {
 303	TASK_COMM_LEN = 16,
 304};
 305
 306extern void sched_tick(void);
 307
 308#define	MAX_SCHEDULE_TIMEOUT		LONG_MAX
 309
 310extern long schedule_timeout(long timeout);
 311extern long schedule_timeout_interruptible(long timeout);
 312extern long schedule_timeout_killable(long timeout);
 313extern long schedule_timeout_uninterruptible(long timeout);
 314extern long schedule_timeout_idle(long timeout);
 315asmlinkage void schedule(void);
 316extern void schedule_preempt_disabled(void);
 317asmlinkage void preempt_schedule_irq(void);
 318#ifdef CONFIG_PREEMPT_RT
 319 extern void schedule_rtlock(void);
 320#endif
 321
 322extern int __must_check io_schedule_prepare(void);
 323extern void io_schedule_finish(int token);
 324extern long io_schedule_timeout(long timeout);
 325extern void io_schedule(void);
 326
/**
 * struct prev_cputime - snapshot of system and user cputime
 * @utime: time spent in user mode
 * @stime: time spent in system mode
 * @lock: protects the above two fields
 *
 * Stores previous user/system time values such that we can guarantee
 * monotonicity.
 *
 * All members are compiled out, leaving an empty struct, when
 * CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is set.
 */
struct prev_cputime {
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
	u64				utime;
	u64				stime;
	raw_spinlock_t			lock;
#endif
};
 343
 344enum vtime_state {
 345	/* Task is sleeping or running in a CPU with VTIME inactive: */
 346	VTIME_INACTIVE = 0,
 347	/* Task is idle */
 348	VTIME_IDLE,
 349	/* Task runs in kernelspace in a CPU with VTIME active: */
 350	VTIME_SYS,
 351	/* Task runs in userspace in a CPU with VTIME active: */
 352	VTIME_USER,
 353	/* Task runs as guests in a CPU with VTIME active: */
 354	VTIME_GUEST,
 355};
 356
/*
 * vtime accounting record.
 *
 * @state holds one of enum vtime_state. The units of @starttime and of the
 * @utime/@stime/@gtime accumulators are not visible in this header.
 * NOTE(review): @seqcount presumably guards the other members - confirm
 * against the vtime accounting code.
 */
struct vtime {
	seqcount_t		seqcount;
	unsigned long long	starttime;
	enum vtime_state	state;
	unsigned int		cpu;
	u64			utime;
	u64			stime;
	u64			gtime;
};
 366
 367/*
 368 * Utilization clamp constraints.
 369 * @UCLAMP_MIN:	Minimum utilization
 370 * @UCLAMP_MAX:	Maximum utilization
 371 * @UCLAMP_CNT:	Utilization clamp constraints count
 372 */
 373enum uclamp_id {
 374	UCLAMP_MIN = 0,
 375	UCLAMP_MAX,
 376	UCLAMP_CNT
 377};
 378
/* Declared for CONFIG_SMP builds only; both objects are defined elsewhere. */
#ifdef CONFIG_SMP
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
#endif
 383
/*
 * Scheduling parameter block holding a single priority value.
 * NOTE(review): the valid range of @sched_priority is not visible here.
 */
struct sched_param {
	int sched_priority;
};
 387
/*
 * Run/wait bookkeeping embedded in task_struct as ->sched_info.
 * All members are compiled out, leaving an empty struct, unless
 * CONFIG_SCHED_INFO is set.
 */
struct sched_info {
#ifdef CONFIG_SCHED_INFO
	/* Cumulative counters: */

	/* # of times we have run on this CPU: */
	unsigned long			pcount;

	/* Time spent waiting on a runqueue: */
	unsigned long long		run_delay;

	/* Timestamps: */

	/* When did we last run on a CPU? */
	unsigned long long		last_arrival;

	/* When were we last queued to run? */
	unsigned long long		last_queued;

#endif /* CONFIG_SCHED_INFO */
};
 408
 409/*
 410 * Integer metrics need fixed point arithmetic, e.g., sched/fair
 411 * has a few: load, load_avg, util_avg, freq, and capacity.
 412 *
 413 * We define a basic fixed point arithmetic range, and then formalize
 414 * all these metrics based on that basic range.
 415 */
 416# define SCHED_FIXEDPOINT_SHIFT		10
 417# define SCHED_FIXEDPOINT_SCALE		(1L << SCHED_FIXEDPOINT_SHIFT)
 418
 419/* Increase resolution of cpu_capacity calculations */
 420# define SCHED_CAPACITY_SHIFT		SCHED_FIXEDPOINT_SHIFT
 421# define SCHED_CAPACITY_SCALE		(1L << SCHED_CAPACITY_SHIFT)
 422
/*
 * Load weight of a scheduling entity (embedded in sched_entity as ->load).
 * NOTE(review): @inv_weight is presumably a precomputed inverse of @weight
 * used to avoid divisions - confirm against kernel/sched/fair.c.
 */
struct load_weight {
	unsigned long			weight;
	u32				inv_weight;
};
 427
 428/*
 429 * The load/runnable/util_avg accumulates an infinite geometric series
 430 * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
 431 *
 432 * [load_avg definition]
 433 *
 434 *   load_avg = runnable% * scale_load_down(load)
 435 *
 436 * [runnable_avg definition]
 437 *
 438 *   runnable_avg = runnable% * SCHED_CAPACITY_SCALE
 439 *
 440 * [util_avg definition]
 441 *
 442 *   util_avg = running% * SCHED_CAPACITY_SCALE
 443 *
 444 * where runnable% is the time ratio that a sched_entity is runnable and
 445 * running% the time ratio that a sched_entity is running.
 446 *
 447 * For cfs_rq, they are the aggregated values of all runnable and blocked
 448 * sched_entities.
 449 *
 450 * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
 451 * capacity scaling. The scaling is done through the rq_clock_pelt that is used
 452 * for computing those signals (see update_rq_clock_pelt())
 453 *
 454 * N.B., the above ratios (runnable% and running%) themselves are in the
 455 * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
 456 * to as large a range as necessary. This is for example reflected by
 457 * util_avg's SCHED_CAPACITY_SCALE.
 458 *
 459 * [Overflow issue]
 460 *
 461 * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
 462 * with the highest load (=88761), always runnable on a single cfs_rq,
 463 * and should not overflow as the number already hits PID_MAX_LIMIT.
 464 *
 465 * For all other cases (including 32-bit kernels), struct load_weight's
 466 * weight will overflow first before we do, because:
 467 *
 468 *    Max(load_avg) <= Max(load.weight)
 469 *
 470 * Then it is the load_weight's responsibility to consider overflow
 471 * issues.
 472 */
/*
 * Load-tracking signals described by the comment above.
 *
 * @util_est is a 32-bit value whose MSB is reserved for the
 * UTIL_AVG_UNCHANGED flag (see the comment on that flag).
 * The struct is cacheline aligned (____cacheline_aligned).
 */
struct sched_avg {
	u64				last_update_time;
	u64				load_sum;
	u64				runnable_sum;
	u32				util_sum;
	u32				period_contrib;
	unsigned long			load_avg;
	unsigned long			runnable_avg;
	unsigned long			util_avg;
	unsigned int			util_est;
} ____cacheline_aligned;
 484
/*
 * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
 * updates. When a task is dequeued, its util_est should not be updated if its
 * util_avg has not been updated in the meantime.
 * This information is mapped into the MSB bit of util_est at dequeue time.
 * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
 * it is safe to use MSB.
 *
 * UTIL_AVG_UNCHANGED is bit 31, i.e. the MSB of the 32-bit util_est field.
 * NOTE(review): UTIL_EST_WEIGHT_SHIFT is not described in this header;
 * presumably it is the shift used when averaging util_est - confirm against
 * kernel/sched/fair.c.
 */
#define UTIL_EST_WEIGHT_SHIFT		2
#define UTIL_AVG_UNCHANGED		0x80000000
 495
/*
 * Scheduler statistics embedded in task_struct as ->stats.
 *
 * All members are compiled out unless CONFIG_SCHEDSTATS is set;
 * @core_forceidle_sum additionally requires CONFIG_SCHED_CORE.
 * The struct is cacheline aligned (____cacheline_aligned).
 */
struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
	/* Runqueue wait and I/O wait: */
	u64				wait_start;
	u64				wait_max;
	u64				wait_count;
	u64				wait_sum;
	u64				iowait_count;
	u64				iowait_sum;

	/* Sleep: */
	u64				sleep_start;
	u64				sleep_max;
	s64				sum_sleep_runtime;

	/* Block: */
	u64				block_start;
	u64				block_max;
	s64				sum_block_runtime;

	s64				exec_max;
	u64				slice_max;

	/* Migration counters: */
	u64				nr_migrations_cold;
	u64				nr_failed_migrations_affine;
	u64				nr_failed_migrations_running;
	u64				nr_failed_migrations_hot;
	u64				nr_forced_migrations;

	/* Wakeup counters: */
	u64				nr_wakeups;
	u64				nr_wakeups_sync;
	u64				nr_wakeups_migrate;
	u64				nr_wakeups_local;
	u64				nr_wakeups_remote;
	u64				nr_wakeups_affine;
	u64				nr_wakeups_affine_attempts;
	u64				nr_wakeups_passive;
	u64				nr_wakeups_idle;

#ifdef CONFIG_SCHED_CORE
	u64				core_forceidle_sum;
#endif
#endif /* CONFIG_SCHEDSTATS */
} ____cacheline_aligned;
 537
/*
 * Scheduling entity embedded in task_struct as ->se.
 *
 * The group-scheduling members exist only with CONFIG_FAIR_GROUP_SCHED and
 * the load-tracking member @avg only with CONFIG_SMP.
 */
struct sched_entity {
	/* For load-balancing: */
	struct load_weight		load;
	struct rb_node			run_node;
	u64				deadline;
	u64				min_vruntime;

	struct list_head		group_node;
	unsigned int			on_rq;

	u64				exec_start;
	u64				sum_exec_runtime;
	u64				prev_sum_exec_runtime;
	u64				vruntime;
	s64				vlag;
	u64				slice;

	u64				nr_migrations;

#ifdef CONFIG_FAIR_GROUP_SCHED
	int				depth;
	struct sched_entity		*parent;
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq			*cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq			*my_q;
	/* cached value of my_q->h_nr_running */
	unsigned long			runnable_weight;
#endif

#ifdef CONFIG_SMP
	/*
	 * Per entity load average tracking.
	 *
	 * Put into separate cache line so it does not
	 * collide with read-mostly values above.
	 */
	struct sched_avg		avg;
#endif
};
 578
/*
 * Real-time scheduling entity embedded in task_struct as ->rt.
 *
 * The group-scheduling links (@parent, @rt_rq, @my_q) exist only with
 * CONFIG_RT_GROUP_SCHED. The member order may be randomized at build time
 * (__randomize_layout), so code must not depend on it.
 */
struct sched_rt_entity {
	struct list_head		run_list;
	unsigned long			timeout;
	unsigned long			watchdog_stamp;
	unsigned int			time_slice;
	unsigned short			on_rq;
	unsigned short			on_list;

	struct sched_rt_entity		*back;
#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity		*parent;
	/* rq on which this entity is (to be) queued: */
	struct rt_rq			*rt_rq;
	/* rq "owned" by this entity/group: */
	struct rt_rq			*my_q;
#endif
} __randomize_layout;
 596
/*
 * Callback types stored in sched_dl_entity::server_has_tasks and
 * sched_dl_entity::server_pick. Both receive the sched_dl_entity they are
 * invoked for.
 */
typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
 599
/*
 * Deadline scheduling entity. task_struct embeds one as ->dl and also holds
 * a pointer to one as ->dl_server.
 */
struct sched_dl_entity {
	struct rb_node			rb_node;

	/*
	 * Original scheduling parameters. Copied here from sched_attr
	 * during sched_setattr(), they will remain the same until
	 * the next sched_setattr().
	 */
	u64				dl_runtime;	/* Maximum runtime for each instance	*/
	u64				dl_deadline;	/* Relative deadline of each instance	*/
	u64				dl_period;	/* Separation of two instances (period) */
	u64				dl_bw;		/* dl_runtime / dl_period		*/
	u64				dl_density;	/* dl_runtime / dl_deadline		*/

	/*
	 * Actual scheduling parameters. Initialized with the values above,
	 * they are continuously updated during task execution. Note that
	 * the remaining runtime could be < 0 in case we are in overrun.
	 */
	s64				runtime;	/* Remaining runtime for this instance	*/
	u64				deadline;	/* Absolute deadline for this instance	*/
	unsigned int			flags;		/* Specifying the scheduler behaviour	*/

	/*
	 * Some bool flags:
	 *
	 * @dl_throttled tells if we exhausted the runtime. If so, the
	 * task has to wait for a replenishment to be performed at the
	 * next firing of dl_timer.
	 *
	 * @dl_yielded tells if task gave up the CPU before consuming
	 * all its available runtime during the last job.
	 *
	 * @dl_non_contending tells if the task is inactive while still
	 * contributing to the active utilization. In other words, it
	 * indicates if the inactive timer has been armed and its handler
	 * has not been executed yet. This flag is useful to avoid race
	 * conditions between the inactive timer handler and the wakeup
	 * code.
	 *
	 * @dl_overrun tells if the task asked to be informed about runtime
	 * overruns.
	 *
	 * @dl_server: NOTE(review): presumably set when this entity is a
	 * DL server rather than a task - confirm against
	 * kernel/sched/deadline.c.
	 */
	unsigned int			dl_throttled      : 1;
	unsigned int			dl_yielded        : 1;
	unsigned int			dl_non_contending : 1;
	unsigned int			dl_overrun	  : 1;
	unsigned int			dl_server         : 1;

	/*
	 * Bandwidth enforcement timer. Each -deadline task has its
	 * own bandwidth to be enforced, thus we need one timer per task.
	 */
	struct hrtimer			dl_timer;

	/*
	 * Inactive timer, responsible for decreasing the active utilization
	 * at the "0-lag time". When a -deadline task blocks, it contributes
	 * to GRUB's active utilization until the "0-lag time", hence a
	 * timer is needed to decrease the active utilization at the correct
	 * time.
	 */
	struct hrtimer			inactive_timer;

	/*
	 * Bits for DL-server functionality. Also see the comment near
	 * dl_server_update().
	 *
	 * @rq the runqueue this server is for
	 *
	 * @server_has_tasks() returns true if @server_pick() returns a
	 * runnable task.
	 */
	struct rq			*rq;
	dl_server_has_tasks_f		server_has_tasks;
	dl_server_pick_f		server_pick;

#ifdef CONFIG_RT_MUTEXES
	/*
	 * Priority Inheritance. When a DEADLINE scheduling entity is boosted
	 * pi_se points to the donor, otherwise points to the dl_se it belongs
	 * to (the original one/itself).
	 */
	struct sched_dl_entity *pi_se;
#endif
};
 686
#ifdef CONFIG_UCLAMP_TASK
/* Number of utilization clamp buckets (shorter alias) */
#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT

/*
 * Utilization clamp for a scheduling entity
 * @value:		clamp value "assigned" to a se
 * @bucket_id:		bucket index corresponding to the "assigned" value
 * @active:		the se is currently refcounted in a rq's bucket
 * @user_defined:	the requested clamp value comes from user-space
 *
 * The bucket_id is the index of the clamp bucket matching the clamp value
 * which is pre-computed and stored to avoid expensive integer divisions from
 * the fast path.
 *
 * The active bit is set whenever a task has got an "effective" value assigned,
 * which can be different from the clamp value "requested" from user-space.
 * This allows to know a task is refcounted in the rq's bucket corresponding
 * to the "effective" bucket_id.
 *
 * The user_defined bit is set whenever a task has got a task-specific clamp
 * value requested from userspace, i.e. the system defaults apply to this task
 * just as a restriction. This allows to relax default clamps when a less
 * restrictive task-specific value has been requested, thus allowing to
 * implement a "nice" semantic. For example, a task running with a 20%
 * default boost can still drop its own boosting to 0%.
 *
 * All four members are bit-fields of one 'unsigned int' type; the widths of
 * @value and @bucket_id are computed with bits_per() from
 * SCHED_CAPACITY_SCALE and UCLAMP_BUCKETS respectively.
 */
struct uclamp_se {
	unsigned int value		: bits_per(SCHED_CAPACITY_SCALE);
	unsigned int bucket_id		: bits_per(UCLAMP_BUCKETS);
	unsigned int active		: 1;
	unsigned int user_defined	: 1;
};
#endif /* CONFIG_UCLAMP_TASK */
 721
/*
 * Four one-byte RCU flags (@b) overlaid with a single 32-bit word (@s), so
 * the flags can be accessed individually or all at once.
 */
union rcu_special {
	struct {
		u8			blocked;
		u8			need_qs;
		u8			exp_hint; /* Hint for performance. */
		u8			need_mb; /* Readers need smp_mb(). */
	} b; /* Bits. */
	u32 s; /* Set of bits. */
};
 731
 732enum perf_event_task_context {
 733	perf_invalid_context = -1,
 734	perf_hw_context = 0,
 735	perf_sw_context,
 736	perf_nr_task_contexts,
 737};
 738
/*
 * Number of contexts where an event can trigger:
 *      task, softirq, hardirq, nmi.
 * Four contexts are listed, hence the value 4.
 */
#define PERF_NR_CONTEXTS	4
 744
/* Singly linked list node: it carries only a pointer to the next node. */
struct wake_q_node {
	struct wake_q_node *next;
};
 748
/*
 * Local kmap bookkeeping; empty unless CONFIG_KMAP_LOCAL is set.
 * @pteval has room for KM_MAX_IDX entries.
 * NOTE(review): @idx presumably indexes into @pteval - confirm against the
 * kmap_local implementation.
 */
struct kmap_ctrl {
#ifdef CONFIG_KMAP_LOCAL
	int				idx;
	pte_t				pteval[KM_MAX_IDX];
#endif
};
 755
 756struct task_struct {
 757#ifdef CONFIG_THREAD_INFO_IN_TASK
 758	/*
 759	 * For reasons of header soup (see current_thread_info()), this
 760	 * must be the first element of task_struct.
 761	 */
 762	struct thread_info		thread_info;
 763#endif
 764	unsigned int			__state;
 765
 766	/* saved state for "spinlock sleepers" */
 767	unsigned int			saved_state;
 768
 769	/*
 770	 * This begins the randomizable portion of task_struct. Only
 771	 * scheduling-critical items should be added above here.
 772	 */
 773	randomized_struct_fields_start
 774
 775	void				*stack;
 776	refcount_t			usage;
 777	/* Per task flags (PF_*), defined further below: */
 778	unsigned int			flags;
 779	unsigned int			ptrace;
 780
 781#ifdef CONFIG_MEM_ALLOC_PROFILING
 782	struct alloc_tag		*alloc_tag;
 783#endif
 784
 785#ifdef CONFIG_SMP
 786	int				on_cpu;
 787	struct __call_single_node	wake_entry;
 788	unsigned int			wakee_flips;
 789	unsigned long			wakee_flip_decay_ts;
 790	struct task_struct		*last_wakee;
 791
 792	/*
 793	 * recent_used_cpu is initially set as the last CPU used by a task
 794	 * that wakes affine another task. Waker/wakee relationships can
 795	 * push tasks around a CPU where each wakeup moves to the next one.
 796	 * Tracking a recently used CPU allows a quick search for a recently
 797	 * used CPU that may be idle.
 798	 */
 799	int				recent_used_cpu;
 800	int				wake_cpu;
 801#endif
 802	int				on_rq;
 803
 804	int				prio;
 805	int				static_prio;
 806	int				normal_prio;
 807	unsigned int			rt_priority;
 808
 809	struct sched_entity		se;
 810	struct sched_rt_entity		rt;
 811	struct sched_dl_entity		dl;
 812	struct sched_dl_entity		*dl_server;
 813	const struct sched_class	*sched_class;
 814
 815#ifdef CONFIG_SCHED_CORE
 816	struct rb_node			core_node;
 817	unsigned long			core_cookie;
 818	unsigned int			core_occupation;
 819#endif
 820
 821#ifdef CONFIG_CGROUP_SCHED
 822	struct task_group		*sched_task_group;
 823#endif
 824
 825
 826#ifdef CONFIG_UCLAMP_TASK
 827	/*
 828	 * Clamp values requested for a scheduling entity.
 829	 * Must be updated with task_rq_lock() held.
 830	 */
 831	struct uclamp_se		uclamp_req[UCLAMP_CNT];
 832	/*
 833	 * Effective clamp values used for a scheduling entity.
 834	 * Must be updated with task_rq_lock() held.
 835	 */
 836	struct uclamp_se		uclamp[UCLAMP_CNT];
 837#endif
 838
 839	struct sched_statistics         stats;
 840
 841#ifdef CONFIG_PREEMPT_NOTIFIERS
 842	/* List of struct preempt_notifier: */
 843	struct hlist_head		preempt_notifiers;
 844#endif
 845
 846#ifdef CONFIG_BLK_DEV_IO_TRACE
 847	unsigned int			btrace_seq;
 848#endif
 849
 850	unsigned int			policy;
 851	unsigned long			max_allowed_capacity;
 852	int				nr_cpus_allowed;
 853	const cpumask_t			*cpus_ptr;
 854	cpumask_t			*user_cpus_ptr;
 855	cpumask_t			cpus_mask;
 856	void				*migration_pending;
 857#ifdef CONFIG_SMP
 858	unsigned short			migration_disabled;
 859#endif
 860	unsigned short			migration_flags;
 861
 862#ifdef CONFIG_PREEMPT_RCU
 863	int				rcu_read_lock_nesting;
 864	union rcu_special		rcu_read_unlock_special;
 865	struct list_head		rcu_node_entry;
 866	struct rcu_node			*rcu_blocked_node;
 867#endif /* #ifdef CONFIG_PREEMPT_RCU */
 868
 869#ifdef CONFIG_TASKS_RCU
 870	unsigned long			rcu_tasks_nvcsw;
 871	u8				rcu_tasks_holdout;
 872	u8				rcu_tasks_idx;
 873	int				rcu_tasks_idle_cpu;
 874	struct list_head		rcu_tasks_holdout_list;
 875	int				rcu_tasks_exit_cpu;
 876	struct list_head		rcu_tasks_exit_list;
 877#endif /* #ifdef CONFIG_TASKS_RCU */
 878
 879#ifdef CONFIG_TASKS_TRACE_RCU
 880	int				trc_reader_nesting;
 881	int				trc_ipi_to_cpu;
 882	union rcu_special		trc_reader_special;
 883	struct list_head		trc_holdout_list;
 884	struct list_head		trc_blkd_node;
 885	int				trc_blkd_cpu;
 886#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 887
 888	struct sched_info		sched_info;
 889
 890	struct list_head		tasks;
 891#ifdef CONFIG_SMP
 892	struct plist_node		pushable_tasks;
 893	struct rb_node			pushable_dl_tasks;
 894#endif
 895
 896	struct mm_struct		*mm;
 897	struct mm_struct		*active_mm;
 898	struct address_space		*faults_disabled_mapping;
 899
 900	int				exit_state;
 901	int				exit_code;
 902	int				exit_signal;
 903	/* The signal sent when the parent dies: */
 904	int				pdeath_signal;
 905	/* JOBCTL_*, siglock protected: */
 906	unsigned long			jobctl;
 907
 908	/* Used for emulating ABI behavior of previous Linux versions: */
 909	unsigned int			personality;
 910
 911	/* Scheduler bits, serialized by scheduler locks: */
 912	unsigned			sched_reset_on_fork:1;
 913	unsigned			sched_contributes_to_load:1;
 914	unsigned			sched_migrated:1;
 915
 916	/* Force alignment to the next boundary: */
 917	unsigned			:0;
 918
 919	/* Unserialized, strictly 'current' */
 920
 921	/*
 922	 * This field must not be in the scheduler word above due to wakelist
 923	 * queueing no longer being serialized by p->on_cpu. However:
 924	 *
 925	 * p->XXX = X;			ttwu()
 926	 * schedule()			  if (p->on_rq && ..) // false
 927	 *   smp_mb__after_spinlock();	  if (smp_load_acquire(&p->on_cpu) && //true
 928	 *   deactivate_task()		      ttwu_queue_wakelist())
 929	 *     p->on_rq = 0;			p->sched_remote_wakeup = Y;
 930	 *
 931	 * guarantees all stores of 'current' are visible before
 932	 * ->sched_remote_wakeup gets used, so it can be in this word.
 933	 */
 934	unsigned			sched_remote_wakeup:1;
 935#ifdef CONFIG_RT_MUTEXES
 936	unsigned			sched_rt_mutex:1;
 937#endif
 938
 939	/* Bit to tell TOMOYO we're in execve(): */
 940	unsigned			in_execve:1;
 941	unsigned			in_iowait:1;
 942#ifndef TIF_RESTORE_SIGMASK
 943	unsigned			restore_sigmask:1;
 944#endif
 945#ifdef CONFIG_MEMCG_V1
 946	unsigned			in_user_fault:1;
 947#endif
 948#ifdef CONFIG_LRU_GEN
 949	/* whether the LRU algorithm may apply to this access */
 950	unsigned			in_lru_fault:1;
 951#endif
 952#ifdef CONFIG_COMPAT_BRK
 953	unsigned			brk_randomized:1;
 954#endif
 955#ifdef CONFIG_CGROUPS
 956	/* disallow userland-initiated cgroup migration */
 957	unsigned			no_cgroup_migration:1;
 958	/* task is frozen/stopped (used by the cgroup freezer) */
 959	unsigned			frozen:1;
 960#endif
 961#ifdef CONFIG_BLK_CGROUP
 962	unsigned			use_memdelay:1;
 963#endif
 964#ifdef CONFIG_PSI
 965	/* Stalled due to lack of memory */
 966	unsigned			in_memstall:1;
 967#endif
 968#ifdef CONFIG_PAGE_OWNER
 969	/* Used by page_owner=on to detect recursion in page tracking. */
 970	unsigned			in_page_owner:1;
 971#endif
 972#ifdef CONFIG_EVENTFD
 973	/* Recursion prevention for eventfd_signal() */
 974	unsigned			in_eventfd:1;
 975#endif
 976#ifdef CONFIG_ARCH_HAS_CPU_PASID
 977	unsigned			pasid_activated:1;
 978#endif
 979#ifdef	CONFIG_CPU_SUP_INTEL
 980	unsigned			reported_split_lock:1;
 981#endif
 982#ifdef CONFIG_TASK_DELAY_ACCT
 983	/* delay due to memory thrashing */
 984	unsigned                        in_thrashing:1;
 985#endif
 986#ifdef CONFIG_PREEMPT_RT
 987	struct netdev_xmit		net_xmit;
 988#endif
 989	unsigned long			atomic_flags; /* Flags requiring atomic access. */
 990
 991	struct restart_block		restart_block;
 992
 993	pid_t				pid;
 994	pid_t				tgid;
 995
 996#ifdef CONFIG_STACKPROTECTOR
 997	/* Canary value for the -fstack-protector GCC feature: */
 998	unsigned long			stack_canary;
 999#endif
1000	/*
1001	 * Pointers to the (original) parent process, youngest child, younger sibling,
1002	 * older sibling, respectively.  (p->father can be replaced with
1003	 * p->real_parent->pid)
1004	 */
1005
1006	/* Real parent process: */
1007	struct task_struct __rcu	*real_parent;
1008
1009	/* Recipient of SIGCHLD, wait4() reports: */
1010	struct task_struct __rcu	*parent;
1011
1012	/*
1013	 * Children/sibling form the list of natural children:
1014	 */
1015	struct list_head		children;
1016	struct list_head		sibling;
1017	struct task_struct		*group_leader;
1018
1019	/*
1020	 * 'ptraced' is the list of tasks this task is using ptrace() on.
1021	 *
1022	 * This includes both natural children and PTRACE_ATTACH targets.
1023	 * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
1024	 */
1025	struct list_head		ptraced;
1026	struct list_head		ptrace_entry;
1027
1028	/* PID/PID hash table linkage. */
1029	struct pid			*thread_pid;
1030	struct hlist_node		pid_links[PIDTYPE_MAX];
1031	struct list_head		thread_node;
1032
1033	struct completion		*vfork_done;
1034
1035	/* CLONE_CHILD_SETTID: */
1036	int __user			*set_child_tid;
1037
1038	/* CLONE_CHILD_CLEARTID: */
1039	int __user			*clear_child_tid;
1040
1041	/* PF_KTHREAD | PF_IO_WORKER */
1042	void				*worker_private;
1043
1044	u64				utime;
1045	u64				stime;
1046#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
1047	u64				utimescaled;
1048	u64				stimescaled;
1049#endif
1050	u64				gtime;
1051	struct prev_cputime		prev_cputime;
1052#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1053	struct vtime			vtime;
1054#endif
1055
1056#ifdef CONFIG_NO_HZ_FULL
1057	atomic_t			tick_dep_mask;
1058#endif
1059	/* Context switch counts: */
1060	unsigned long			nvcsw;
1061	unsigned long			nivcsw;
1062
1063	/* Monotonic time in nsecs: */
1064	u64				start_time;
1065
1066	/* Boot based time in nsecs: */
1067	u64				start_boottime;
1068
1069	/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
1070	unsigned long			min_flt;
1071	unsigned long			maj_flt;
1072
1073	/* Empty if CONFIG_POSIX_CPUTIMERS=n */
1074	struct posix_cputimers		posix_cputimers;
1075
1076#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
1077	struct posix_cputimers_work	posix_cputimers_work;
1078#endif
1079
1080	/* Process credentials: */
1081
1082	/* Tracer's credentials at attach: */
1083	const struct cred __rcu		*ptracer_cred;
1084
1085	/* Objective and real subjective task credentials (COW): */
1086	const struct cred __rcu		*real_cred;
1087
1088	/* Effective (overridable) subjective task credentials (COW): */
1089	const struct cred __rcu		*cred;
1090
1091#ifdef CONFIG_KEYS
1092	/* Cached requested key. */
1093	struct key			*cached_requested_key;
1094#endif
1095
1096	/*
1097	 * executable name, excluding path.
1098	 *
 1099	 * - normally initialized by setup_new_exec()
1100	 * - access it with [gs]et_task_comm()
1101	 * - lock it with task_lock()
1102	 */
1103	char				comm[TASK_COMM_LEN];
1104
1105	struct nameidata		*nameidata;
1106
1107#ifdef CONFIG_SYSVIPC
1108	struct sysv_sem			sysvsem;
1109	struct sysv_shm			sysvshm;
1110#endif
1111#ifdef CONFIG_DETECT_HUNG_TASK
1112	unsigned long			last_switch_count;
1113	unsigned long			last_switch_time;
1114#endif
1115	/* Filesystem information: */
1116	struct fs_struct		*fs;
1117
1118	/* Open file information: */
1119	struct files_struct		*files;
1120
1121#ifdef CONFIG_IO_URING
1122	struct io_uring_task		*io_uring;
1123#endif
1124
1125	/* Namespaces: */
1126	struct nsproxy			*nsproxy;
1127
1128	/* Signal handlers: */
1129	struct signal_struct		*signal;
1130	struct sighand_struct __rcu		*sighand;
1131	sigset_t			blocked;
1132	sigset_t			real_blocked;
1133	/* Restored if set_restore_sigmask() was used: */
1134	sigset_t			saved_sigmask;
1135	struct sigpending		pending;
1136	unsigned long			sas_ss_sp;
1137	size_t				sas_ss_size;
1138	unsigned int			sas_ss_flags;
1139
1140	struct callback_head		*task_works;
1141
1142#ifdef CONFIG_AUDIT
1143#ifdef CONFIG_AUDITSYSCALL
1144	struct audit_context		*audit_context;
1145#endif
1146	kuid_t				loginuid;
1147	unsigned int			sessionid;
1148#endif
1149	struct seccomp			seccomp;
1150	struct syscall_user_dispatch	syscall_dispatch;
1151
1152	/* Thread group tracking: */
1153	u64				parent_exec_id;
1154	u64				self_exec_id;
1155
1156	/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
1157	spinlock_t			alloc_lock;
1158
1159	/* Protection of the PI data structures: */
1160	raw_spinlock_t			pi_lock;
1161
1162	struct wake_q_node		wake_q;
1163
1164#ifdef CONFIG_RT_MUTEXES
1165	/* PI waiters blocked on a rt_mutex held by this task: */
1166	struct rb_root_cached		pi_waiters;
1167	/* Updated under owner's pi_lock and rq lock */
1168	struct task_struct		*pi_top_task;
1169	/* Deadlock detection and priority inheritance handling: */
1170	struct rt_mutex_waiter		*pi_blocked_on;
1171#endif
1172
1173#ifdef CONFIG_DEBUG_MUTEXES
1174	/* Mutex deadlock detection: */
1175	struct mutex_waiter		*blocked_on;
1176#endif
1177
1178#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
1179	int				non_block_count;
1180#endif
1181
1182#ifdef CONFIG_TRACE_IRQFLAGS
1183	struct irqtrace_events		irqtrace;
1184	unsigned int			hardirq_threaded;
1185	u64				hardirq_chain_key;
1186	int				softirqs_enabled;
1187	int				softirq_context;
1188	int				irq_config;
1189#endif
1190#ifdef CONFIG_PREEMPT_RT
1191	int				softirq_disable_cnt;
1192#endif
1193
1194#ifdef CONFIG_LOCKDEP
1195# define MAX_LOCK_DEPTH			48UL
1196	u64				curr_chain_key;
1197	int				lockdep_depth;
1198	unsigned int			lockdep_recursion;
1199	struct held_lock		held_locks[MAX_LOCK_DEPTH];
1200#endif
1201
1202#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
1203	unsigned int			in_ubsan;
1204#endif
1205
1206	/* Journalling filesystem info: */
1207	void				*journal_info;
1208
1209	/* Stacked block device info: */
1210	struct bio_list			*bio_list;
1211
1212	/* Stack plugging: */
1213	struct blk_plug			*plug;
1214
1215	/* VM state: */
1216	struct reclaim_state		*reclaim_state;
1217
1218	struct io_context		*io_context;
1219
1220#ifdef CONFIG_COMPACTION
1221	struct capture_control		*capture_control;
1222#endif
1223	/* Ptrace state: */
1224	unsigned long			ptrace_message;
1225	kernel_siginfo_t		*last_siginfo;
1226
1227	struct task_io_accounting	ioac;
1228#ifdef CONFIG_PSI
1229	/* Pressure stall state */
1230	unsigned int			psi_flags;
1231#endif
1232#ifdef CONFIG_TASK_XACCT
1233	/* Accumulated RSS usage: */
1234	u64				acct_rss_mem1;
1235	/* Accumulated virtual memory usage: */
1236	u64				acct_vm_mem1;
1237	/* stime + utime since last update: */
1238	u64				acct_timexpd;
1239#endif
1240#ifdef CONFIG_CPUSETS
1241	/* Protected by ->alloc_lock: */
1242	nodemask_t			mems_allowed;
1243	/* Sequence number to catch updates: */
1244	seqcount_spinlock_t		mems_allowed_seq;
1245	int				cpuset_mem_spread_rotor;
1246	int				cpuset_slab_spread_rotor;
1247#endif
1248#ifdef CONFIG_CGROUPS
1249	/* Control Group info protected by css_set_lock: */
1250	struct css_set __rcu		*cgroups;
1251	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
1252	struct list_head		cg_list;
1253#endif
1254#ifdef CONFIG_X86_CPU_RESCTRL
1255	u32				closid;
1256	u32				rmid;
1257#endif
1258#ifdef CONFIG_FUTEX
1259	struct robust_list_head __user	*robust_list;
1260#ifdef CONFIG_COMPAT
1261	struct compat_robust_list_head __user *compat_robust_list;
1262#endif
1263	struct list_head		pi_state_list;
1264	struct futex_pi_state		*pi_state_cache;
1265	struct mutex			futex_exit_mutex;
1266	unsigned int			futex_state;
1267#endif
1268#ifdef CONFIG_PERF_EVENTS
1269	u8				perf_recursion[PERF_NR_CONTEXTS];
1270	struct perf_event_context	*perf_event_ctxp;
1271	struct mutex			perf_event_mutex;
1272	struct list_head		perf_event_list;
1273#endif
1274#ifdef CONFIG_DEBUG_PREEMPT
1275	unsigned long			preempt_disable_ip;
1276#endif
1277#ifdef CONFIG_NUMA
1278	/* Protected by alloc_lock: */
1279	struct mempolicy		*mempolicy;
1280	short				il_prev;
1281	u8				il_weight;
1282	short				pref_node_fork;
1283#endif
1284#ifdef CONFIG_NUMA_BALANCING
1285	int				numa_scan_seq;
1286	unsigned int			numa_scan_period;
1287	unsigned int			numa_scan_period_max;
1288	int				numa_preferred_nid;
1289	unsigned long			numa_migrate_retry;
1290	/* Migration stamp: */
1291	u64				node_stamp;
1292	u64				last_task_numa_placement;
1293	u64				last_sum_exec_runtime;
1294	struct callback_head		numa_work;
1295
1296	/*
1297	 * This pointer is only modified for current in syscall and
1298	 * pagefault context (and for tasks being destroyed), so it can be read
1299	 * from any of the following contexts:
1300	 *  - RCU read-side critical section
1301	 *  - current->numa_group from everywhere
1302	 *  - task's runqueue locked, task not running
1303	 */
1304	struct numa_group __rcu		*numa_group;
1305
1306	/*
1307	 * numa_faults is an array split into four regions:
1308	 * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
1309	 * in this precise order.
1310	 *
1311	 * faults_memory: Exponential decaying average of faults on a per-node
1312	 * basis. Scheduling placement decisions are made based on these
1313	 * counts. The values remain static for the duration of a PTE scan.
1314	 * faults_cpu: Track the nodes the process was running on when a NUMA
1315	 * hinting fault was incurred.
1316	 * faults_memory_buffer and faults_cpu_buffer: Record faults per node
1317	 * during the current scan window. When the scan completes, the counts
1318	 * in faults_memory and faults_cpu decay and these values are copied.
1319	 */
1320	unsigned long			*numa_faults;
1321	unsigned long			total_numa_faults;
1322
1323	/*
1324	 * numa_faults_locality tracks if faults recorded during the last
1325	 * scan window were remote/local or failed to migrate. The task scan
1326	 * period is adapted based on the locality of the faults with different
1327	 * weights depending on whether they were shared or private faults
1328	 */
1329	unsigned long			numa_faults_locality[3];
1330
1331	unsigned long			numa_pages_migrated;
1332#endif /* CONFIG_NUMA_BALANCING */
1333
1334#ifdef CONFIG_RSEQ
1335	struct rseq __user *rseq;
1336	u32 rseq_len;
1337	u32 rseq_sig;
1338	/*
1339	 * RmW on rseq_event_mask must be performed atomically
1340	 * with respect to preemption.
1341	 */
1342	unsigned long rseq_event_mask;
1343#endif
1344
1345#ifdef CONFIG_SCHED_MM_CID
1346	int				mm_cid;		/* Current cid in mm */
1347	int				last_mm_cid;	/* Most recent cid in mm */
1348	int				migrate_from_cpu;
1349	int				mm_cid_active;	/* Whether cid bitmap is active */
1350	struct callback_head		cid_work;
1351#endif
1352
1353	struct tlbflush_unmap_batch	tlb_ubc;
1354
1355	/* Cache last used pipe for splice(): */
1356	struct pipe_inode_info		*splice_pipe;
1357
1358	struct page_frag		task_frag;
1359
1360#ifdef CONFIG_TASK_DELAY_ACCT
1361	struct task_delay_info		*delays;
1362#endif
1363
1364#ifdef CONFIG_FAULT_INJECTION
1365	int				make_it_fail;
1366	unsigned int			fail_nth;
1367#endif
1368	/*
1369	 * When (nr_dirtied >= nr_dirtied_pause), it's time to call
1370	 * balance_dirty_pages() for a dirty throttling pause:
1371	 */
1372	int				nr_dirtied;
1373	int				nr_dirtied_pause;
1374	/* Start of a write-and-pause period: */
1375	unsigned long			dirty_paused_when;
1376
1377#ifdef CONFIG_LATENCYTOP
1378	int				latency_record_count;
1379	struct latency_record		latency_record[LT_SAVECOUNT];
1380#endif
1381	/*
1382	 * Time slack values; these are used to round up poll() and
1383	 * select() etc timeout values. These are in nanoseconds.
1384	 */
1385	u64				timer_slack_ns;
1386	u64				default_timer_slack_ns;
1387
1388#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
1389	unsigned int			kasan_depth;
1390#endif
1391
1392#ifdef CONFIG_KCSAN
1393	struct kcsan_ctx		kcsan_ctx;
1394#ifdef CONFIG_TRACE_IRQFLAGS
1395	struct irqtrace_events		kcsan_save_irqtrace;
1396#endif
1397#ifdef CONFIG_KCSAN_WEAK_MEMORY
1398	int				kcsan_stack_depth;
1399#endif
1400#endif
1401
1402#ifdef CONFIG_KMSAN
1403	struct kmsan_ctx		kmsan_ctx;
1404#endif
1405
1406#if IS_ENABLED(CONFIG_KUNIT)
1407	struct kunit			*kunit_test;
1408#endif
1409
1410#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1411	/* Index of current stored address in ret_stack: */
1412	int				curr_ret_stack;
1413	int				curr_ret_depth;
1414
1415	/* Stack of return addresses for return function tracing: */
1416	unsigned long			*ret_stack;
1417
1418	/* Timestamp for last schedule: */
1419	unsigned long long		ftrace_timestamp;
1420
1421	/*
1422	 * Number of functions that haven't been traced
1423	 * because of depth overrun:
1424	 */
1425	atomic_t			trace_overrun;
1426
1427	/* Pause tracing: */
1428	atomic_t			tracing_graph_pause;
1429#endif
1430
1431#ifdef CONFIG_TRACING
1432	/* Bitmask and counter of trace recursion: */
1433	unsigned long			trace_recursion;
1434#endif /* CONFIG_TRACING */
1435
1436#ifdef CONFIG_KCOV
1437	/* See kernel/kcov.c for more details. */
1438
1439	/* Coverage collection mode enabled for this task (0 if disabled): */
1440	unsigned int			kcov_mode;
1441
1442	/* Size of the kcov_area: */
1443	unsigned int			kcov_size;
1444
1445	/* Buffer for coverage collection: */
1446	void				*kcov_area;
1447
1448	/* KCOV descriptor wired with this task or NULL: */
1449	struct kcov			*kcov;
1450
1451	/* KCOV common handle for remote coverage collection: */
1452	u64				kcov_handle;
1453
1454	/* KCOV sequence number: */
1455	int				kcov_sequence;
1456
1457	/* Collect coverage from softirq context: */
1458	unsigned int			kcov_softirq;
1459#endif
1460
1461#ifdef CONFIG_MEMCG_V1
1462	struct mem_cgroup		*memcg_in_oom;
1463#endif
1464
1465#ifdef CONFIG_MEMCG
1466	/* Number of pages to reclaim on returning to userland: */
1467	unsigned int			memcg_nr_pages_over_high;
1468
1469	/* Used by memcontrol for targeted memcg charge: */
1470	struct mem_cgroup		*active_memcg;
1471
1472	/* Cache for current->cgroups->memcg->objcg lookups: */
1473	struct obj_cgroup		*objcg;
1474#endif
1475
1476#ifdef CONFIG_BLK_CGROUP
1477	struct gendisk			*throttle_disk;
1478#endif
1479
1480#ifdef CONFIG_UPROBES
1481	struct uprobe_task		*utask;
1482#endif
1483#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
1484	unsigned int			sequential_io;
1485	unsigned int			sequential_io_avg;
1486#endif
1487	struct kmap_ctrl		kmap_ctrl;
1488#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
1489	unsigned long			task_state_change;
1490# ifdef CONFIG_PREEMPT_RT
1491	unsigned long			saved_state_change;
1492# endif
1493#endif
1494	struct rcu_head			rcu;
1495	refcount_t			rcu_users;
1496	int				pagefault_disabled;
1497#ifdef CONFIG_MMU
1498	struct task_struct		*oom_reaper_list;
1499	struct timer_list		oom_reaper_timer;
1500#endif
1501#ifdef CONFIG_VMAP_STACK
1502	struct vm_struct		*stack_vm_area;
1503#endif
1504#ifdef CONFIG_THREAD_INFO_IN_TASK
1505	/* A live task holds one reference: */
1506	refcount_t			stack_refcount;
1507#endif
1508#ifdef CONFIG_LIVEPATCH
1509	int patch_state;
1510#endif
1511#ifdef CONFIG_SECURITY
1512	/* Used by LSM modules for access restriction: */
1513	void				*security;
1514#endif
1515#ifdef CONFIG_BPF_SYSCALL
1516	/* Used by BPF task local storage */
1517	struct bpf_local_storage __rcu	*bpf_storage;
1518	/* Used for BPF run context */
1519	struct bpf_run_ctx		*bpf_ctx;
1520#endif
1521	/* Used by BPF for per-TASK xdp storage */
1522	struct bpf_net_context		*bpf_net_context;
1523
1524#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
1525	unsigned long			lowest_stack;
1526	unsigned long			prev_lowest_stack;
1527#endif
1528
1529#ifdef CONFIG_X86_MCE
1530	void __user			*mce_vaddr;
1531	__u64				mce_kflags;
1532	u64				mce_addr;
1533	__u64				mce_ripv : 1,
1534					mce_whole_page : 1,
1535					__mce_reserved : 62;
1536	struct callback_head		mce_kill_me;
1537	int				mce_count;
1538#endif
1539
1540#ifdef CONFIG_KRETPROBES
1541	struct llist_head               kretprobe_instances;
1542#endif
1543#ifdef CONFIG_RETHOOK
1544	struct llist_head               rethooks;
1545#endif
1546
1547#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH
1548	/*
1549	 * If L1D flush is supported on mm context switch
1550	 * then we use this callback head to queue kill work
1551	 * to kill tasks that are not running on SMT disabled
1552	 * cores
1553	 */
1554	struct callback_head		l1d_flush_kill;
1555#endif
1556
1557#ifdef CONFIG_RV
1558	/*
1559	 * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS.
1560	 * If we find justification for more monitors, we can think
1561	 * about adding more or developing a dynamic method. So far,
1562	 * none of these are justified.
1563	 */
1564	union rv_task_monitor		rv[RV_PER_TASK_MONITORS];
1565#endif
1566
1567#ifdef CONFIG_USER_EVENTS
1568	struct user_event_mm		*user_event_mm;
1569#endif
1570
1571	/*
1572	 * New fields for task_struct should be added above here, so that
1573	 * they are included in the randomized portion of task_struct.
1574	 */
1575	randomized_struct_fields_end
1576
1577	/* CPU-specific state of this task: */
1578	struct thread_struct		thread;
1579
1580	/*
1581	 * WARNING: on x86, 'thread_struct' contains a variable-sized
1582	 * structure.  It *MUST* be at the end of 'task_struct'.
1583	 *
1584	 * Do not put anything below here!
1585	 */
1586};
1587
 /*
  * TASK_REPORT_IDLE is the value __task_state_index() substitutes for tasks
  * whose state matches TASK_IDLE.  TASK_REPORT_MAX is that value shifted left
  * by one; __task_state_index() build-checks that it is a power of two.
  */
 1588#define TASK_REPORT_IDLE	(TASK_REPORT + 1)
 1589#define TASK_REPORT_MAX		(TASK_REPORT_IDLE << 1)
1590
1591static inline unsigned int __task_state_index(unsigned int tsk_state,
1592					      unsigned int tsk_exit_state)
1593{
1594	unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT;
1595
1596	BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
1597
1598	if ((tsk_state & TASK_IDLE) == TASK_IDLE)
1599		state = TASK_REPORT_IDLE;
1600
1601	/*
1602	 * We're lying here, but rather than expose a completely new task state
1603	 * to userspace, we can make this appear as if the task has gone through
1604	 * a regular rt_mutex_lock() call.
1605	 */
1606	if (tsk_state & TASK_RTLOCK_WAIT)
1607		state = TASK_UNINTERRUPTIBLE;
1608
1609	return fls(state);
1610}
1611
1612static inline unsigned int task_state_index(struct task_struct *tsk)
1613{
1614	return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state);
1615}
1616
1617static inline char task_index_to_char(unsigned int state)
1618{
1619	static const char state_char[] = "RSDTtXZPI";
1620
1621	BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1));
1622
1623	return state_char[state];
1624}
1625
/* Single-letter state of @tsk: its report index looked up in the letter table. */
static inline char task_state_to_char(struct task_struct *tsk)
{
	unsigned int idx = task_state_index(tsk);

	return task_index_to_char(idx);
}
1630
1631extern struct pid *cad_pid;
1632
1633/*
1634 * Per process flags
1635 */
 1636#define PF_VCPU			0x00000001	/* I'm a virtual CPU */
 1637#define PF_IDLE			0x00000002	/* I am an IDLE thread */
 1638#define PF_EXITING		0x00000004	/* Getting shut down */
 1639#define PF_POSTCOREDUMP		0x00000008	/* Coredumps should ignore this task */
 1640#define PF_IO_WORKER		0x00000010	/* Task is an IO worker */
 1641#define PF_WQ_WORKER		0x00000020	/* I'm a workqueue worker */
 1642#define PF_FORKNOEXEC		0x00000040	/* Forked but didn't exec */
 1643#define PF_MCE_PROCESS		0x00000080      /* Process policy on mce errors */
 1644#define PF_SUPERPRIV		0x00000100	/* Used super-user privileges */
 1645#define PF_DUMPCORE		0x00000200	/* Dumped core */
 1646#define PF_SIGNALED		0x00000400	/* Killed by a signal */
 1647#define PF_MEMALLOC		0x00000800	/* Allocating memory to free memory. See memalloc_noreclaim_save() */
 1648#define PF_NPROC_EXCEEDED	0x00001000	/* set_user() noticed that RLIMIT_NPROC was exceeded */
 1649#define PF_USED_MATH		0x00002000	/* If unset the fpu must be initialized before use */
 1650#define PF_USER_WORKER		0x00004000	/* Kernel thread cloned from userspace thread */
 1651#define PF_NOFREEZE		0x00008000	/* This thread should not be frozen */
 1652#define PF__HOLE__00010000	0x00010000
 1653#define PF_KSWAPD		0x00020000	/* I am kswapd */
 1654#define PF_MEMALLOC_NOFS	0x00040000	/* All allocations inherit GFP_NOFS. See memalloc_nofs_save() */
 1655#define PF_MEMALLOC_NOIO	0x00080000	/* All allocations inherit GFP_NOIO. See memalloc_noio_save() */
 1656#define PF_LOCAL_THROTTLE	0x00100000	/* Throttle writes only against the bdi I write to,
 1657						 * I am cleaning dirty pages from some other bdi. */
 1658#define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 1659#define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
 1660#define PF_MEMALLOC_NORECLAIM	0x00800000	/* All allocation requests will clear __GFP_DIRECT_RECLAIM */
 1661#define PF_MEMALLOC_NOWARN	0x01000000	/* All allocation requests will inherit __GFP_NOWARN */
 1662#define PF__HOLE__02000000	0x02000000
 1663#define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 1664#define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
 1665#define PF_MEMALLOC_PIN		0x10000000	/* Allocations constrained to zones which allow long term pinning.
 1666						 * See memalloc_pin_save() */
 1667#define PF_BLOCK_TS		0x20000000	/* plug has ts that needs updating */
 1668#define PF__HOLE__40000000	0x40000000
 1669#define PF_SUSPEND_TASK		0x80000000      /* This thread called freeze_processes() and should not be frozen */
1670
1671/*
1672 * Only the _current_ task can read/write to tsk->flags, but other
1673 * tasks can access tsk->flags in readonly mode for example
1674 * with tsk_used_math (like during threaded core dumping).
1675 * There is however an exception to this rule during ptrace
1676 * or during fork: the ptracer task is allowed to write to the
1677 * child->flags of its traced child (same goes for fork, the parent
1678 * can write to the child->flags), because we're guaranteed the
1679 * child is not running and in turn not changing child->flags
1680 * at the same time the parent does it.
1681 */
 /* Clear/set PF_USED_MATH on @child; the *_used_math() forms act on current. */
 1682#define clear_stopped_child_used_math(child)	do { (child)->flags &= ~PF_USED_MATH; } while (0)
 1683#define set_stopped_child_used_math(child)	do { (child)->flags |= PF_USED_MATH; } while (0)
 1684#define clear_used_math()			clear_stopped_child_used_math(current)
 1685#define set_used_math()				set_stopped_child_used_math(current)
 1686
 /* Clear PF_USED_MATH on @child, then set it again iff @condition is non-zero. */
 1687#define conditional_stopped_child_used_math(condition, child) \
 1688	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
 1689
 1690#define conditional_used_math(condition)	conditional_stopped_child_used_math(condition, current)
 1691
 /* Make @child's PF_USED_MATH bit equal to current's. */
 1692#define copy_to_stopped_child_used_math(child) \
 1693	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
 1694
 1695/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
 1696#define tsk_used_math(p)			((p)->flags & PF_USED_MATH)
 1697#define used_math()				tsk_used_math(current)
1698
1699static __always_inline bool is_percpu_thread(void)
1700{
1701#ifdef CONFIG_SMP
1702	return (current->flags & PF_NO_SETAFFINITY) &&
1703		(current->nr_cpus_allowed  == 1);
1704#else
1705	return true;
1706#endif
1707}
1708
 1709/* Per-process atomic flags: bit numbers within task_struct::atomic_flags. */
 1710#define PFA_NO_NEW_PRIVS		0	/* May not gain new privileges. */
 1711#define PFA_SPREAD_PAGE			1	/* Spread page cache over cpuset */
 1712#define PFA_SPREAD_SLAB			2	/* Spread some slab caches over cpuset */
 1713#define PFA_SPEC_SSB_DISABLE		3	/* Speculative Store Bypass disabled */
 1714#define PFA_SPEC_SSB_FORCE_DISABLE	4	/* Speculative Store Bypass force disabled */
 1715#define PFA_SPEC_IB_DISABLE		5	/* Indirect branch speculation restricted */
 1716#define PFA_SPEC_IB_FORCE_DISABLE	6	/* Indirect branch speculation permanently restricted */
 1717#define PFA_SPEC_SSB_NOEXEC		7	/* Speculative Store Bypass clear on execve() */
1718
 /*
  * Generators for the per-flag accessors.  For a flag PFA_<name> and a
  * function suffix <func> they emit:
  *   TASK_PFA_TEST  -> bool task_<func>(p)       via test_bit()
  *   TASK_PFA_SET   -> void task_set_<func>(p)   via set_bit()
  *   TASK_PFA_CLEAR -> void task_clear_<func>(p) via clear_bit()
  * all operating on p->atomic_flags.
  */
 1719#define TASK_PFA_TEST(name, func)					\
 1720	static inline bool task_##func(struct task_struct *p)		\
 1721	{ return test_bit(PFA_##name, &p->atomic_flags); }
 1722
 1723#define TASK_PFA_SET(name, func)					\
 1724	static inline void task_set_##func(struct task_struct *p)	\
 1725	{ set_bit(PFA_##name, &p->atomic_flags); }
 1726
 1727#define TASK_PFA_CLEAR(name, func)					\
 1728	static inline void task_clear_##func(struct task_struct *p)	\
 1729	{ clear_bit(PFA_##name, &p->atomic_flags); }
1730
 /*
  * Accessor instantiations, one group per PFA_* flag.  Note that
  * NO_NEW_PRIVS, SPEC_SSB_FORCE_DISABLE and SPEC_IB_FORCE_DISABLE get
  * test and set helpers only: no task_clear_*() is generated for them here.
  */
 1731TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
 1732TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)
 1733
 1734TASK_PFA_TEST(SPREAD_PAGE, spread_page)
 1735TASK_PFA_SET(SPREAD_PAGE, spread_page)
 1736TASK_PFA_CLEAR(SPREAD_PAGE, spread_page)
 1737
 1738TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
 1739TASK_PFA_SET(SPREAD_SLAB, spread_slab)
 1740TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
 1741
 1742TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable)
 1743TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable)
 1744TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)
 1745
 1746TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec)
 1747TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec)
 1748TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec)
 1749
 1750TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
 1751TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
 1752
 1753TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
 1754TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
 1755TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)
 1756
 1757TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
 1758TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
1759
1760static inline void
1761current_restore_flags(unsigned long orig_flags, unsigned long flags)
1762{
1763	current->flags &= ~flags;
1764	current->flags |= orig_flags & flags;
1765}
1766
1767extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
1768extern int task_can_attach(struct task_struct *p);
1769extern int dl_bw_alloc(int cpu, u64 dl_bw);
1770extern void dl_bw_free(int cpu, u64 dl_bw);
1771#ifdef CONFIG_SMP
1772
1773/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
1774extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
1775
1776/**
1777 * set_cpus_allowed_ptr - set CPU affinity mask of a task
1778 * @p: the task
1779 * @new_mask: CPU affinity mask
1780 *
1781 * Return: zero if successful, or a negative error code
1782 */
1783extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
1784extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
1785extern void release_user_cpus_ptr(struct task_struct *p);
1786extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
1787extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
1788extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
1789#else
/* No-op stub used when CONFIG_SMP is not set. */
static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { }
1793static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1794{
1795	/* Opencoded cpumask_test_cpu(0, new_mask) to avoid dependency on cpumask.h */
1796	if ((*cpumask_bits(new_mask) & 1) == 0)
1797		return -EINVAL;
1798	return 0;
1799}
1800static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
1801{
1802	if (src->user_cpus_ptr)
1803		return -EINVAL;
1804	return 0;
1805}
 /* !CONFIG_SMP stub: releases nothing, only warns if @p has user_cpus_ptr set. */
 1806static inline void release_user_cpus_ptr(struct task_struct *p)
 1807{
 1808	WARN_ON(p->user_cpus_ptr);
 1809}
1810
 /* !CONFIG_SMP stub: performs no check and always returns 0. */
 1811static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
 1812{
 1813	return 0;
 1814}
1815#endif
1816
1817extern int yield_to(struct task_struct *p, bool preempt);
1818extern void set_user_nice(struct task_struct *p, long nice);
1819extern int task_prio(const struct task_struct *p);
1820
1821/**
1822 * task_nice - return the nice value of a given task.
1823 * @p: the task in question.
1824 *
1825 * Return: The nice value [ -20 ... 0 ... 19 ].
1826 */
1827static inline int task_nice(const struct task_struct *p)
1828{
1829	return PRIO_TO_NICE((p)->static_prio);
1830}
1831
1832extern int can_nice(const struct task_struct *p, const int nice);
1833extern int task_curr(const struct task_struct *p);
1834extern int idle_cpu(int cpu);
1835extern int available_idle_cpu(int cpu);
1836extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
1837extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
1838extern void sched_set_fifo(struct task_struct *p);
1839extern void sched_set_fifo_low(struct task_struct *p);
1840extern void sched_set_normal(struct task_struct *p, int nice);
1841extern int sched_setattr(struct task_struct *, const struct sched_attr *);
1842extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
1843extern struct task_struct *idle_task(int cpu);
1844
1845/**
1846 * is_idle_task - is the specified task an idle task?
1847 * @p: the task in question.
1848 *
1849 * Return: 1 if @p is an idle task. 0 otherwise.
1850 */
1851static __always_inline bool is_idle_task(const struct task_struct *p)
1852{
1853	return !!(p->flags & PF_IDLE);
1854}
1855
1856extern struct task_struct *curr_task(int cpu);
1857extern void ia64_set_curr_task(int cpu, struct task_struct *p);
1858
1859void yield(void);
1860
 /*
  * Overlay of a task_struct (and, when thread_info is not embedded in
  * task_struct, a thread_info) with a THREAD_SIZE-byte array of longs
  * serving as the stack.
  */
 1861union thread_union {
 1862	struct task_struct task;
 1863#ifndef CONFIG_THREAD_INFO_IN_TASK
 1864	struct thread_info thread_info;
 1865#endif
 1866	unsigned long stack[THREAD_SIZE/sizeof(long)];
 1867};
1868
1869#ifndef CONFIG_THREAD_INFO_IN_TASK
1870extern struct thread_info init_thread_info;
1871#endif
1872
1873extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];
1874
 /*
  * task_thread_info(): locate a task's thread_info.  With
  * CONFIG_THREAD_INFO_IN_TASK it is the task's own ->thread_info member;
  * otherwise, unless __HAVE_THREAD_FUNCTIONS is defined, it is whatever
  * task->stack points to.
  */
 1875#ifdef CONFIG_THREAD_INFO_IN_TASK
 1876# define task_thread_info(task)	(&(task)->thread_info)
 1877#elif !defined(__HAVE_THREAD_FUNCTIONS)
 1878# define task_thread_info(task)	((struct thread_info *)(task)->stack)
 1879#endif
1880
1881/*
1882 * find a task by one of its numerical ids
1883 *
1884 * find_task_by_pid_ns():
1885 *      finds a task by its pid in the specified namespace
1886 * find_task_by_vpid():
1887 *      finds a task by its virtual pid
1888 *
1889 * see also find_vpid() etc in include/linux/pid.h
1890 */
1891
1892extern struct task_struct *find_task_by_vpid(pid_t nr);
1893extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns);
1894
1895/*
1896 * find a task by its virtual pid and get the task struct
1897 */
1898extern struct task_struct *find_get_task_by_vpid(pid_t nr);
1899
1900extern int wake_up_state(struct task_struct *tsk, unsigned int state);
1901extern int wake_up_process(struct task_struct *tsk);
1902extern void wake_up_new_task(struct task_struct *tsk);
1903
1904#ifdef CONFIG_SMP
1905extern void kick_process(struct task_struct *tsk);
1906#else
/* No-op stub used when CONFIG_SMP is not set. */
static inline void kick_process(struct task_struct *tsk)
{
}
1908#endif
1909
1910extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
1911
1912static inline void set_task_comm(struct task_struct *tsk, const char *from)
1913{
1914	__set_task_comm(tsk, from, false);
1915}
1916
1917extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
 /*
  * get_task_comm(): copy @tsk's comm into @buf via __get_task_comm().
  * sizeof(@buf) must equal TASK_COMM_LEN, which is checked at build time;
  * the expression evaluates to __get_task_comm()'s return value.
  */
 1918#define get_task_comm(buf, tsk) ({			\
 1919	BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN);	\
 1920	__get_task_comm(buf, sizeof(buf), tsk);		\
 1921})
1922
#if defined(CONFIG_SMP)
/*
 * scheduler_ipi() - fold TIF_NEED_RESCHED into the preempt_count.
 *
 * Anybody setting TIF_NEED_RESCHED remotely (for the first time) will
 * also send this IPI.
 */
static __always_inline void scheduler_ipi(void)
{
	preempt_fold_need_resched();
}
#else
/* Without CONFIG_SMP there is nothing to do. */
static inline void scheduler_ipi(void)
{
}
#endif
1936
/*
 * wait_task_inactive() - implemented outside this header; @match_state is
 * forwarded as given (see the implementation for its exact semantics).
 */
unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state);
1938
/*
 * Thread-flag accessors that operate on another task's thread_info.
 * See asm/thread_info.h for the TIF_xxxx flags available.
 */

/* Hand @flag to set_ti_thread_flag() for @tsk's thread_info. */
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	struct thread_info *ti = task_thread_info(tsk);

	set_ti_thread_flag(ti, flag);
}
1947
/* Hand @flag to clear_ti_thread_flag() for @tsk's thread_info. */
static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	struct thread_info *ti = task_thread_info(tsk);

	clear_ti_thread_flag(ti, flag);
}
1952
/*
 * Forward @flag and @value to update_ti_thread_flag() for @tsk's
 * thread_info.
 */
static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag,
					  bool value)
{
	struct thread_info *ti = task_thread_info(tsk);

	update_ti_thread_flag(ti, flag, value);
}
1958
/*
 * Apply test_and_set_ti_thread_flag() to @tsk's thread_info and return
 * its result unchanged.
 */
static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	struct thread_info *ti = task_thread_info(tsk);

	return test_and_set_ti_thread_flag(ti, flag);
}
1963
/*
 * Apply test_and_clear_ti_thread_flag() to @tsk's thread_info and return
 * its result unchanged.
 */
static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	struct thread_info *ti = task_thread_info(tsk);

	return test_and_clear_ti_thread_flag(ti, flag);
}
1968
/*
 * Apply test_ti_thread_flag() to @tsk's thread_info and return its
 * result unchanged.
 */
static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
{
	struct thread_info *ti = task_thread_info(tsk);

	return test_ti_thread_flag(ti, flag);
}
1973
1974static inline void set_tsk_need_resched(struct task_struct *tsk)
1975{
1976	set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
1977}
1978
1979static inline void clear_tsk_need_resched(struct task_struct *tsk)
1980{
1981	clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
1982}
1983
1984static inline int test_tsk_need_resched(struct task_struct *tsk)
1985{
1986	return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
1987}
1988
/*
 * cond_resched() and cond_resched_lock(): latency reduction via
 * explicit rescheduling in places that are safe. The return
 * value indicates whether a reschedule was in fact done.
 * cond_resched_lock() will drop the spinlock before scheduling.
 */
/*
 * _cond_resched() comes in four flavours, selected at build time:
 *
 *  - PREEMPT_DYNAMIC with static calls: dispatch through the
 *    cond_resched static call;
 *  - PREEMPT_DYNAMIC with a static key: call dynamic_cond_resched();
 *  - !PREEMPTION: klp_sched_try_switch(), then __cond_resched();
 *  - PREEMPTION without PREEMPT_DYNAMIC: klp_sched_try_switch() only,
 *    always returning 0.
 */
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)

int __cond_resched(void);

#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)

void sched_dynamic_klp_enable(void);
void sched_dynamic_klp_disable(void);

DECLARE_STATIC_CALL(cond_resched, __cond_resched);

static __always_inline int _cond_resched(void)
{
	int ret = static_call_mod(cond_resched)();

	return ret;
}

#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)

int dynamic_cond_resched(void);

static __always_inline int _cond_resched(void)
{
	int ret = dynamic_cond_resched();

	return ret;
}

#else /* !CONFIG_PREEMPTION */

static inline int _cond_resched(void)
{
	int ret;

	klp_sched_try_switch();
	ret = __cond_resched();

	return ret;
}

#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */

#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */

static inline int _cond_resched(void)
{
	klp_sched_try_switch();

	return 0;
}

#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */
2038
/*
 * cond_resched() - run the __might_resched() check for the call site
 * (with an offsets argument of 0), then evaluate to the result of
 * _cond_resched().
 */
#define cond_resched()					\
({							\
	__might_resched(__FILE__, __LINE__, 0);		\
	_cond_resched();				\
})
2043
2044extern int __cond_resched_lock(spinlock_t *lock);
2045extern int __cond_resched_rwlock_read(rwlock_t *lock);
2046extern int __cond_resched_rwlock_write(rwlock_t *lock);
2047
/*
 * MIGHT_RESCHED_PREEMPT_MASK selects the bits below
 * MIGHT_RESCHED_RCU_SHIFT; PREEMPT_LOCK_RESCHED_OFFSETS below adds its RCU
 * component at that shift (presumably __might_resched() splits its offsets
 * argument the same way - confirm against the implementation).
 */
#define MIGHT_RESCHED_RCU_SHIFT		8
#define MIGHT_RESCHED_PREEMPT_MASK	((1U << MIGHT_RESCHED_RCU_SHIFT) - 1)

#if defined(CONFIG_PREEMPT_RT)
/*
 * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in
 * cond_resched*lock() has to take that into account because it checks for
 * preempt_count() and rcu_preempt_depth().
 */
# define PREEMPT_LOCK_RESCHED_OFFSETS	\
	(PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT))
#else
/*
 * Non RT kernels have an elevated preempt count due to the held lock,
 * but are not allowed to be inside a RCU read side critical section
 */
# define PREEMPT_LOCK_RESCHED_OFFSETS	PREEMPT_LOCK_OFFSET
#endif
2066
/*
 * cond_resched_lock() - run the __might_resched() check for the call site
 * with PREEMPT_LOCK_RESCHED_OFFSETS, then evaluate to the result of
 * __cond_resched_lock(@lock).
 */
#define cond_resched_lock(lock)							\
({										\
	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);	\
	__cond_resched_lock(lock);						\
})
2071
/*
 * cond_resched_rwlock_read() - run the __might_resched() check for the
 * call site with PREEMPT_LOCK_RESCHED_OFFSETS, then evaluate to the result
 * of __cond_resched_rwlock_read(@lock).
 */
#define cond_resched_rwlock_read(lock)						\
({										\
	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);	\
	__cond_resched_rwlock_read(lock);					\
})
2076
/*
 * cond_resched_rwlock_write() - run the __might_resched() check for the
 * call site with PREEMPT_LOCK_RESCHED_OFFSETS, then evaluate to the result
 * of __cond_resched_rwlock_write(@lock).
 */
#define cond_resched_rwlock_write(lock)						\
({										\
	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);	\
	__cond_resched_rwlock_write(lock);					\
})
2081
2082static __always_inline bool need_resched(void)
2083{
2084	return unlikely(tif_need_resched());
2085}
2086
/*
 * Wrappers for p->thread_info->cpu access. No-op on UP.
 */
#if defined(CONFIG_SMP)

/* Read the cpu field of @p's thread_info with READ_ONCE(). */
static inline unsigned int task_cpu(const struct task_struct *p)
{
	const struct thread_info *ti = task_thread_info(p);

	return READ_ONCE(ti->cpu);
}

void set_task_cpu(struct task_struct *p, unsigned int cpu);

#else /* !CONFIG_SMP */

/* UP stub: always reports CPU 0. */
static inline unsigned int task_cpu(const struct task_struct *p)
{
	return 0;
}

/* UP stub: nothing to record. */
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
}

#endif /* CONFIG_SMP */
2111
/* Implemented outside this header; see the definitions for semantics. */
bool sched_task_on_rq(struct task_struct *p);
unsigned long get_wchan(struct task_struct *p);
struct task_struct *cpu_curr_snapshot(int cpu);
2115
2116#include <linux/spinlock.h>
2117
/*
 * In order to reduce various lock holder preemption latencies provide an
 * interface to see if a vCPU is currently running or not.
 *
 * This allows us to terminate optimistic spin loops and block, analogous to
 * the native optimistic spin heuristic of testing if the lock owner task is
 * running or not.
 *
 * This fallback is used only when vcpu_is_preempted is not already defined
 * as a macro; it reports "not preempted" for every @cpu.
 */
#if !defined(vcpu_is_preempted)
static inline bool vcpu_is_preempted(int cpu)
{
	return false;
}
#endif
2132
/*
 * CPU affinity accessors for the task identified by @pid:
 * sched_setaffinity() takes the mask read-only, sched_getaffinity()
 * receives a writable mask.
 */
long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
long sched_getaffinity(pid_t pid, struct cpumask *mask);
2135
/*
 * Default TASK_SIZE_OF(): unless a definition already exists, ignore
 * @tsk and expand to TASK_SIZE.
 */
#if !defined(TASK_SIZE_OF)
#define TASK_SIZE_OF(tsk)	TASK_SIZE
#endif
2139
#if defined(CONFIG_SMP)
/*
 * owner_on_cpu() - is spinning on @owner worthwhile?
 *
 * Because of lock holder preemption, spinning is skipped both when the
 * task is not on a cpu and when its cpu is preempted.
 */
static inline bool owner_on_cpu(struct task_struct *owner)
{
	if (!READ_ONCE(owner->on_cpu))
		return false;

	return !vcpu_is_preempted(task_cpu(owner));
}

/* Returns effective CPU energy utilization, as seen by the scheduler */
unsigned long sched_cpu_util(int cpu);
#endif /* CONFIG_SMP */
2153
/*
 * Core-scheduling hooks. Without CONFIG_SCHED_CORE the free/fork hooks are
 * empty stubs and sched_core_idle_cpu() simply defers to idle_cpu();
 * sched_core_share_pid() is only declared when the option is enabled.
 */
#if defined(CONFIG_SCHED_CORE)
void sched_core_free(struct task_struct *tsk);
void sched_core_fork(struct task_struct *p);
int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
			 unsigned long uaddr);
int sched_core_idle_cpu(int cpu);
#else
static inline void sched_core_free(struct task_struct *tsk)
{
}

static inline void sched_core_fork(struct task_struct *p)
{
}

static inline int sched_core_idle_cpu(int cpu)
{
	return idle_cpu(cpu);
}
#endif
2165
/* Implemented outside this header; takes a @cpu and the @stop task. */
void sched_set_stop_task(int cpu, struct task_struct *stop);
2167
#if defined(CONFIG_MEM_ALLOC_PROFILING)
/*
 * alloc_tag_save() - install @tag as current->alloc_tag and return the
 * tag that was installed before.
 */
static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
{
	struct alloc_tag *prev = current->alloc_tag;

	current->alloc_tag = tag;

	return prev;
}

/*
 * alloc_tag_restore() - set current->alloc_tag to @old.
 *
 * With CONFIG_MEM_ALLOC_PROFILING_DEBUG a warning is emitted first if
 * current->alloc_tag no longer equals @tag.
 */
static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
{
#if defined(CONFIG_MEM_ALLOC_PROFILING_DEBUG)
	WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
#endif
	current->alloc_tag = old;
}
#else /* !CONFIG_MEM_ALLOC_PROFILING */
/* Profiling disabled: saving yields NULL and restoring does nothing. */
#define alloc_tag_save(_tag)			NULL
#define alloc_tag_restore(_tag, _old)		do {} while (0)
#endif
2186
2187#endif