/* SPDX-License-Identifier: GPL-2.0 */
/*
 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#ifndef _LINUX_SCHED_EXT_H
#define _LINUX_SCHED_EXT_H

#ifdef CONFIG_SCHED_CLASS_EXT

#include <linux/llist.h>
#include <linux/rhashtable-types.h>

enum scx_public_consts {
	SCX_OPS_NAME_LEN	= 128,

	/*
	 * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler
	 * fails to set the slice for a task that is selected for execution.
	 * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default
	 * slice refill has been triggered.
	 *
	 * %SCX_SLICE_BYPASS is used as the slice for all tasks in bypass
	 * mode. As making forward progress for all tasks is the main goal of
	 * bypass mode, a shorter slice is used.
	 */
	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
	SCX_SLICE_BYPASS	= 5 * 1000000,	/* 5ms */
	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
};

/*
 * DSQ (dispatch queue) IDs are 64bit of the format:
 *
 *   Bits: [63] [62 ..  0]
 *         [ B] [   ID   ]
 *
 *    B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs
 *   ID: 63 bit ID
 *
 * Built-in IDs:
 *
 *   Bits: [63] [62] [61..32] [31 .. 0]
 *         [ 1] [ L] [   R  ] [    V   ]
 *
 *    1: 1 for built-in DSQs.
 *    L: 1 for LOCAL_ON DSQ IDs, 0 for others.
 *    V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
 */
enum scx_dsq_id_flags {
	SCX_DSQ_FLAG_BUILTIN	= 1LLU << 63,
	SCX_DSQ_FLAG_LOCAL_ON	= 1LLU << 62,

	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
	SCX_DSQ_BYPASS		= SCX_DSQ_FLAG_BUILTIN | 3,
	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
};
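/*
 * Illustrative sketch only -- this helper is hypothetical and not part of the
 * kernel API. It shows how a LOCAL_ON DSQ ID targeting a specific CPU's local
 * DSQ can be composed following the bit layout documented above; a BPF
 * scheduler would typically pass such an ID to scx_bpf_dsq_insert().
 */
static inline u64 scx_example_local_on_dsq_id(s32 cpu)
{
	/* bit 63: built-in, bit 62: LOCAL_ON, low 32 bits: CPU number */
	return SCX_DSQ_LOCAL_ON | ((u64)cpu & SCX_DSQ_LOCAL_CPU_MASK);
}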
/*
 * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
 * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
 * buffer between the scheduler core and the BPF scheduler. See the
 * documentation for more details.
 */
struct scx_dispatch_q {
	raw_spinlock_t		lock;
	struct task_struct __rcu *first_task;	/* lockless peek at head */
	struct list_head	list;		/* tasks in dispatch order */
	struct rb_root		priq;		/* used to order by p->scx.dsq_vtime */
	u32			nr;
	u32			seq;		/* used by BPF iter */
	u64			id;
	struct rhash_head	hash_node;
	struct llist_node	free_node;
	struct rcu_head		rcu;
};

/* scx_entity.flags */
enum scx_ent_flags {
	SCX_TASK_QUEUED			= 1 << 0,  /* on ext runqueue */
	SCX_TASK_RESET_RUNNABLE_AT	= 1 << 2,  /* runnable_at should be reset */
	SCX_TASK_DEQD_FOR_SLEEP		= 1 << 3,  /* last dequeue was for SLEEP */

	SCX_TASK_STATE_SHIFT		= 8,	   /* bits 8 and 9 carry scx_task_state */
	SCX_TASK_STATE_BITS		= 2,
	SCX_TASK_STATE_MASK		= ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,

	SCX_TASK_CURSOR			= 1 << 31, /* iteration cursor, not a task */
};

/* scx_entity.flags & SCX_TASK_STATE_MASK */
enum scx_task_state {
	SCX_TASK_NONE,		/* ops.init_task() not called yet */
	SCX_TASK_INIT,		/* ops.init_task() succeeded, but task can be cancelled */
	SCX_TASK_READY,		/* fully initialized, but not in sched_ext */
	SCX_TASK_ENABLED,	/* fully initialized and in sched_ext */

	SCX_TASK_NR_STATES,
};
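/*
 * Illustrative sketch only -- this helper is hypothetical and not part of the
 * kernel API. It shows how the scx_task_state value carried in bits 8 and 9
 * of scx_entity.flags can be decoded using the SCX_TASK_STATE_* constants
 * above.
 */
static inline enum scx_task_state scx_example_task_state(u32 ent_flags)
{
	return (enum scx_task_state)((ent_flags & SCX_TASK_STATE_MASK) >>
				     SCX_TASK_STATE_SHIFT);
}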
/* scx_entity.dsq_flags */
enum scx_ent_dsq_flags {
	SCX_TASK_DSQ_ON_PRIQ	= 1 << 0, /* task is queued on the priority queue of a dsq */
};

/*
 * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
 * everywhere and the following bits track which kfunc sets are currently
 * allowed for %current. This simple per-task tracking works because SCX ops
 * nest in a limited way. BPF will likely implement a way to allow and disallow
 * kfuncs depending on the calling context which will replace this manual
 * mechanism. See scx_kf_allow().
 */
enum scx_kf_mask {
	SCX_KF_UNLOCKED		= 0,	  /* sleepable and not rq locked */
	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
	SCX_KF_CPU_RELEASE	= 1 << 0, /* ops.cpu_release() */
	/*
	 * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and
	 * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be
	 * nested inside DISPATCH.
	 */
	SCX_KF_DISPATCH		= 1 << 1, /* ops.dispatch() */
	SCX_KF_ENQUEUE		= 1 << 2, /* ops.enqueue() and ops.select_cpu() */
	SCX_KF_SELECT_CPU	= 1 << 3, /* ops.select_cpu() */
	SCX_KF_REST		= 1 << 4, /* other rq-locked operations */

	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
};

enum scx_dsq_lnode_flags {
	SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,

	/* high 16 bits can be for iter cursor flags */
	__SCX_DSQ_LNODE_PRIV_SHIFT = 16,
};

struct scx_dsq_list_node {
	struct list_head	node;
	u32			flags;
	u32			priv;	/* can be used by iter cursor */
};

#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv)			\
	(struct scx_dsq_list_node) {					\
		.node = LIST_HEAD_INIT((__node).node),			\
		.flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags),		\
		.priv = (__priv),					\
	}

/*
 * The following is embedded in task_struct and contains all fields necessary
 * for a task to be scheduled by SCX.
 */
struct sched_ext_entity {
	struct scx_dispatch_q	*dsq;
	struct scx_dsq_list_node dsq_list;	/* dispatch order */
	struct rb_node		dsq_priq;	/* p->scx.dsq_vtime order */
	u32			dsq_seq;
	u32			dsq_flags;	/* protected by DSQ lock */
	u32			flags;		/* protected by rq lock */
	u32			weight;
	s32			sticky_cpu;
	s32			holding_cpu;
	s32			selected_cpu;
	u32			kf_mask;	/* see scx_kf_mask above */
	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
	atomic_long_t		ops_state;

	struct list_head	runnable_node;	/* rq->scx.runnable_list */
	unsigned long		runnable_at;

#ifdef CONFIG_SCHED_CORE
	u64			core_sched_at;	/* see scx_prio_less() */
#endif
	u64			ddsp_dsq_id;
	u64			ddsp_enq_flags;

	/* BPF scheduler modifiable fields */

	/*
	 * Runtime budget in nsecs. This is usually set through
	 * scx_bpf_dsq_insert() but can also be modified directly by the BPF
	 * scheduler. Automatically decreased by SCX as the task executes. On
	 * depletion, a scheduling event is triggered.
	 *
	 * This value is cleared to zero if the task is preempted by
	 * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
	 * task ran. Use p->se.sum_exec_runtime instead.
	 */
	u64			slice;

	/*
	 * Used to order tasks when dispatching to the vtime-ordered priority
	 * queue of a dsq. This is usually set through
	 * scx_bpf_dsq_insert_vtime() but can also be modified directly by the
	 * BPF scheduler. Modifying it while a task is queued on a dsq may
	 * mangle the ordering and is not recommended.
	 */
	u64			dsq_vtime;

	/*
	 * If set, reject future sched_setscheduler(2) calls updating the
	 * policy to %SCHED_EXT with -%EACCES.
	 *
	 * Can be set from ops.init_task() while the BPF scheduler is being
	 * loaded (!scx_init_task_args->fork). If set and the task's policy is
	 * already %SCHED_EXT, the task's policy is rejected and forcefully
	 * reverted to %SCHED_NORMAL. The number of such events is reported
	 * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
	 * during fork is not allowed.
	 */
	bool			disallow;	/* reject switching into SCX */

	/* cold fields */
#ifdef CONFIG_EXT_GROUP_SCHED
	struct cgroup		*cgrp_moving_from;
#endif
	struct list_head	tasks_node;
};

void sched_ext_dead(struct task_struct *p);
void print_scx_info(const char *log_lvl, struct task_struct *p);
void scx_softlockup(u32 dur_s);
bool scx_hardlockup(int cpu);
bool scx_rcu_cpu_stall(void);

#else	/* !CONFIG_SCHED_CLASS_EXT */

static inline void sched_ext_dead(struct task_struct *p) {}
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
static inline void scx_softlockup(u32 dur_s) {}
static inline bool scx_hardlockup(int cpu) { return false; }
static inline bool scx_rcu_cpu_stall(void) { return false; }

#endif	/* CONFIG_SCHED_CLASS_EXT */

struct scx_task_group {
#ifdef CONFIG_EXT_GROUP_SCHED
	u32			flags;		/* SCX_TG_* */
	u32			weight;
	u64			bw_period_us;
	u64			bw_quota_us;
	u64			bw_burst_us;
	bool			idle;
#endif
};

#endif	/* _LINUX_SCHED_EXT_H */