Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched_ext: Allow scx_bpf_reenqueue_local() to be called from anywhere

The ops.cpu_acquire/release() callbacks miss events under multiple conditions.
There are two distinct task dispatch gaps that can cause cpu_released flag
desynchronization:

1. balance-to-pick_task gap: This is what was originally reported. balance_scx()
can enqueue a task, but during consume_remote_task() when the rq lock is
released, a higher priority task can be enqueued and ultimately picked while
cpu_released remains false. This gap is closeable via RETRY_TASK handling.

2. ttwu-to-pick_task gap: ttwu() can directly dispatch a task to a CPU's local
DSQ. By the time the sched path runs on the target CPU, higher class tasks may
already be queued. In such cases, nothing on the sched_ext side will be
invoked, and the only solution would be a hook invoked regardless of sched
class, which isn't desirable.

Rather than adding invasive core hooks, BPF schedulers can use generic BPF
mechanisms like tracepoints. From an SCX scheduler's perspective, this is
congruent with other mechanisms it already uses and doesn't add further
friction.

The main use case for cpu_release() was calling scx_bpf_reenqueue_local() when
a CPU gets preempted by a higher priority scheduling class. However, the old
scx_bpf_reenqueue_local() could only be called from cpu_release() context.

Add a new version of scx_bpf_reenqueue_local() that can be called from any
context by deferring the actual re-enqueue operation. This eliminates the need
for cpu_acquire/release() ops entirely. Schedulers can now use standard BPF
mechanisms like the sched_switch tracepoint to detect and handle CPU preemption.

Update scx_qmap to demonstrate the new approach using sched_switch instead of
cpu_release, with compat support for older kernels. Mark cpu_acquire/release()
as deprecated. The old scx_bpf_reenqueue_local() variant will be removed in
v6.23.

Reported-by: Wen-Fang Liu <liuwenfang@honor.com>
Link: https://lore.kernel.org/all/8d64c74118c6440f81bcf5a4ac6b9f00@honor.com/
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>

Tejun Heo a3f5d482 8803e6a7

+83 -11
+31
kernel/sched/ext.c
··· 147 147 #include <trace/events/sched_ext.h> 148 148 149 149 static void process_ddsp_deferred_locals(struct rq *rq); 150 + static u32 reenq_local(struct rq *rq); 150 151 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); 151 152 static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, 152 153 s64 exit_code, const char *fmt, va_list args); ··· 756 755 static void run_deferred(struct rq *rq) 757 756 { 758 757 process_ddsp_deferred_locals(rq); 758 + 759 + if (local_read(&rq->scx.reenq_local_deferred)) { 760 + local_set(&rq->scx.reenq_local_deferred, 0); 761 + reenq_local(rq); 762 + } 759 763 } 760 764 761 765 static void deferred_bal_cb_workfn(struct rq *rq) ··· 4575 4569 if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) 4576 4570 pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); 4577 4571 4572 + if (ops->cpu_acquire || ops->cpu_release) 4573 + pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); 4574 + 4578 4575 return 0; 4579 4576 } 4580 4577 ··· 5938 5929 * Iterate over all of the tasks currently enqueued on the local DSQ of the 5939 5930 * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of 5940 5931 * processed tasks. Can only be called from ops.cpu_release(). 5932 + * 5933 + * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void 5934 + * returning variant that can be called from anywhere. 5941 5935 */ 5942 5936 __bpf_kfunc u32 scx_bpf_reenqueue_local(void) 5943 5937 { ··· 6500 6488 } 6501 6489 6502 6490 /** 6491 + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 6492 + * 6493 + * Iterate over all of the tasks currently enqueued on the local DSQ of the 6494 + * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from 6495 + * anywhere. 
6496 + */ 6497 + __bpf_kfunc void scx_bpf_reenqueue_local___v2(void) 6498 + { 6499 + struct rq *rq; 6500 + 6501 + guard(preempt)(); 6502 + 6503 + rq = this_rq(); 6504 + local_set(&rq->scx.reenq_local_deferred, 1); 6505 + schedule_deferred(rq); 6506 + } 6507 + 6508 + /** 6503 6509 * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU 6504 6510 * @cpu: CPU of interest 6505 6511 * ··· 6930 6900 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) 6931 6901 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) 6932 6902 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) 6903 + BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) 6933 6904 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) 6934 6905 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) 6935 6906 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
+1
kernel/sched/sched.h
··· 804 804 cpumask_var_t cpus_to_preempt; 805 805 cpumask_var_t cpus_to_wait; 806 806 unsigned long kick_sync; 807 + local_t reenq_local_deferred; 807 808 struct balance_callback deferred_bal_cb; 808 809 struct irq_work deferred_irq_work; 809 810 struct irq_work kick_cpus_irq_work;
-1
tools/sched_ext/include/scx/common.bpf.h
··· 70 70 void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; 71 71 bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 72 72 bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 73 - u32 scx_bpf_reenqueue_local(void) __ksym; 74 73 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; 75 74 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; 76 75 void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
+23
tools/sched_ext/include/scx/compat.bpf.h
··· 279 279 } 280 280 281 281 /* 282 + * v6.19: The new void variant can be called from anywhere while the older v1 283 + * variant can only be called from ops.cpu_release(). The double ___ prefixes on 284 + * the v2 variant need to be removed once libbpf is updated to ignore ___ prefix 285 + * on kernel side. Drop the wrapper and move the decl to common.bpf.h after 286 + * v6.22. 287 + */ 288 + u32 scx_bpf_reenqueue_local___v1(void) __ksym __weak; 289 + void scx_bpf_reenqueue_local___v2___compat(void) __ksym __weak; 290 + 291 + static inline bool __COMPAT_scx_bpf_reenqueue_local_from_anywhere(void) 292 + { 293 + return bpf_ksym_exists(scx_bpf_reenqueue_local___v2___compat); 294 + } 295 + 296 + static inline void scx_bpf_reenqueue_local(void) 297 + { 298 + if (__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) 299 + scx_bpf_reenqueue_local___v2___compat(); 300 + else 301 + scx_bpf_reenqueue_local___v1(); 302 + } 303 + 304 + /* 282 305 * Define sched_ext_ops. This may be expanded to define multiple variants for 283 306 * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). 284 307 */
+28 -10
tools/sched_ext/scx_qmap.bpf.c
··· 202 202 void *ring; 203 203 s32 cpu; 204 204 205 + if (enq_flags & SCX_ENQ_REENQ) 206 + __sync_fetch_and_add(&nr_reenqueued, 1); 207 + 205 208 if (p->flags & PF_KTHREAD) { 206 209 if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) 207 210 return; ··· 532 529 return task_qdist(a) > task_qdist(b); 533 530 } 534 531 535 - void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) 532 + SEC("tp_btf/sched_switch") 533 + int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev, 534 + struct task_struct *next, unsigned long prev_state) 536 535 { 537 - u32 cnt; 536 + if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) 537 + return 0; 538 538 539 539 /* 540 - * Called when @cpu is taken by a higher priority scheduling class. This 541 - * makes @cpu no longer available for executing sched_ext tasks. As we 542 - * don't want the tasks in @cpu's local dsq to sit there until @cpu 543 - * becomes available again, re-enqueue them into the global dsq. See 544 - * %SCX_ENQ_REENQ handling in qmap_enqueue(). 540 + * If @cpu is taken by a higher priority scheduling class, it is no 541 + * longer available for executing sched_ext tasks. As we don't want the 542 + * tasks in @cpu's local dsq to sit there until @cpu becomes available 543 + * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ 544 + * handling in qmap_enqueue(). 
545 545 */ 546 - cnt = scx_bpf_reenqueue_local(); 547 - if (cnt) 548 - __sync_fetch_and_add(&nr_reenqueued, cnt); 546 + switch (next->policy) { 547 + case 1: /* SCHED_FIFO */ 548 + case 2: /* SCHED_RR */ 549 + case 6: /* SCHED_DEADLINE */ 550 + scx_bpf_reenqueue_local(); 551 + } 552 + 553 + return 0; 554 + } 555 + 556 + void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) 557 + { 558 + /* see qmap_sched_switch() to learn how to do this on newer kernels */ 559 + if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) 560 + scx_bpf_reenqueue_local(); 549 561 } 550 562 551 563 s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,