Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'sched_ext-for-6.12-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

- ops.enqueue() didn't have a way to tell whether select_task_rq_scx()
and thus ops.select_cpu() were skipped. Some schedulers were incorrectly
using SCX_ENQ_WAKEUP. Add SCX_ENQ_CPU_SELECTED and fix scx_qmap using
it.

- Remove a spurious WARN_ON_ONCE() in scx_cgroup_exit()

- Fix error information clobbering during load

- Add missing __weak markers to BPF helper declarations

- Doc update

* tag 'sched_ext-for-6.12-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
sched_ext: Documentation: Update instructions for running example schedulers
sched_ext, scx_qmap: Add and use SCX_ENQ_CPU_SELECTED
sched/core: Add ENQUEUE_RQ_SELECTED to indicate whether ->select_task_rq() was called
sched/core: Make select_task_rq() take the pointer to wake_flags instead of value
sched_ext: scx_cgroup_exit() may be called without successful scx_cgroup_init()
sched_ext: Improve error reporting during loading
sched_ext: Add __weak markers to BPF helper function declarations

+43 -25
+1 -1
Documentation/scheduler/sched-ext.rst
··· 66 66 .. code-block:: none 67 67 68 68 # make -j16 -C tools/sched_ext 69 - # tools/sched_ext/scx_simple 69 + # tools/sched_ext/build/bin/scx_simple 70 70 local=0 global=3 71 71 local=5 global=24 72 72 local=9 global=44
+14 -7
kernel/sched/core.c
··· 3518 3518 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. 3519 3519 */ 3520 3520 static inline 3521 - int select_task_rq(struct task_struct *p, int cpu, int wake_flags) 3521 + int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) 3522 3522 { 3523 3523 lockdep_assert_held(&p->pi_lock); 3524 3524 3525 - if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) 3526 - cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); 3527 - else 3525 + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) { 3526 + cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); 3527 + *wake_flags |= WF_RQ_SELECTED; 3528 + } else { 3528 3529 cpu = cpumask_any(p->cpus_ptr); 3530 + } 3529 3531 3530 3532 /* 3531 3533 * In order not to call set_task_cpu() on a blocking task we need ··· 3661 3659 rq->nr_uninterruptible--; 3662 3660 3663 3661 #ifdef CONFIG_SMP 3662 + if (wake_flags & WF_RQ_SELECTED) 3663 + en_flags |= ENQUEUE_RQ_SELECTED; 3664 3664 if (wake_flags & WF_MIGRATED) 3665 3665 en_flags |= ENQUEUE_MIGRATED; 3666 3666 else ··· 4124 4120 guard(preempt)(); 4125 4121 int cpu, success = 0; 4126 4122 4123 + wake_flags |= WF_TTWU; 4124 + 4127 4125 if (p == current) { 4128 4126 /* 4129 4127 * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ··· 4258 4252 */ 4259 4253 smp_cond_load_acquire(&p->on_cpu, !VAL); 4260 4254 4261 - cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); 4255 + cpu = select_task_rq(p, p->wake_cpu, &wake_flags); 4262 4256 if (task_cpu(p) != cpu) { 4263 4257 if (p->in_iowait) { 4264 4258 delayacct_blkio_end(p); ··· 4799 4793 { 4800 4794 struct rq_flags rf; 4801 4795 struct rq *rq; 4796 + int wake_flags = WF_FORK; 4802 4797 4803 4798 raw_spin_lock_irqsave(&p->pi_lock, rf.flags); 4804 4799 WRITE_ONCE(p->__state, TASK_RUNNING); ··· 4814 4807 */ 4815 4808 p->recent_used_cpu = task_cpu(p); 4816 4809 rseq_migrate(p); 4817 - __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); 4810 + __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); 4818 4811 #endif 4819 4812 rq = __task_rq_lock(p, &rf); 4820 4813 update_rq_clock(rq); ··· 4822 4815 4823 4816 activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); 4824 4817 trace_sched_wakeup_new(p); 4825 - wakeup_preempt(rq, p, WF_FORK); 4818 + wakeup_preempt(rq, p, wake_flags); 4826 4819 #ifdef CONFIG_SMP 4827 4820 if (p->sched_class->task_woken) { 4828 4821 /*
+20 -12
kernel/sched/ext.c
··· 625 625 /** 626 626 * exit - Clean up after the BPF scheduler 627 627 * @info: Exit info 628 + * 629 + * ops.exit() is also called on ops.init() failure, which is a bit 630 + * unusual. This is to allow rich reporting through @info on how 631 + * ops.init() failed. 628 632 */ 629 633 void (*exit)(struct scx_exit_info *info); 630 634 ··· 696 692 /* expose select ENQUEUE_* flags as enums */ 697 693 SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, 698 694 SCX_ENQ_HEAD = ENQUEUE_HEAD, 695 + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, 699 696 700 697 /* high 32bits are SCX specific */ 701 698 ··· 4053 4048 4054 4049 percpu_rwsem_assert_held(&scx_cgroup_rwsem); 4055 4050 4056 - WARN_ON_ONCE(!scx_cgroup_enabled); 4057 4051 scx_cgroup_enabled = false; 4058 4052 4059 4053 /* ··· 4121 4117 css->cgroup, &args); 4122 4118 if (ret) { 4123 4119 css_put(css); 4120 + scx_ops_error("ops.cgroup_init() failed (%d)", ret); 4124 4121 return ret; 4125 4122 } 4126 4123 tg->scx_flags |= SCX_TG_INITED; ··· 5046 5041 if (ret) { 5047 5042 ret = ops_sanitize_err("init", ret); 5048 5043 cpus_read_unlock(); 5044 + scx_ops_error("ops.init() failed (%d)", ret); 5049 5045 goto err_disable; 5050 5046 } 5051 5047 } ··· 5156 5150 spin_lock_irq(&scx_tasks_lock); 5157 5151 scx_task_iter_exit(&sti); 5158 5152 spin_unlock_irq(&scx_tasks_lock); 5159 - pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", 5160 - ret, p->comm, p->pid); 5153 - scx_ops_error("ops.init_task() failed (%d) for %s[%d]", 5154 - ret, p->comm, p->pid); 5161 5155 goto err_disable_unlock_all; 5162 5156 } 5163 5157 ··· 5205 5199 5206 5200 scx_ops_bypass(false); 5207 5201 5208 - /* 5209 - * Returning an error code here would lose the recorded error 5210 - * information. Exit indicating success so that the error is notified 5211 - * through ops.exit() with all the details. 5212 - */ 5213 5202 if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { 5214 5203 WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); 5215 - ret = 0; 5216 5204 goto err_disable; 5217 5205 } 5218 5206 ··· 5241 5241 scx_ops_bypass(false); 5242 5242 err_disable: 5243 5243 mutex_unlock(&scx_ops_enable_mutex); 5244 - /* must be fully disabled before returning */ 5245 - scx_ops_disable(SCX_EXIT_ERROR); 5244 + /* 5245 + * Returning an error code here would not pass all the error information 5246 + * to userspace. Record errno using scx_ops_error() for cases 5247 + * scx_ops_error() wasn't already invoked and exit indicating success so 5248 + * that the error is notified through ops.exit() with all the details. 5249 + * 5250 + * Flush scx_ops_disable_work to ensure that error is reported before 5251 + * init completion. 5252 + */ 5253 + scx_ops_error("scx_ops_enable() failed (%d)", ret); 5246 5254 kthread_flush_work(&scx_ops_disable_work); 5247 - return ret; 5255 + return 0; 5248 5256 } 5249 5257 5250 5258
+3
kernel/sched/sched.h
··· 2292 2292 #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ 2293 2293 #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ 2294 2294 #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ 2295 + #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ 2295 2296 2296 2297 #ifdef CONFIG_SMP 2297 2298 static_assert(WF_EXEC == SD_BALANCE_EXEC); ··· 2335 2334 * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) 2336 2335 * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) 2337 2336 * ENQUEUE_MIGRATED - the task was migrated during wakeup 2337 + * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called 2338 2338 * 2339 2339 */ 2340 2340 ··· 2362 2360 #define ENQUEUE_INITIAL 0x80 2363 2361 #define ENQUEUE_MIGRATING 0x100 2364 2362 #define ENQUEUE_DELAYED 0x200 2363 + #define ENQUEUE_RQ_SELECTED 0x400 2365 2364 2366 2365 #define RETRY_TASK ((void *)-1UL) 2367 2366
+3 -3
tools/sched_ext/include/scx/common.bpf.h
··· 41 41 u32 scx_bpf_dispatch_nr_slots(void) __ksym; 42 42 void scx_bpf_dispatch_cancel(void) __ksym; 43 43 bool scx_bpf_consume(u64 dsq_id) __ksym; 44 - void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; 45 - void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; 44 + void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; 45 + void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; 46 46 bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 47 47 bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 48 48 u32 scx_bpf_reenqueue_local(void) __ksym; ··· 71 71 bool scx_bpf_task_running(const struct task_struct *p) __ksym; 72 72 s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; 73 73 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; 74 - struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; 74 + struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; 75 75 76 76 /* 77 77 * Use the following as @it__iter when calling
+2 -2
tools/sched_ext/scx_qmap.bpf.c
··· 230 230 return; 231 231 } 232 232 233 - /* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */ 234 - if (!(enq_flags & SCX_ENQ_WAKEUP) && 233 + /* if select_cpu() wasn't called, try direct dispatch */ 234 + if (!(enq_flags & SCX_ENQ_CPU_SELECTED) && 235 235 (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { 236 236 __sync_fetch_and_add(&nr_ddsp_from_enq, 1); 237 237 scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);